141 files changed, 29980 insertions, 877 deletions
diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt index 7fe45657bc..8a966dcb05 100644 --- a/COPYRIGHT.txt +++ b/COPYRIGHT.txt @@ -141,6 +141,11 @@ Comment: AMD FidelityFX Super Resolution Copyright: 2021, Advanced Micro Devices, Inc. License: Expat +Files: ./thirdparty/astcenc/ +Comment: Arm ASTC Encoder +Copyright: 2011-2023, Arm Limited +License: Apache-2.0 + Files: ./thirdparty/basis_universal/ Comment: Basis Universal Copyright: 2022, Binomial LLC. diff --git a/core/config/project_settings.cpp b/core/config/project_settings.cpp index de27b6d5ac..ed89bc15d3 100644 --- a/core/config/project_settings.cpp +++ b/core/config/project_settings.cpp @@ -1069,23 +1069,6 @@ Variant _GLOBAL_DEF(const PropertyInfo &p_info, const Variant &p_default, bool p return ret; } -Vector<String> ProjectSettings::get_optimizer_presets() const { - List<PropertyInfo> pi; - ProjectSettings::get_singleton()->get_property_list(&pi); - Vector<String> names; - - for (const PropertyInfo &E : pi) { - if (!E.name.begins_with("optimizer_presets/")) { - continue; - } - names.push_back(E.name.get_slicec('/', 1)); - } - - names.sort(); - - return names; -} - void ProjectSettings::_add_property_info_bind(const Dictionary &p_info) { ERR_FAIL_COND(!p_info.has("name")); ERR_FAIL_COND(!p_info.has("type")); diff --git a/core/config/project_settings.h b/core/config/project_settings.h index 29c495c46b..d1704a7c31 100644 --- a/core/config/project_settings.h +++ b/core/config/project_settings.h @@ -178,8 +178,6 @@ public: const HashMap<StringName, PropertyInfo> &get_custom_property_info() const; uint64_t get_last_saved_time() { return last_save_time; } - Vector<String> get_optimizer_presets() const; - List<String> get_input_presets() const { return input_presets; } Variant get_setting_with_override(const StringName &p_name) const; diff --git a/core/debugger/script_debugger.cpp b/core/debugger/script_debugger.cpp index 8af1573bff..32725b76c1 100644 --- a/core/debugger/script_debugger.cpp +++ b/core/debugger/script_debugger.cpp @@ -73,10 +73,6 @@ bool ScriptDebugger::is_breakpoint(int p_line, const StringName &p_source) const return breakpoints[p_line].has(p_source); } -bool ScriptDebugger::is_breakpoint_line(int p_line) const { - return breakpoints.has(p_line); -} - String ScriptDebugger::breakpoint_find_source(const String &p_source) const { return p_source; } diff --git a/core/debugger/script_debugger.h b/core/debugger/script_debugger.h index c7aa90027b..edce089179 100644 --- a/core/debugger/script_debugger.h +++ b/core/debugger/script_debugger.h @@ -64,7 +64,6 @@ public: void insert_breakpoint(int p_line, const StringName &p_source); void remove_breakpoint(int p_line, const StringName &p_source); bool is_breakpoint(int p_line, const StringName &p_source) const; - bool is_breakpoint_line(int p_line) const; void clear_breakpoints(); const HashMap<int, HashSet<StringName>> &get_breakpoints() const { return breakpoints; } diff --git a/core/io/image_loader.cpp b/core/io/image_loader.cpp index 17fb199811..c6452f1033 100644 --- a/core/io/image_loader.cpp +++ b/core/io/image_loader.cpp @@ -135,10 +135,6 @@ void ImageLoader::remove_image_format_loader(Ref<ImageFormatLoader> p_loader) { loader.erase(p_loader); } -const Vector<Ref<ImageFormatLoader>> &ImageLoader::get_image_format_loaders() { - return loader; -} - void ImageLoader::cleanup() { while (loader.size()) { remove_image_format_loader(loader[0]); diff --git a/core/io/image_loader.h b/core/io/image_loader.h index 1473f24186..ac51f13376 100644 --- a/core/io/image_loader.h +++ 
b/core/io/image_loader.h @@ -98,8 +98,6 @@ public: static void add_image_format_loader(Ref<ImageFormatLoader> p_loader); static void remove_image_format_loader(Ref<ImageFormatLoader> p_loader); - static const Vector<Ref<ImageFormatLoader>> &get_image_format_loaders(); - static void cleanup(); }; diff --git a/core/io/resource.cpp b/core/io/resource.cpp index 2d6f09725f..6d3575b9fa 100644 --- a/core/io/resource.cpp +++ b/core/io/resource.cpp @@ -385,10 +385,6 @@ void Resource::set_as_translation_remapped(bool p_remapped) { ResourceCache::lock.unlock(); } -bool Resource::is_translation_remapped() const { - return remapped_list.in_list(); -} - #ifdef TOOLS_ENABLED //helps keep IDs same number when loading/saving scenes. -1 clears ID and it Returns -1 when no id stored void Resource::set_id_for_path(const String &p_path, const String &p_id) { @@ -481,9 +477,6 @@ void ResourceCache::clear() { resources.clear(); } -void ResourceCache::reload_externals() { -} - bool ResourceCache::has(const String &p_path) { lock.lock(); diff --git a/core/io/resource.h b/core/io/resource.h index 22ce5cef43..5135664f36 100644 --- a/core/io/resource.h +++ b/core/io/resource.h @@ -136,7 +136,6 @@ public: #endif void set_as_translation_remapped(bool p_remapped); - bool is_translation_remapped() const; virtual RID get_rid() const; // some resources may offer conversion to RID @@ -164,7 +163,6 @@ class ResourceCache { friend void register_core_types(); public: - static void reload_externals(); static bool has(const String &p_path); static Ref<Resource> get_ref(const String &p_path); static void get_cached_resources(List<Ref<Resource>> *p_resources); diff --git a/core/io/resource_format_binary.cpp b/core/io/resource_format_binary.cpp index 45e1301930..03beb25b03 100644 --- a/core/io/resource_format_binary.cpp +++ b/core/io/resource_format_binary.cpp @@ -661,10 +661,6 @@ Error ResourceLoaderBinary::parse_variant(Variant &r_v) { return OK; //never reach anyway } -void ResourceLoaderBinary::set_local_path(const String &p_local_path) { - res_path = p_local_path; -} - Ref<Resource> ResourceLoaderBinary::get_resource() { return resource; } diff --git a/core/io/resource_format_binary.h b/core/io/resource_format_binary.h index 2e8988005f..9dd208e3cd 100644 --- a/core/io/resource_format_binary.h +++ b/core/io/resource_format_binary.h @@ -92,7 +92,6 @@ class ResourceLoaderBinary { HashMap<String, Ref<Resource>> dependency_cache; public: - void set_local_path(const String &p_local_path); Ref<Resource> get_resource(); Error load(); void set_translation_remapped(bool p_remapped); diff --git a/core/io/resource_loader.cpp b/core/io/resource_loader.cpp index 946c31cf0d..68b9f8b6f7 100644 --- a/core/io/resource_loader.cpp +++ b/core/io/resource_loader.cpp @@ -1011,13 +1011,6 @@ bool ResourceLoader::add_custom_resource_format_loader(String script_path) { return true; } -void ResourceLoader::remove_custom_resource_format_loader(String script_path) { - Ref<ResourceFormatLoader> custom_loader = _find_custom_resource_format_loader(script_path); - if (custom_loader.is_valid()) { - remove_resource_format_loader(custom_loader); - } -} - void ResourceLoader::set_create_missing_resources_if_class_unavailable(bool p_enable) { create_missing_resources_if_class_unavailable = p_enable; } diff --git a/core/io/resource_loader.h b/core/io/resource_loader.h index 41ba0dc6e6..e427a2f5fc 100644 --- a/core/io/resource_loader.h +++ b/core/io/resource_loader.h @@ -225,7 +225,6 @@ public: static ResourceLoaderImport import; static bool 
add_custom_resource_format_loader(String script_path); - static void remove_custom_resource_format_loader(String script_path); static void add_custom_loaders(); static void remove_custom_loaders(); diff --git a/core/io/resource_saver.cpp b/core/io/resource_saver.cpp index 9809b9a48f..b8201cc6b9 100644 --- a/core/io/resource_saver.cpp +++ b/core/io/resource_saver.cpp @@ -250,13 +250,6 @@ bool ResourceSaver::add_custom_resource_format_saver(String script_path) { return true; } -void ResourceSaver::remove_custom_resource_format_saver(String script_path) { - Ref<ResourceFormatSaver> custom_saver = _find_custom_resource_format_saver(script_path); - if (custom_saver.is_valid()) { - remove_resource_format_saver(custom_saver); - } -} - void ResourceSaver::add_custom_savers() { // Custom resource savers exploits global class names diff --git a/core/io/resource_saver.h b/core/io/resource_saver.h index 2043947963..9e88b2086b 100644 --- a/core/io/resource_saver.h +++ b/core/io/resource_saver.h @@ -101,7 +101,6 @@ public: static void set_get_resource_id_for_path(ResourceSaverGetResourceIDForPath p_callback); static bool add_custom_resource_format_saver(String script_path); - static void remove_custom_resource_format_saver(String script_path); static void add_custom_savers(); static void remove_custom_savers(); }; diff --git a/core/math/basis.cpp b/core/math/basis.cpp index 39e383fb49..234a4ddb79 100644 --- a/core/math/basis.cpp +++ b/core/math/basis.cpp @@ -36,23 +36,6 @@ #define cofac(row1, col1, row2, col2) \ (rows[row1][col1] * rows[row2][col2] - rows[row1][col2] * rows[row2][col1]) -void Basis::from_z(const Vector3 &p_z) { - if (Math::abs(p_z.z) > (real_t)Math_SQRT12) { - // choose p in y-z plane - real_t a = p_z[1] * p_z[1] + p_z[2] * p_z[2]; - real_t k = 1.0f / Math::sqrt(a); - rows[0] = Vector3(0, -p_z[2] * k, p_z[1] * k); - rows[1] = Vector3(a * k, -p_z[0] * rows[0][2], p_z[0] * rows[0][1]); - } else { - // choose p in x-y plane - real_t a = p_z.x * p_z.x + p_z.y * p_z.y; - real_t k = 1.0f / Math::sqrt(a); - rows[0] = Vector3(-p_z.y * k, p_z.x * k, 0); - rows[1] = Vector3(-p_z.z * rows[0].y, p_z.z * rows[0].x, a * k); - } - rows[2] = p_z; -} - void Basis::invert() { real_t co[3] = { cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1) @@ -271,14 +254,6 @@ float Basis::get_uniform_scale() const { return (rows[0].length() + rows[1].length() + rows[2].length()) / 3.0f; } -void Basis::make_scale_uniform() { - float l = (rows[0].length() + rows[1].length() + rows[2].length()) / 3.0f; - for (int i = 0; i < 3; i++) { - rows[i].normalize(); - rows[i] *= l; - } -} - Basis Basis::scaled_local(const Vector3 &p_scale) const { return (*this) * Basis::from_scale(p_scale); } diff --git a/core/math/basis.h b/core/math/basis.h index b3197dbc84..bbc1d40469 100644 --- a/core/math/basis.h +++ b/core/math/basis.h @@ -56,8 +56,6 @@ struct _NO_DISCARD_ Basis { _FORCE_INLINE_ real_t determinant() const; - void from_z(const Vector3 &p_z); - void rotate(const Vector3 &p_axis, real_t p_angle); Basis rotated(const Vector3 &p_axis, real_t p_angle) const; @@ -101,8 +99,6 @@ struct _NO_DISCARD_ Basis { void scale_orthogonal(const Vector3 &p_scale); Basis scaled_orthogonal(const Vector3 &p_scale) const; - - void make_scale_uniform(); float get_uniform_scale() const; Vector3 get_scale() const; diff --git a/core/math/face3.cpp b/core/math/face3.cpp index e53bbf872b..1dff0ee4a6 100644 --- a/core/math/face3.cpp +++ b/core/math/face3.cpp @@ -120,36 +120,6 @@ bool Face3::is_degenerate() const { return (normal.length_squared() < 
(real_t)CMP_EPSILON2); } -Face3::Side Face3::get_side_of(const Face3 &p_face, ClockDirection p_clock_dir) const { - int over = 0, under = 0; - - Plane plane = get_plane(p_clock_dir); - - for (int i = 0; i < 3; i++) { - const Vector3 &v = p_face.vertex[i]; - - if (plane.has_point(v)) { //coplanar, don't bother - continue; - } - - if (plane.is_point_over(v)) { - over++; - } else { - under++; - } - } - - if (over > 0 && under == 0) { - return SIDE_OVER; - } else if (under > 0 && over == 0) { - return SIDE_UNDER; - } else if (under == 0 && over == 0) { - return SIDE_COPLANAR; - } else { - return SIDE_SPANNING; - } -} - Vector3 Face3::get_random_point_inside() const { real_t a = Math::random(0.0, 1.0); real_t b = Math::random(0.0, 1.0); @@ -164,20 +134,10 @@ Plane Face3::get_plane(ClockDirection p_dir) const { return Plane(vertex[0], vertex[1], vertex[2], p_dir); } -Vector3 Face3::get_median_point() const { - return (vertex[0] + vertex[1] + vertex[2]) / 3.0f; -} - real_t Face3::get_area() const { return vec3_cross(vertex[0] - vertex[1], vertex[0] - vertex[2]).length() * 0.5f; } -ClockDirection Face3::get_clock_dir() const { - Vector3 normal = vec3_cross(vertex[0] - vertex[1], vertex[0] - vertex[2]); - //printf("normal is %g,%g,%g x %g,%g,%g- wtfu is %g\n",tofloat(normal.x),tofloat(normal.y),tofloat(normal.z),tofloat(vertex[0].x),tofloat(vertex[0].y),tofloat(vertex[0].z),tofloat( normal.dot( vertex[0] ) ) ); - return (normal.dot(vertex[0]) >= 0) ? CLOCKWISE : COUNTERCLOCKWISE; -} - bool Face3::intersects_aabb(const AABB &p_aabb) const { /** TEST PLANE **/ if (!p_aabb.intersects_plane(get_plane())) { diff --git a/core/math/face3.h b/core/math/face3.h index 3d87de03dc..3dd47d0226 100644 --- a/core/math/face3.h +++ b/core/math/face3.h @@ -57,19 +57,14 @@ struct _NO_DISCARD_ Face3 { Plane get_plane(ClockDirection p_dir = CLOCKWISE) const; Vector3 get_random_point_inside() const; - Side get_side_of(const Face3 &p_face, ClockDirection p_clock_dir = CLOCKWISE) const; - bool is_degenerate() const; real_t get_area() const; - Vector3 get_median_point() const; Vector3 get_closest_point_to(const Vector3 &p_point) const; bool intersects_ray(const Vector3 &p_from, const Vector3 &p_dir, Vector3 *p_intersection = nullptr) const; bool intersects_segment(const Vector3 &p_from, const Vector3 &p_dir, Vector3 *p_intersection = nullptr) const; - ClockDirection get_clock_dir() const; ///< todo, test if this is returning the proper clockwisity - void get_support(const Vector3 &p_normal, const Transform3D &p_transform, Vector3 *p_vertices, int *p_count, int p_max) const; void project_range(const Vector3 &p_normal, const Transform3D &p_transform, real_t &r_min, real_t &r_max) const; diff --git a/core/math/geometry_2d.cpp b/core/math/geometry_2d.cpp index a0b76d31cb..74cb92539a 100644 --- a/core/math/geometry_2d.cpp +++ b/core/math/geometry_2d.cpp @@ -320,41 +320,6 @@ Vector<Vector<Point2>> Geometry2D::_polypath_offset(const Vector<Point2> &p_poly return polypaths; } -Vector<Point2i> Geometry2D::pack_rects(const Vector<Size2i> &p_sizes, const Size2i &p_atlas_size) { - Vector<stbrp_node> nodes; - nodes.resize(p_atlas_size.width); - - stbrp_context context; - stbrp_init_target(&context, p_atlas_size.width, p_atlas_size.height, nodes.ptrw(), p_atlas_size.width); - - Vector<stbrp_rect> rects; - rects.resize(p_sizes.size()); - - for (int i = 0; i < p_sizes.size(); i++) { - rects.write[i].id = 0; - rects.write[i].w = p_sizes[i].width; - rects.write[i].h = p_sizes[i].height; - rects.write[i].x = 0; - rects.write[i].y = 0; - 
rects.write[i].was_packed = 0; - } - - int res = stbrp_pack_rects(&context, rects.ptrw(), rects.size()); - if (res == 0) { //pack failed - return Vector<Point2i>(); - } - - Vector<Point2i> ret; - ret.resize(p_sizes.size()); - - for (int i = 0; i < p_sizes.size(); i++) { - Point2i r(rects[i].x, rects[i].y); - ret.write[i] = r; - } - - return ret; -} - Vector<Vector3i> Geometry2D::partial_pack_rects(const Vector<Vector2i> &p_sizes, const Size2i &p_atlas_size) { Vector<stbrp_node> nodes; nodes.resize(p_atlas_size.width); diff --git a/core/math/geometry_2d.h b/core/math/geometry_2d.h index b55aecf85e..0e5702e0af 100644 --- a/core/math/geometry_2d.h +++ b/core/math/geometry_2d.h @@ -464,7 +464,6 @@ public: static Vector<Vector<Vector2>> decompose_polygon_in_convex(Vector<Point2> polygon); static void make_atlas(const Vector<Size2i> &p_rects, Vector<Point2i> &r_result, Size2i &r_size); - static Vector<Point2i> pack_rects(const Vector<Size2i> &p_sizes, const Size2i &p_atlas_size); static Vector<Vector3i> partial_pack_rects(const Vector<Vector2i> &p_sizes, const Size2i &p_atlas_size); private: diff --git a/core/math/geometry_3d.cpp b/core/math/geometry_3d.cpp index c04fe7320d..51523ea296 100644 --- a/core/math/geometry_3d.cpp +++ b/core/math/geometry_3d.cpp @@ -198,149 +198,6 @@ struct _FaceClassify { _FaceClassify() {} }; -static bool _connect_faces(_FaceClassify *p_faces, int len, int p_group) { - // Connect faces, error will occur if an edge is shared between more than 2 faces. - // Clear connections. - - bool error = false; - - for (int i = 0; i < len; i++) { - for (int j = 0; j < 3; j++) { - p_faces[i].links[j].clear(); - } - } - - for (int i = 0; i < len; i++) { - if (p_faces[i].group != p_group) { - continue; - } - for (int j = i + 1; j < len; j++) { - if (p_faces[j].group != p_group) { - continue; - } - - for (int k = 0; k < 3; k++) { - Vector3 vi1 = p_faces[i].face.vertex[k]; - Vector3 vi2 = p_faces[i].face.vertex[(k + 1) % 3]; - - for (int l = 0; l < 3; l++) { - Vector3 vj2 = p_faces[j].face.vertex[l]; - Vector3 vj1 = p_faces[j].face.vertex[(l + 1) % 3]; - - if (vi1.distance_to(vj1) < 0.00001f && - vi2.distance_to(vj2) < 0.00001f) { - if (p_faces[i].links[k].face != -1) { - ERR_PRINT("already linked\n"); - error = true; - break; - } - if (p_faces[j].links[l].face != -1) { - ERR_PRINT("already linked\n"); - error = true; - break; - } - - p_faces[i].links[k].face = j; - p_faces[i].links[k].edge = l; - p_faces[j].links[l].face = i; - p_faces[j].links[l].edge = k; - } - } - if (error) { - break; - } - } - if (error) { - break; - } - } - if (error) { - break; - } - } - - for (int i = 0; i < len; i++) { - p_faces[i].valid = true; - for (int j = 0; j < 3; j++) { - if (p_faces[i].links[j].face == -1) { - p_faces[i].valid = false; - } - } - } - return error; -} - -static bool _group_face(_FaceClassify *p_faces, int len, int p_index, int p_group) { - if (p_faces[p_index].group >= 0) { - return false; - } - - p_faces[p_index].group = p_group; - - for (int i = 0; i < 3; i++) { - ERR_FAIL_INDEX_V(p_faces[p_index].links[i].face, len, true); - _group_face(p_faces, len, p_faces[p_index].links[i].face, p_group); - } - - return true; -} - -Vector<Vector<Face3>> Geometry3D::separate_objects(Vector<Face3> p_array) { - Vector<Vector<Face3>> objects; - - int len = p_array.size(); - - const Face3 *arrayptr = p_array.ptr(); - - Vector<_FaceClassify> fc; - - fc.resize(len); - - _FaceClassify *_fcptr = fc.ptrw(); - - for (int i = 0; i < len; i++) { - _fcptr[i].face = arrayptr[i]; - } - - bool error = 
_connect_faces(_fcptr, len, -1); - - ERR_FAIL_COND_V_MSG(error, Vector<Vector<Face3>>(), "Invalid geometry."); - - // Group connected faces in separate objects. - - int group = 0; - for (int i = 0; i < len; i++) { - if (!_fcptr[i].valid) { - continue; - } - if (_group_face(_fcptr, len, i, group)) { - group++; - } - } - - // Group connected faces in separate objects. - - for (int i = 0; i < len; i++) { - _fcptr[i].face = arrayptr[i]; - } - - if (group >= 0) { - objects.resize(group); - Vector<Face3> *group_faces = objects.ptrw(); - - for (int i = 0; i < len; i++) { - if (!_fcptr[i].valid) { - continue; - } - if (_fcptr[i].group >= 0 && _fcptr[i].group < group) { - group_faces[_fcptr[i].group].push_back(_fcptr[i].face); - } - } - } - - return objects; -} - /*** GEOMETRY WRAPPER ***/ enum _CellFlags { diff --git a/core/math/geometry_3d.h b/core/math/geometry_3d.h index 6759db5766..99c554fe05 100644 --- a/core/math/geometry_3d.h +++ b/core/math/geometry_3d.h @@ -532,8 +532,6 @@ public: return clipped; } - static Vector<Vector<Face3>> separate_objects(Vector<Face3> p_array); - // Create a "wrap" that encloses the given geometry. static Vector<Face3> wrap_geometry(Vector<Face3> p_array, real_t *p_error = nullptr); diff --git a/core/math/transform_2d.cpp b/core/math/transform_2d.cpp index 6a7ee32230..910995d717 100644 --- a/core/math/transform_2d.cpp +++ b/core/math/transform_2d.cpp @@ -221,12 +221,6 @@ Transform2D Transform2D::operator*(const Transform2D &p_transform) const { return t; } -Transform2D Transform2D::basis_scaled(const Size2 &p_scale) const { - Transform2D copy = *this; - copy.scale_basis(p_scale); - return copy; -} - Transform2D Transform2D::scaled(const Size2 &p_scale) const { // Equivalent to left multiplication Transform2D copy = *this; diff --git a/core/math/transform_2d.h b/core/math/transform_2d.h index 2a0917c63f..4a17a9db37 100644 --- a/core/math/transform_2d.h +++ b/core/math/transform_2d.h @@ -85,7 +85,6 @@ struct _NO_DISCARD_ Transform2D { _FORCE_INLINE_ const Vector2 &get_origin() const { return columns[2]; } _FORCE_INLINE_ void set_origin(const Vector2 &p_origin) { columns[2] = p_origin; } - Transform2D basis_scaled(const Size2 &p_scale) const; Transform2D scaled(const Size2 &p_scale) const; Transform2D scaled_local(const Size2 &p_scale) const; Transform2D translated(const Vector2 &p_offset) const; diff --git a/core/math/triangle_mesh.cpp b/core/math/triangle_mesh.cpp index 4b6921d38b..0da1b8c7ad 100644 --- a/core/math/triangle_mesh.cpp +++ b/core/math/triangle_mesh.cpp @@ -182,90 +182,6 @@ void TriangleMesh::create(const Vector<Vector3> &p_faces, const Vector<int32_t> valid = true; } -Vector3 TriangleMesh::get_area_normal(const AABB &p_aabb) const { - uint32_t *stack = (uint32_t *)alloca(sizeof(int) * max_depth); - - enum { - TEST_AABB_BIT = 0, - VISIT_LEFT_BIT = 1, - VISIT_RIGHT_BIT = 2, - VISIT_DONE_BIT = 3, - VISITED_BIT_SHIFT = 29, - NODE_IDX_MASK = (1 << VISITED_BIT_SHIFT) - 1, - VISITED_BIT_MASK = ~NODE_IDX_MASK, - - }; - - int n_count = 0; - Vector3 n; - - int level = 0; - - const Triangle *triangleptr = triangles.ptr(); - // const Vector3 *verticesr = vertices.ptr(); - const BVH *bvhptr = bvh.ptr(); - - int pos = bvh.size() - 1; - - stack[0] = pos; - while (true) { - uint32_t node = stack[level] & NODE_IDX_MASK; - const BVH &b = bvhptr[node]; - bool done = false; - - switch (stack[level] >> VISITED_BIT_SHIFT) { - case TEST_AABB_BIT: { - if (!b.aabb.intersects(p_aabb)) { - stack[level] = (VISIT_DONE_BIT << VISITED_BIT_SHIFT) | node; - } else { - if 
(b.face_index >= 0) { - const Triangle &s = triangleptr[b.face_index]; - n += s.normal; - n_count++; - - stack[level] = (VISIT_DONE_BIT << VISITED_BIT_SHIFT) | node; - - } else { - stack[level] = (VISIT_LEFT_BIT << VISITED_BIT_SHIFT) | node; - } - } - continue; - } - case VISIT_LEFT_BIT: { - stack[level] = (VISIT_RIGHT_BIT << VISITED_BIT_SHIFT) | node; - level++; - stack[level] = b.left | TEST_AABB_BIT; - continue; - } - case VISIT_RIGHT_BIT: { - stack[level] = (VISIT_DONE_BIT << VISITED_BIT_SHIFT) | node; - level++; - stack[level] = b.right | TEST_AABB_BIT; - continue; - } - case VISIT_DONE_BIT: { - if (level == 0) { - done = true; - break; - } else { - level--; - } - continue; - } - } - - if (done) { - break; - } - } - - if (n_count > 0) { - n /= n_count; - } - - return n; -} - bool TriangleMesh::intersect_segment(const Vector3 &p_begin, const Vector3 &p_end, Vector3 &r_point, Vector3 &r_normal, int32_t *r_surf_index) const { uint32_t *stack = (uint32_t *)alloca(sizeof(int) * max_depth); @@ -468,118 +384,6 @@ bool TriangleMesh::intersect_ray(const Vector3 &p_begin, const Vector3 &p_dir, V return inters; } -bool TriangleMesh::intersect_convex_shape(const Plane *p_planes, int p_plane_count, const Vector3 *p_points, int p_point_count) const { - uint32_t *stack = (uint32_t *)alloca(sizeof(int) * max_depth); - - //p_fully_inside = true; - - enum { - TEST_AABB_BIT = 0, - VISIT_LEFT_BIT = 1, - VISIT_RIGHT_BIT = 2, - VISIT_DONE_BIT = 3, - VISITED_BIT_SHIFT = 29, - NODE_IDX_MASK = (1 << VISITED_BIT_SHIFT) - 1, - VISITED_BIT_MASK = ~NODE_IDX_MASK, - - }; - - int level = 0; - - const Triangle *triangleptr = triangles.ptr(); - const Vector3 *vertexptr = vertices.ptr(); - const BVH *bvhptr = bvh.ptr(); - - int pos = bvh.size() - 1; - - stack[0] = pos; - while (true) { - uint32_t node = stack[level] & NODE_IDX_MASK; - const BVH &b = bvhptr[node]; - bool done = false; - - switch (stack[level] >> VISITED_BIT_SHIFT) { - case TEST_AABB_BIT: { - if (!b.aabb.intersects_convex_shape(p_planes, p_plane_count, p_points, p_point_count)) { - stack[level] = (VISIT_DONE_BIT << VISITED_BIT_SHIFT) | node; - } else { - if (b.face_index >= 0) { - const Triangle &s = triangleptr[b.face_index]; - - for (int j = 0; j < 3; ++j) { - const Vector3 &point = vertexptr[s.indices[j]]; - const Vector3 &next_point = vertexptr[s.indices[(j + 1) % 3]]; - Vector3 res; - bool over = true; - for (int i = 0; i < p_plane_count; i++) { - const Plane &p = p_planes[i]; - - if (p.intersects_segment(point, next_point, &res)) { - bool inisde = true; - for (int k = 0; k < p_plane_count; k++) { - if (k == i) { - continue; - } - const Plane &pp = p_planes[k]; - if (pp.is_point_over(res)) { - inisde = false; - break; - } - } - if (inisde) { - return true; - } - } - - if (p.is_point_over(point)) { - over = false; - break; - } - } - if (over) { - return true; - } - } - - stack[level] = (VISIT_DONE_BIT << VISITED_BIT_SHIFT) | node; - - } else { - stack[level] = (VISIT_LEFT_BIT << VISITED_BIT_SHIFT) | node; - } - } - continue; - } - case VISIT_LEFT_BIT: { - stack[level] = (VISIT_RIGHT_BIT << VISITED_BIT_SHIFT) | node; - level++; - stack[level] = b.left | TEST_AABB_BIT; - continue; - } - case VISIT_RIGHT_BIT: { - stack[level] = (VISIT_DONE_BIT << VISITED_BIT_SHIFT) | node; - level++; - stack[level] = b.right | TEST_AABB_BIT; - continue; - } - case VISIT_DONE_BIT: { - if (level == 0) { - done = true; - break; - } else { - level--; - } - continue; - } - } - - if (done) { - break; - } - } - - return false; -} - bool TriangleMesh::inside_convex_shape(const 
Plane *p_planes, int p_plane_count, const Vector3 *p_points, int p_point_count, Vector3 p_scale) const { uint32_t *stack = (uint32_t *)alloca(sizeof(int) * max_depth); diff --git a/core/math/triangle_mesh.h b/core/math/triangle_mesh.h index 728fd600d5..24fc12dda9 100644 --- a/core/math/triangle_mesh.h +++ b/core/math/triangle_mesh.h @@ -84,9 +84,7 @@ public: bool is_valid() const; bool intersect_segment(const Vector3 &p_begin, const Vector3 &p_end, Vector3 &r_point, Vector3 &r_normal, int32_t *r_surf_index = nullptr) const; bool intersect_ray(const Vector3 &p_begin, const Vector3 &p_dir, Vector3 &r_point, Vector3 &r_normal, int32_t *r_surf_index = nullptr) const; - bool intersect_convex_shape(const Plane *p_planes, int p_plane_count, const Vector3 *p_points, int p_point_count) const; bool inside_convex_shape(const Plane *p_planes, int p_plane_count, const Vector3 *p_points, int p_point_count, Vector3 p_scale = Vector3(1, 1, 1)) const; - Vector3 get_area_normal(const AABB &p_aabb) const; Vector<Face3> get_faces() const; const Vector<Triangle> &get_triangles() const { return triangles; } diff --git a/core/variant/variant.h b/core/variant/variant.h index fff59c43a6..b9294de77d 100644 --- a/core/variant/variant.h +++ b/core/variant/variant.h @@ -655,6 +655,7 @@ public: static bool has_indexing(Variant::Type p_type); static Variant::Type get_indexed_element_type(Variant::Type p_type); + static uint32_t get_indexed_element_usage(Variant::Type p_type); typedef void (*ValidatedIndexedSetter)(Variant *base, int64_t index, const Variant *value, bool *oob); typedef void (*ValidatedIndexedGetter)(const Variant *base, int64_t index, Variant *value, bool *oob); diff --git a/core/variant/variant_setget.cpp b/core/variant/variant_setget.cpp index a74556d88f..ba37e15f31 100644 --- a/core/variant/variant_setget.cpp +++ b/core/variant/variant_setget.cpp @@ -389,6 +389,7 @@ Variant Variant::get_named(const StringName &p_member, bool &r_valid) const { v.write[index] = PtrToArg<m_elem_type>::convert(member); \ } \ static Variant::Type get_index_type() { return GetTypeInfo<m_elem_type>::VARIANT_TYPE; } \ + static uint32_t get_index_usage() { return GetTypeInfo<m_elem_type>::get_class_info().usage; } \ static uint64_t get_indexed_size(const Variant *base) { return VariantGetInternalPtr<m_base_type>::get_ptr(base)->size(); } \ }; @@ -460,6 +461,7 @@ Variant Variant::get_named(const StringName &p_member, bool &r_valid) const { v.write[index] = PtrToArg<m_elem_type>::convert(member); \ } \ static Variant::Type get_index_type() { return GetTypeInfo<m_elem_type>::VARIANT_TYPE; } \ + static uint32_t get_index_usage() { return GetTypeInfo<m_elem_type>::get_class_info().usage; } \ static uint64_t get_indexed_size(const Variant *base) { return VariantGetInternalPtr<m_base_type>::get_ptr(base)->size(); } \ }; @@ -515,6 +517,7 @@ Variant Variant::get_named(const StringName &p_member, bool &r_valid) const { v[index] = PtrToArg<m_elem_type>::convert(member); \ } \ static Variant::Type get_index_type() { return GetTypeInfo<m_elem_type>::VARIANT_TYPE; } \ + static uint32_t get_index_usage() { return GetTypeInfo<m_elem_type>::get_class_info().usage; } \ static uint64_t get_indexed_size(const Variant *base) { return m_max; } \ }; @@ -564,6 +567,7 @@ Variant Variant::get_named(const StringName &p_member, bool &r_valid) const { v m_accessor[index] = PtrToArg<m_elem_type>::convert(member); \ } \ static Variant::Type get_index_type() { return GetTypeInfo<m_elem_type>::VARIANT_TYPE; } \ + static uint32_t get_index_usage() { return 
GetTypeInfo<m_elem_type>::get_class_info().usage; } \ static uint64_t get_indexed_size(const Variant *base) { return m_max; } \ }; @@ -613,6 +617,7 @@ Variant Variant::get_named(const StringName &p_member, bool &r_valid) const { v.m_set(index, PtrToArg<m_elem_type>::convert(member)); \ } \ static Variant::Type get_index_type() { return GetTypeInfo<m_elem_type>::VARIANT_TYPE; } \ + static uint32_t get_index_usage() { return GetTypeInfo<m_elem_type>::get_class_info().usage; } \ static uint64_t get_indexed_size(const Variant *base) { return m_max; } \ }; @@ -683,6 +688,7 @@ struct VariantIndexedSetGet_Array { v.set(index, PtrToArg<Variant>::convert(member)); } static Variant::Type get_index_type() { return Variant::NIL; } + static uint32_t get_index_usage() { return PROPERTY_USAGE_NIL_IS_VARIANT; } static uint64_t get_indexed_size(const Variant *base) { return 0; } }; @@ -768,6 +774,7 @@ struct VariantIndexedSetGet_String { } } static Variant::Type get_index_type() { return Variant::STRING; } + static uint32_t get_index_usage() { return PROPERTY_USAGE_DEFAULT; } static uint64_t get_indexed_size(const Variant *base) { return VariantInternal::get_string(base)->length(); } }; @@ -812,6 +819,7 @@ struct VariantIndexedSetGet_String { v[index] = PtrToArg<Variant>::convert(member); \ } \ static Variant::Type get_index_type() { return Variant::NIL; } \ + static uint32_t get_index_usage() { return PROPERTY_USAGE_DEFAULT; } \ static uint64_t get_indexed_size(const Variant *base) { return VariantGetInternalPtr<m_base_type>::get_ptr(base)->size(); } \ }; @@ -852,7 +860,8 @@ struct VariantIndexedSetterGetterInfo { uint64_t (*get_indexed_size)(const Variant *base) = nullptr; - Variant::Type index_type; + Variant::Type index_type = Variant::NIL; + uint32_t index_usage = PROPERTY_USAGE_DEFAULT; bool valid = false; }; @@ -872,6 +881,7 @@ static void register_indexed_member(Variant::Type p_type) { sgi.ptr_getter = T::ptr_get; sgi.index_type = T::get_index_type(); + sgi.index_usage = T::get_index_usage(); sgi.get_indexed_size = T::get_indexed_size; sgi.valid = true; @@ -920,6 +930,11 @@ Variant::Type Variant::get_indexed_element_type(Variant::Type p_type) { return variant_indexed_setters_getters[p_type].index_type; } +uint32_t Variant::get_indexed_element_usage(Variant::Type p_type) { + ERR_FAIL_INDEX_V(p_type, Variant::VARIANT_MAX, PROPERTY_USAGE_DEFAULT); + return variant_indexed_setters_getters[p_type].index_usage; +} + Variant::ValidatedIndexedSetter Variant::get_member_validated_indexed_setter(Variant::Type p_type) { ERR_FAIL_INDEX_V(p_type, Variant::VARIANT_MAX, nullptr); return variant_indexed_setters_getters[p_type].validated_setter; diff --git a/doc/classes/AnimationNode.xml b/doc/classes/AnimationNode.xml index a33ec2f6dc..6e3345b675 100644 --- a/doc/classes/AnimationNode.xml +++ b/doc/classes/AnimationNode.xml @@ -49,6 +49,13 @@ When inheriting from [AnimationRootNode], implement this virtual method to return whether the blend tree editor should display filter editing on this node. </description> </method> + <method name="_is_parameter_read_only" qualifiers="virtual const"> + <return type="bool" /> + <param index="0" name="parameter" type="StringName" /> + <description> + When inheriting from [AnimationRootNode], implement this virtual method to return whether the [param parameter] is read-only. Parameters are custom local memory used for your nodes, given a resource can be reused in multiple trees. 
+			</description>
+		</method>
		<method name="_process" qualifiers="virtual const">
			<return type="float" />
			<param index="0" name="time" type="float" />
diff --git a/doc/classes/AnimationNodeOneShot.xml b/doc/classes/AnimationNodeOneShot.xml
index 14abc34992..9e8193868c 100644
--- a/doc/classes/AnimationNodeOneShot.xml
+++ b/doc/classes/AnimationNodeOneShot.xml
@@ -28,6 +28,12 @@
		</member>
	</members>
	<constants>
+		<constant name="ONE_SHOT_REQUEST_NONE" value="0" enum="OneShotRequest">
+		</constant>
+		<constant name="ONE_SHOT_REQUEST_FIRE" value="1" enum="OneShotRequest">
+		</constant>
+		<constant name="ONE_SHOT_REQUEST_ABORT" value="2" enum="OneShotRequest">
+		</constant>
		<constant name="MIX_MODE_BLEND" value="0" enum="MixMode">
		</constant>
		<constant name="MIX_MODE_ADD" value="1" enum="MixMode">
diff --git a/doc/classes/AnimationNodeStateMachinePlayback.xml b/doc/classes/AnimationNodeStateMachinePlayback.xml
index 8f53ef0dcf..7e01be12c4 100644
--- a/doc/classes/AnimationNodeStateMachinePlayback.xml
+++ b/doc/classes/AnimationNodeStateMachinePlayback.xml
@@ -50,11 +50,19 @@
				Returns [code]true[/code] if an animation is playing.
			</description>
		</method>
+		<method name="next">
+			<return type="void" />
+			<description>
+				If there is a next path by travel or auto advance, immediately transitions from the current state to the next state.
+			</description>
+		</method>
		<method name="start">
			<return type="void" />
			<param index="0" name="node" type="StringName" />
+			<param index="1" name="reset" type="bool" default="true" />
			<description>
				Starts playing the given animation.
+				If [param reset] is [code]true[/code], the animation is played from the beginning.
			</description>
		</method>
		<method name="stop">
@@ -66,8 +74,11 @@
		<method name="travel">
			<return type="void" />
			<param index="0" name="to_node" type="StringName" />
+			<param index="1" name="reset_on_teleport" type="bool" default="true" />
			<description>
				Transitions from the current state to another one, following the shortest path.
+				If the path does not connect from the current state, the animation will play after the state teleports.
+				If [param reset_on_teleport] is [code]true[/code], the animation is played from the beginning when the travel causes a teleportation.
			</description>
		</method>
	</methods>
diff --git a/doc/classes/AnimationNodeStateMachineTransition.xml b/doc/classes/AnimationNodeStateMachineTransition.xml
index 814b2d0052..eee25fad7c 100644
--- a/doc/classes/AnimationNodeStateMachineTransition.xml
+++ b/doc/classes/AnimationNodeStateMachineTransition.xml
@@ -28,6 +28,9 @@
		<member name="priority" type="int" setter="set_priority" getter="get_priority" default="1">
			Lower priority transitions are preferred when travelling through the tree via [method AnimationNodeStateMachinePlayback.travel] or [member advance_mode] is set to [constant ADVANCE_MODE_AUTO].
		</member>
+		<member name="reset" type="bool" setter="set_reset" getter="is_reset" default="true">
+			If [code]true[/code], the destination animation is played back from the beginning when switched.
+		</member>
		<member name="switch_mode" type="int" setter="set_switch_mode" getter="get_switch_mode" enum="AnimationNodeStateMachineTransition.SwitchMode" default="0">
			The transition type.
		</member>
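To make the new reset behavior above concrete, here is a hedged engine-side C++ sketch. The tree pointer, state names, and the wrapper function are assumptions based on the documentation hunks, not code from this diff:

```cpp
#include "scene/animation/animation_node_state_machine.h"
#include "scene/animation/animation_tree.h"

// Hypothetical illustration of the new reset flags on the state machine
// playback object; "idle" and "run" are invented state names.
static void demo_reset_flags(AnimationTree *p_tree) {
	Ref<AnimationNodeStateMachinePlayback> playback = p_tree->get("parameters/playback");
	if (playback.is_valid()) {
		playback->start("idle");        // reset defaults to true: play "idle" from the start.
		playback->travel("run", false); // reset_on_teleport = false: keep position if a teleport happens.
	}
}
```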
diff --git a/doc/classes/AnimationNodeTransition.xml b/doc/classes/AnimationNodeTransition.xml
index f6e2fc5eb2..bca94a568a 100644
--- a/doc/classes/AnimationNodeTransition.xml
+++ b/doc/classes/AnimationNodeTransition.xml
@@ -12,6 +12,12 @@
		<link title="Third Person Shooter Demo">https://godotengine.org/asset-library/asset/678</link>
	</tutorials>
	<methods>
+		<method name="find_input_caption" qualifiers="const">
+			<return type="int" />
+			<param index="0" name="caption" type="String" />
+			<description>
+			</description>
+		</method>
		<method name="get_input_caption" qualifiers="const">
			<return type="String" />
			<param index="0" name="input" type="int" />
			<description>
			</description>
		</method>
@@ -43,7 +49,7 @@
		<member name="enabled_inputs" type="int" setter="set_enabled_inputs" getter="get_enabled_inputs" default="0">
			The number of enabled input ports for this node.
		</member>
-		<member name="from_start" type="bool" setter="set_from_start" getter="is_from_start" default="true">
+		<member name="reset" type="bool" setter="set_reset" getter="is_reset" default="true">
			If [code]true[/code], the destination animation is played back from the beginning when switched.
		</member>
		<member name="xfade_curve" type="Curve" setter="set_xfade_curve" getter="get_xfade_curve">
diff --git a/doc/classes/Array.xml b/doc/classes/Array.xml
index 21ccf79fe2..ce4d7693d8 100644
--- a/doc/classes/Array.xml
+++ b/doc/classes/Array.xml
@@ -679,7 +679,7 @@
			</description>
		</operator>
		<operator name="operator []">
-			<return type="void" />
+			<return type="Variant" />
			<param index="0" name="index" type="int" />
			<description>
				Returns a reference to the element of type [Variant] at the specified location. Arrays start at index 0. [param index] can be a zero or positive value to start from the beginning, or a negative value to start from the end. Out-of-bounds array access causes a run-time error, which will result in an error being printed and the project execution pausing if run from the editor.
diff --git a/doc/classes/ArrayMesh.xml b/doc/classes/ArrayMesh.xml
index f7764d5e32..7b86afcc4c 100644
--- a/doc/classes/ArrayMesh.xml
+++ b/doc/classes/ArrayMesh.xml
@@ -65,11 +65,15 @@
		<param index="1" name="arrays" type="Array" />
		<param index="2" name="blend_shapes" type="Array[]" default="[]" />
		<param index="3" name="lods" type="Dictionary" default="{}" />
-		<param index="4" name="compress_flags" type="int" enum="Mesh.ArrayFormat" default="0" />
-		<description>
-			Creates a new surface.
-			Surfaces are created to be rendered using a [param primitive], which may be any of the types defined in [enum Mesh.PrimitiveType]. (As a note, when using indices, it is recommended to only use points, lines, or triangles.) [method Mesh.get_surface_count] will become the [code]surf_idx[/code] for this new surface.
-			The [param arrays] argument is an array of arrays. See [enum Mesh.ArrayType] for the values used in this array. For example, [code]arrays[0][/code] is the array of vertices. That first vertex sub-array is always required; the others are optional. Adding an index array puts this function into "index mode" where the vertex and other arrays become the sources of data and the index array defines the vertex order. All sub-arrays must have the same length as the vertex array (or be an exact multiple of the vertex array's length, when multiple elements of a sub-array correspond to a single vertex) or be empty, except for [constant Mesh.ARRAY_INDEX] if it is used.
+ <param index="4" name="flags" type="int" enum="Mesh.ArrayFormat" default="0" /> + <description> + Creates a new surface. [method Mesh.get_surface_count] will become the [code]surf_idx[/code] for this new surface. + Surfaces are created to be rendered using a [param primitive], which may be any of the values defined in [enum Mesh.PrimitiveType]. + The [param arrays] argument is an array of arrays. Each of the [constant Mesh.ARRAY_MAX] elements contains an array with some of the mesh data for this surface as described by the corresponding member of [enum Mesh.ArrayType] or [code]null[/code] if it is not used by the surface. For example, [code]arrays[0][/code] is the array of vertices. That first vertex sub-array is always required; the others are optional. Adding an index array puts this surface into "index mode" where the vertex and other arrays become the sources of data and the index array defines the vertex order. All sub-arrays must have the same length as the vertex array (or be an exact multiple of the vertex array's length, when multiple elements of a sub-array correspond to a single vertex) or be empty, except for [constant Mesh.ARRAY_INDEX] if it is used. + The [param blend_shapes] argument is an array of vertex data for each blend shape. Each element is an array of the same structure as [param arrays], but [constant Mesh.ARRAY_VERTEX], [constant Mesh.ARRAY_NORMAL], and [constant Mesh.ARRAY_TANGENT] are set if and only if they are set in [param arrays] and all other entries are [code]null[/code]. + The [param lods] argument is a dictionary with [float] keys and [PackedInt32Array] values. Each entry in the dictionary represents a LOD level of the surface, where the value is the [constant Mesh.ARRAY_INDEX] array to use for the LOD level and the key is roughly proportional to the distance at which the LOD stats being used. I.e., increasing the key of a LOD also increases the distance that the objects has to be from the camera before the LOD is used. + The [param flags] argument is the bitwise or of, as required: One value of [enum Mesh.ArrayCustomFormat] left shifted by [code]ARRAY_FORMAT_CUSTOMn_SHIFT[/code] for each custom channel in use, [constant Mesh.ARRAY_FLAG_USE_DYNAMIC_UPDATE], [constant Mesh.ARRAY_FLAG_USE_8_BONE_WEIGHTS], or [constant Mesh.ARRAY_FLAG_USES_EMPTY_VERTEX_ARRAY]. + [b]Note:[/b] When using indices, it is recommended to only use points, lines, or triangles. </description> </method> <method name="clear_blend_shapes"> diff --git a/doc/classes/Control.xml b/doc/classes/Control.xml index 75afb0cdbf..7082eff97d 100644 --- a/doc/classes/Control.xml +++ b/doc/classes/Control.xml @@ -196,12 +196,12 @@ </description> </method> <method name="_structured_text_parser" qualifiers="virtual const"> - <return type="Vector2i[]" /> + <return type="Vector3i[]" /> <param index="0" name="args" type="Array" /> <param index="1" name="text" type="String" /> <description> User defined BiDi algorithm override function. - Returns an [Array] of [Vector2i] text ranges, in the left-to-right order. Ranges should cover full source [param text] without overlaps. BiDi algorithm will be used on each range separately. + Returns an [Array] of [Vector3i] text ranges and text base directions, in the left-to-right order. Ranges should cover full source [param text] without overlaps. BiDi algorithm will be used on each range separately. 
</description> </method> <method name="accept_event"> diff --git a/doc/classes/ImporterMesh.xml b/doc/classes/ImporterMesh.xml index b80857a7bf..10479dfcfe 100644 --- a/doc/classes/ImporterMesh.xml +++ b/doc/classes/ImporterMesh.xml @@ -27,9 +27,13 @@ <param index="5" name="name" type="String" default="""" /> <param index="6" name="flags" type="int" default="0" /> <description> - Creates a new surface, analogous to [method ArrayMesh.add_surface_from_arrays]. - Surfaces are created to be rendered using a [param primitive], which may be any of the types defined in [enum Mesh.PrimitiveType]. (As a note, when using indices, it is recommended to only use points, lines, or triangles.) [method Mesh.get_surface_count] will become the [code]surf_idx[/code] for this new surface. - The [param arrays] argument is an array of arrays. See [enum Mesh.ArrayType] for the values used in this array. For example, [code]arrays[0][/code] is the array of vertices. That first vertex sub-array is always required; the others are optional. Adding an index array puts this function into "index mode" where the vertex and other arrays become the sources of data and the index array defines the vertex order. All sub-arrays must have the same length as the vertex array (or be an exact multiple of the vertex array's length, when multiple elements of a sub-array correspond to a single vertex) or be empty, except for [constant Mesh.ARRAY_INDEX] if it is used. + Creates a new surface. [method Mesh.get_surface_count] will become the [code]surf_idx[/code] for this new surface. + Surfaces are created to be rendered using a [param primitive], which may be any of the values defined in [enum Mesh.PrimitiveType]. + The [param arrays] argument is an array of arrays. Each of the [constant Mesh.ARRAY_MAX] elements contains an array with some of the mesh data for this surface as described by the corresponding member of [enum Mesh.ArrayType] or [code]null[/code] if it is not used by the surface. For example, [code]arrays[0][/code] is the array of vertices. That first vertex sub-array is always required; the others are optional. Adding an index array puts this surface into "index mode" where the vertex and other arrays become the sources of data and the index array defines the vertex order. All sub-arrays must have the same length as the vertex array (or be an exact multiple of the vertex array's length, when multiple elements of a sub-array correspond to a single vertex) or be empty, except for [constant Mesh.ARRAY_INDEX] if it is used. + The [param blend_shapes] argument is an array of vertex data for each blend shape. Each element is an array of the same structure as [param arrays], but [constant Mesh.ARRAY_VERTEX], [constant Mesh.ARRAY_NORMAL], and [constant Mesh.ARRAY_TANGENT] are set if and only if they are set in [param arrays] and all other entries are [code]null[/code]. + The [param lods] argument is a dictionary with [float] keys and [PackedInt32Array] values. Each entry in the dictionary represents a LOD level of the surface, where the value is the [constant Mesh.ARRAY_INDEX] array to use for the LOD level and the key is roughly proportional to the distance at which the LOD stats being used. I.e., increasing the key of a LOD also increases the distance that the objects has to be from the camera before the LOD is used. 
+ The [param flags] argument is the bitwise or of, as required: One value of [enum Mesh.ArrayCustomFormat] left shifted by [code]ARRAY_FORMAT_CUSTOMn_SHIFT[/code] for each custom channel in use, [constant Mesh.ARRAY_FLAG_USE_DYNAMIC_UPDATE], [constant Mesh.ARRAY_FLAG_USE_8_BONE_WEIGHTS], or [constant Mesh.ARRAY_FLAG_USES_EMPTY_VERTEX_ARRAY]. + [b]Note:[/b] When using indices, it is recommended to only use points, lines, or triangles. </description> </method> <method name="clear"> diff --git a/doc/classes/Mesh.xml b/doc/classes/Mesh.xml index 94e80ffb2b..1c1f48588f 100644 --- a/doc/classes/Mesh.xml +++ b/doc/classes/Mesh.xml @@ -227,10 +227,10 @@ Contains custom color channel 3. [PackedByteArray] if [code](format >> [constant ARRAY_FORMAT_CUSTOM3_SHIFT]) & [constant ARRAY_FORMAT_CUSTOM_MASK])[/code] is [constant ARRAY_CUSTOM_RGBA8_UNORM], [constant ARRAY_CUSTOM_RGBA8_UNORM], [constant ARRAY_CUSTOM_RG_HALF] or [constant ARRAY_CUSTOM_RGBA_HALF]. [PackedFloat32Array] otherwise. </constant> <constant name="ARRAY_BONES" value="10" enum="ArrayType"> - [PackedFloat32Array] or [PackedInt32Array] of bone indices. Each element is a group of 4 numbers. + [PackedFloat32Array] or [PackedInt32Array] of bone indices. Contains either 4 or 8 numbers per vertex depending on the presence of the [constant ARRAY_FLAG_USE_8_BONE_WEIGHTS] flag. </constant> <constant name="ARRAY_WEIGHTS" value="11" enum="ArrayType"> - [PackedFloat32Array] of bone weights. Each element in groups of 4 floats. + [PackedFloat32Array] or [PackedFloat64Array] of bone weights in the range [code]0.0[/code] to [code]1.0[/code] (inclusive). Contains either 4 or 8 numbers per vertex depending on the presence of the [constant ARRAY_FLAG_USE_8_BONE_WEIGHTS] flag. </constant> <constant name="ARRAY_INDEX" value="12" enum="ArrayType"> [PackedInt32Array] of integers used as indices referencing vertices, colors, normals, tangents, and textures. All of those arrays must have the same number of elements as the vertex array. No index can be beyond the vertex array size. When this index array is present, it puts the function into "index mode," where the index selects the *i*'th vertex, normal, tangent, color, UV, etc. This means if you want to have different normals or colors along an edge, you have to duplicate the vertices. @@ -341,6 +341,9 @@ <constant name="ARRAY_FLAG_USE_8_BONE_WEIGHTS" value="134217728" enum="ArrayFormat" is_bitfield="true"> Flag used to mark that the mesh contains up to 8 bone influences per vertex. This flag indicates that [constant ARRAY_BONES] and [constant ARRAY_WEIGHTS] elements will have double length. </constant> + <constant name="ARRAY_FLAG_USES_EMPTY_VERTEX_ARRAY" value="268435456" enum="ArrayFormat" is_bitfield="true"> + Flag used to mark that the mesh intentionally contains no vertex array. + </constant> <constant name="BLEND_SHAPE_MODE_NORMALIZED" value="0" enum="BlendShapeMode"> Blend shapes are normalized. </constant> diff --git a/doc/classes/RenderingServer.xml b/doc/classes/RenderingServer.xml index 33170e6606..3d7fb0d445 100644 --- a/doc/classes/RenderingServer.xml +++ b/doc/classes/RenderingServer.xml @@ -3810,6 +3810,8 @@ </constant> <constant name="ARRAY_FLAG_USE_8_BONE_WEIGHTS" value="134217728" enum="ArrayFormat" is_bitfield="true"> </constant> + <constant name="ARRAY_FLAG_USES_EMPTY_VERTEX_ARRAY" value="268435456" enum="ArrayFormat" is_bitfield="true"> + </constant> <constant name="PRIMITIVE_POINTS" value="0" enum="PrimitiveType"> Primitive to draw consists of points. 
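The surface-creation contract documented above is easier to see in code. Below is a minimal engine-side C++ sketch; the one-triangle data and the helper name are invented for illustration, while add_surface_from_arrays and the Mesh constants come from the engine API:

```cpp
#include "scene/resources/mesh.h"

// Build a one-triangle surface: only the vertex sub-array is filled, every
// other Mesh::ARRAY_MAX slot stays empty (no blend shapes, LODs, or flags).
static Ref<ArrayMesh> make_triangle_mesh() {
	PackedVector3Array vertices;
	vertices.push_back(Vector3(0, 1, 0));
	vertices.push_back(Vector3(1, 0, 0));
	vertices.push_back(Vector3(0, 0, 1));

	Array arrays;
	arrays.resize(Mesh::ARRAY_MAX);
	arrays[Mesh::ARRAY_VERTEX] = vertices;

	Ref<ArrayMesh> mesh;
	mesh.instantiate();
	mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, arrays);
	return mesh;
}
```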
diff --git a/doc/classes/TextServer.xml b/doc/classes/TextServer.xml
index d2c6dee373..711fb89217 100644
--- a/doc/classes/TextServer.xml
+++ b/doc/classes/TextServer.xml
@@ -1042,7 +1042,7 @@
			</description>
		</method>
		<method name="parse_structured_text" qualifiers="const">
-			<return type="Vector2i[]" />
+			<return type="Vector3i[]" />
			<param index="0" name="parser_type" type="int" enum="TextServer.StructuredTextParser" />
			<param index="1" name="args" type="Array" />
			<param index="2" name="text" type="String" />
@@ -1634,6 +1634,9 @@
		<constant name="DIRECTION_RTL" value="2" enum="Direction">
			Text is written from right to left.
		</constant>
+		<constant name="DIRECTION_INHERITED" value="3" enum="Direction">
+			Text writing direction is the same as base string writing direction. Used for BiDi override only.
+		</constant>
		<constant name="ORIENTATION_HORIZONTAL" value="0" enum="Orientation">
			Text is written horizontally.
		</constant>
@@ -1881,7 +1884,7 @@
			Font have fixed-width characters.
		</constant>
		<constant name="STRUCTURED_TEXT_DEFAULT" value="0" enum="StructuredTextParser">
-			Use default behavior. Same as [constant STRUCTURED_TEXT_NONE] unless specified otherwise in the control description.
+			Use default Unicode BiDi algorithm.
		</constant>
		<constant name="STRUCTURED_TEXT_URI" value="1" enum="StructuredTextParser">
			BiDi override for URI.
@@ -1896,8 +1899,8 @@
			BiDi override for lists. Structured text options: list separator [code]String[/code].
		</constant>
-		<constant name="STRUCTURED_TEXT_NONE" value="5" enum="StructuredTextParser">
-			Use default Unicode BiDi algorithm.
+		<constant name="STRUCTURED_TEXT_GDSCRIPT" value="5" enum="StructuredTextParser">
+			BiDi override for GDScript.
		</constant>
		<constant name="STRUCTURED_TEXT_CUSTOM" value="6" enum="StructuredTextParser">
			User defined structured text BiDi override function.
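To illustrate the Vector2i-to-Vector3i change in these text server hunks: each returned range now carries its own base direction in the third component. A hedged C++ sketch follows; the helper function is hypothetical, and only the packing and the new DIRECTION_INHERITED constant come from this diff:

```cpp
#include "core/variant/typed_array.h"
#include "servers/text_server.h"

// Hypothetical body for a structured-text override: return one range
// covering the full string, packed as (start, end, base direction).
static TypedArray<Vector3i> parse_whole_string(const String &p_text) {
	TypedArray<Vector3i> ranges;
	// DIRECTION_INHERITED (added in this diff) keeps the base string direction.
	ranges.push_back(Vector3i(0, p_text.length(), TextServer::DIRECTION_INHERITED));
	return ranges;
}
```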
diff --git a/doc/classes/TextServerExtension.xml b/doc/classes/TextServerExtension.xml index e144b09eb6..f4b306cf96 100644 --- a/doc/classes/TextServerExtension.xml +++ b/doc/classes/TextServerExtension.xml @@ -896,7 +896,7 @@ </description> </method> <method name="_parse_structured_text" qualifiers="virtual const"> - <return type="Vector2i[]" /> + <return type="Vector3i[]" /> <param index="0" name="parser_type" type="int" enum="TextServer.StructuredTextParser" /> <param index="1" name="args" type="Array" /> <param index="2" name="text" type="String" /> diff --git a/editor/code_editor.cpp b/editor/code_editor.cpp index df8adf01e4..644735a4d8 100644 --- a/editor/code_editor.cpp +++ b/editor/code_editor.cpp @@ -2086,6 +2086,7 @@ CodeTextEditor::CodeTextEditor() { text_editor = memnew(CodeEdit); add_child(text_editor); text_editor->set_v_size_flags(SIZE_EXPAND_FILL); + text_editor->set_structured_text_bidi_override(TextServer::STRUCTURED_TEXT_GDSCRIPT); int ot_mode = EDITOR_GET("interface/editor/code_font_contextual_ligatures"); Ref<FontVariation> fc = text_editor->get_theme_font(SNAME("font")); diff --git a/editor/doc_tools.cpp b/editor/doc_tools.cpp index c675060b2b..5bdef32c60 100644 --- a/editor/doc_tools.cpp +++ b/editor/doc_tools.cpp @@ -750,6 +750,7 @@ void DocTools::generate(bool p_basic_types) { MethodInfo mi; mi.name = "operator []"; mi.return_val.type = Variant::get_indexed_element_type(Variant::Type(i)); + mi.return_val.usage = Variant::get_indexed_element_usage(Variant::Type(i)); PropertyInfo arg; arg.name = "index"; arg.type = Variant::INT; diff --git a/editor/editor_data.cpp b/editor/editor_data.cpp index 6e66962605..3059ce445c 100644 --- a/editor/editor_data.cpp +++ b/editor/editor_data.cpp @@ -122,12 +122,6 @@ int EditorSelectionHistory::get_history_pos() { return current_elem_idx; } -bool EditorSelectionHistory::is_history_obj_inspector_only(int p_obj) const { - ERR_FAIL_INDEX_V(p_obj, history.size(), false); - ERR_FAIL_INDEX_V(history[p_obj].level, history[p_obj].path.size(), false); - return history[p_obj].path[history[p_obj].level].inspector_only; -} - ObjectID EditorSelectionHistory::get_history_obj(int p_obj) const { ERR_FAIL_INDEX_V(p_obj, history.size(), ObjectID()); ERR_FAIL_INDEX_V(history[p_obj].level, history[p_obj].path.size(), ObjectID()); @@ -351,18 +345,6 @@ void EditorData::apply_changes_in_editors() { } } -void EditorData::save_editor_global_states() { - for (int i = 0; i < editor_plugins.size(); i++) { - editor_plugins[i]->save_global_state(); - } -} - -void EditorData::restore_editor_global_states() { - for (int i = 0; i < editor_plugins.size(); i++) { - editor_plugins[i]->restore_global_state(); - } -} - void EditorData::paste_object_params(Object *p_object) { ERR_FAIL_NULL(p_object); undo_redo_manager->create_action(TTR("Paste Params")); diff --git a/editor/editor_data.h b/editor/editor_data.h index bce9dd345d..6a89b3572c 100644 --- a/editor/editor_data.h +++ b/editor/editor_data.h @@ -80,7 +80,6 @@ public: // Gets an object from the history. The most recent object would be the object with p_obj = get_history_len() - 1. 
ObjectID get_history_obj(int p_obj) const; - bool is_history_obj_inspector_only(int p_obj) const; bool next(); bool previous(); @@ -177,7 +176,6 @@ public: Callable get_move_array_element_function(const StringName &p_class) const; void save_editor_global_states(); - void restore_editor_global_states(); void add_custom_type(const String &p_type, const String &p_inherits, const Ref<Script> &p_script, const Ref<Texture2D> &p_icon); Variant instantiate_custom_type(const String &p_type, const String &p_inherits); diff --git a/editor/editor_node.cpp b/editor/editor_node.cpp index 7beea04589..b0278030f9 100644 --- a/editor/editor_node.cpp +++ b/editor/editor_node.cpp @@ -7400,7 +7400,6 @@ EditorNode::EditorNode() { _update_recent_scenes(); - editor_data.restore_editor_global_states(); set_process_shortcut_input(true); load_errors = memnew(RichTextLabel); diff --git a/editor/editor_plugin.cpp b/editor/editor_plugin.cpp index d508638acd..7f02148dfc 100644 --- a/editor/editor_plugin.cpp +++ b/editor/editor_plugin.cpp @@ -712,9 +712,6 @@ bool EditorPlugin::get_remove_list(List<Node *> *p_list) { return false; } -void EditorPlugin::restore_global_state() {} -void EditorPlugin::save_global_state() {} - void EditorPlugin::add_undo_redo_inspector_hook_callback(Callable p_callable) { EditorNode::get_singleton()->get_editor_data().add_undo_redo_inspector_hook_callback(p_callable); } diff --git a/editor/editor_plugin.h b/editor/editor_plugin.h index b79d2de035..a5a17acdf1 100644 --- a/editor/editor_plugin.h +++ b/editor/editor_plugin.h @@ -278,9 +278,6 @@ public: void make_bottom_panel_item_visible(Control *p_item); void hide_bottom_panel(); - virtual void restore_global_state(); - virtual void save_global_state(); - void add_translation_parser_plugin(const Ref<EditorTranslationParserPlugin> &p_parser); void remove_translation_parser_plugin(const Ref<EditorTranslationParserPlugin> &p_parser); diff --git a/editor/editor_properties.cpp b/editor/editor_properties.cpp index 3bf320f580..46f52ec4af 100644 --- a/editor/editor_properties.cpp +++ b/editor/editor_properties.cpp @@ -342,12 +342,17 @@ void EditorPropertyTextEnum::_notification(int p_what) { } EditorPropertyTextEnum::EditorPropertyTextEnum() { + HBoxContainer *hb = memnew(HBoxContainer); + add_child(hb); + default_layout = memnew(HBoxContainer); - add_child(default_layout); + default_layout->set_h_size_flags(SIZE_EXPAND_FILL); + hb->add_child(default_layout); edit_custom_layout = memnew(HBoxContainer); + edit_custom_layout->set_h_size_flags(SIZE_EXPAND_FILL); edit_custom_layout->hide(); - add_child(edit_custom_layout); + hb->add_child(edit_custom_layout); option_button = memnew(OptionButton); option_button->set_h_size_flags(SIZE_EXPAND_FILL); diff --git a/editor/plugins/animation_blend_tree_editor_plugin.cpp b/editor/plugins/animation_blend_tree_editor_plugin.cpp index 14e3cb4b97..f5f9ec11b3 100644 --- a/editor/plugins/animation_blend_tree_editor_plugin.cpp +++ b/editor/plugins/animation_blend_tree_editor_plugin.cpp @@ -187,7 +187,7 @@ void AnimationNodeBlendTreeEditor::update_graph() { String base_path = AnimationTreeEditor::get_singleton()->get_base_path() + String(E) + "/" + F.name; EditorProperty *prop = EditorInspector::instantiate_property_editor(tree, F.type, base_path, F.hint, F.hint_string, F.usage); if (prop) { - prop->set_read_only(read_only); + prop->set_read_only(read_only || (F.usage & PROPERTY_USAGE_READ_ONLY)); prop->set_object_and_property(tree, base_path); prop->update_property(); prop->set_name_split_ratio(0); diff --git 
a/editor/plugins/script_editor_plugin.cpp b/editor/plugins/script_editor_plugin.cpp index 188abf1f5c..e515b46b1e 100644 --- a/editor/plugins/script_editor_plugin.cpp +++ b/editor/plugins/script_editor_plugin.cpp @@ -4007,12 +4007,6 @@ void ScriptEditorPlugin::apply_changes() { script_editor->apply_scripts(); } -void ScriptEditorPlugin::restore_global_state() { -} - -void ScriptEditorPlugin::save_global_state() { -} - void ScriptEditorPlugin::set_window_layout(Ref<ConfigFile> p_layout) { script_editor->set_window_layout(p_layout); } diff --git a/editor/plugins/script_editor_plugin.h b/editor/plugins/script_editor_plugin.h index d4c80c416b..988d07621c 100644 --- a/editor/plugins/script_editor_plugin.h +++ b/editor/plugins/script_editor_plugin.h @@ -542,9 +542,6 @@ public: virtual void save_external_data() override; virtual void apply_changes() override; - virtual void restore_global_state() override; - virtual void save_global_state() override; - virtual void set_window_layout(Ref<ConfigFile> p_layout) override; virtual void get_window_layout(Ref<ConfigFile> p_layout) override; diff --git a/modules/astcenc/SCsub b/modules/astcenc/SCsub new file mode 100644 index 0000000000..0f04f2bc28 --- /dev/null +++ b/modules/astcenc/SCsub @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +Import("env") +Import("env_modules") + +env_astcenc = env_modules.Clone() + +# Thirdparty source files + +thirdparty_obj = [] + +thirdparty_dir = "#thirdparty/astcenc/" +thirdparty_sources = [ + "astcenc_averages_and_directions.cpp", + "astcenc_block_sizes.cpp", + "astcenc_color_quantize.cpp", + "astcenc_color_unquantize.cpp", + "astcenc_compress_symbolic.cpp", + "astcenc_compute_variance.cpp", + "astcenc_decompress_symbolic.cpp", + "astcenc_diagnostic_trace.cpp", + "astcenc_entry.cpp", + "astcenc_find_best_partitioning.cpp", + "astcenc_ideal_endpoints_and_weights.cpp", + "astcenc_image.cpp", + "astcenc_integer_sequence.cpp", + "astcenc_mathlib.cpp", + "astcenc_mathlib_softfloat.cpp", + "astcenc_partition_tables.cpp", + "astcenc_percentile_tables.cpp", + "astcenc_pick_best_endpoint_format.cpp", + "astcenc_platform_isa_detection.cpp", + "astcenc_quantization.cpp", + "astcenc_symbolic_physical.cpp", + "astcenc_weight_align.cpp", + "astcenc_weight_quant_xfer_tables.cpp", +] +thirdparty_sources = [thirdparty_dir + file for file in thirdparty_sources] + +env_astcenc.Prepend(CPPPATH=[thirdparty_dir]) + +env_thirdparty = env_astcenc.Clone() +env_thirdparty.disable_warnings() +env_thirdparty.add_source_files(thirdparty_obj, thirdparty_sources) +env.modules_sources += thirdparty_obj + +# Godot source files + +module_obj = [] + +env_astcenc.add_source_files(module_obj, "*.cpp") +env.modules_sources += module_obj + +# Needed to force rebuilding the module files when the thirdparty library is updated. 
+env.Depends(module_obj, thirdparty_obj) diff --git a/modules/astcenc/config.py b/modules/astcenc/config.py new file mode 100644 index 0000000000..eb565b85b9 --- /dev/null +++ b/modules/astcenc/config.py @@ -0,0 +1,6 @@ +def can_build(env, platform): + return env.editor_build + + +def configure(env): + pass diff --git a/modules/astcenc/image_compress_astcenc.cpp b/modules/astcenc/image_compress_astcenc.cpp new file mode 100644 index 0000000000..ce10201343 --- /dev/null +++ b/modules/astcenc/image_compress_astcenc.cpp @@ -0,0 +1,251 @@ +/**************************************************************************/ +/* image_compress_astcenc.cpp */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#include "image_compress_astcenc.h" + +#include "core/os/os.h" +#include "core/string/print_string.h" + +#include <astcenc.h> + +void _compress_astc(Image *r_img, float p_lossy_quality, Image::ASTCFormat p_format) { + uint64_t start_time = OS::get_singleton()->get_ticks_msec(); + + // TODO: See how to handle lossy quality. + + Image::Format img_format = r_img->get_format(); + if (img_format >= Image::FORMAT_DXT1) { + return; // Do not compress, already compressed. + } + + bool is_hdr = false; + if ((img_format >= Image::FORMAT_RH) && (img_format <= Image::FORMAT_RGBE9995)) { + is_hdr = true; + r_img->convert(Image::FORMAT_RGBAF); + } else { + r_img->convert(Image::FORMAT_RGBA8); + } + + // Determine encoder output format from our enum. 
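Before the format selection that follows, `_compress_astc` above normalizes every uncompressed source to one of the two layouts handed to astcenc: 32-bit float RGBA for HDR input, 8-bit RGBA otherwise. Restated as a standalone predicate (a sketch, not patch code):

```cpp
// Condensed restatement of the LDR/HDR split in _compress_astc: formats in
// the RH..RGBE9995 range are treated as HDR and expanded to FORMAT_RGBAF;
// everything else is converted to FORMAT_RGBA8 before encoding.
#include "core/io/image.h"

static bool is_hdr_source(Image::Format p_format) {
	return p_format >= Image::FORMAT_RH && p_format <= Image::FORMAT_RGBE9995;
}
```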
+ + Image::Format target_format = Image::FORMAT_RGBA8; + astcenc_profile profile = ASTCENC_PRF_LDR; + unsigned int block_x = 4; + unsigned int block_y = 4; + + if (p_format == Image::ASTCFormat::ASTC_FORMAT_4x4) { + if (is_hdr) { + target_format = Image::FORMAT_ASTC_4x4_HDR; + profile = ASTCENC_PRF_HDR; + } else { + target_format = Image::FORMAT_ASTC_4x4; + } + } else if (p_format == Image::ASTCFormat::ASTC_FORMAT_8x8) { + if (is_hdr) { + target_format = Image::FORMAT_ASTC_8x8_HDR; + profile = ASTCENC_PRF_HDR; + } else { + target_format = Image::FORMAT_ASTC_8x8; + } + block_x = 8; + block_y = 8; + } + + // Compress image data and (if required) mipmaps. + + const bool mipmaps = r_img->has_mipmaps(); + int width = r_img->get_width(); + int height = r_img->get_height(); + + print_verbose(vformat("astcenc: Encoding image size %dx%d to format %s%s.", width, height, Image::get_format_name(target_format), mipmaps ? ", with mipmaps" : "")); + + // Initialize astcenc. + + astcenc_config config; + config.block_x = block_x; + config.block_y = block_y; + config.profile = profile; + const float quality = ASTCENC_PRE_MEDIUM; + + astcenc_error status = astcenc_config_init(profile, block_x, block_y, block_x, quality, 0, &config); + ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS, + vformat("astcenc: Configuration initialization failed: %s.", astcenc_get_error_string(status))); + + // Context allocation. + + astcenc_context *context; + const unsigned int thread_count = OS::get_singleton()->get_processor_count(); + + status = astcenc_context_alloc(&config, thread_count, &context); + ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS, + vformat("astcenc: Context allocation failed: %s.", astcenc_get_error_string(status))); + + // Compress image. + + Vector<uint8_t> image_data = r_img->get_data(); + uint8_t *slices = image_data.ptrw(); + + astcenc_image image; + image.dim_x = width; + image.dim_y = height; + image.dim_z = 1; + image.data_type = ASTCENC_TYPE_U8; + if (is_hdr) { + image.data_type = ASTCENC_TYPE_F32; + } + image.data = reinterpret_cast<void **>(&slices); + + // Compute the number of ASTC blocks in each dimension. + unsigned int block_count_x = (width + block_x - 1) / block_x; + unsigned int block_count_y = (height + block_y - 1) / block_y; + size_t comp_len = block_count_x * block_count_y * 16; + + Vector<uint8_t> compressed_data; + compressed_data.resize(comp_len); + compressed_data.fill(0); + + const astcenc_swizzle swizzle = { + ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A + }; + + status = astcenc_compress_image(context, &image, &swizzle, compressed_data.ptrw(), comp_len, 0); + ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS, + vformat("astcenc: ASTC image compression failed: %s.", astcenc_get_error_string(status))); + + // Replace original image with compressed one. + + r_img->set_data(width, height, mipmaps, target_format, compressed_data); + + print_verbose(vformat("astcenc: Encoding took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time))); +} + +void _decompress_astc(Image *r_img) { + uint64_t start_time = OS::get_singleton()->get_ticks_msec(); + + // Determine decompression parameters from image format. 
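The compressed-buffer sizing in `_compress_astc` above relies on ASTC's fixed 128-bit block: the output byte size depends only on how many blocks cover the image, with block counts rounded up. The same arithmetic, factored out as a sketch:

```cpp
// Sketch of the sizing math used above: ceil-divide each dimension by the
// block footprint, then multiply by 16 bytes (128 bits) per ASTC block.
#include <cstddef>

static size_t astc_compressed_size(unsigned int p_width, unsigned int p_height,
		unsigned int p_block_x, unsigned int p_block_y) {
	const unsigned int blocks_x = (p_width + p_block_x - 1) / p_block_x;
	const unsigned int blocks_y = (p_height + p_block_y - 1) / p_block_y;
	return size_t(blocks_x) * blocks_y * 16;
}
```

For a 1024x1024 source this yields 1 MiB with 4x4 blocks (8 bpp) and 256 KiB with 8x8 blocks (2 bpp), which is why the two footprints are exposed as the quality/size trade-off.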
+ + Image::Format img_format = r_img->get_format(); + bool is_hdr = false; + unsigned int block_x = 0; + unsigned int block_y = 0; + if (img_format == Image::FORMAT_ASTC_4x4) { + block_x = 4; + block_y = 4; + is_hdr = false; + } else if (img_format == Image::FORMAT_ASTC_4x4_HDR) { + block_x = 4; + block_y = 4; + is_hdr = true; + } else if (img_format == Image::FORMAT_ASTC_8x8) { + block_x = 8; + block_y = 8; + is_hdr = false; + } else if (img_format == Image::FORMAT_ASTC_8x8_HDR) { + block_x = 8; + block_y = 8; + is_hdr = true; + } else { + ERR_FAIL_MSG("astcenc: Cannot decompress Image with a non-ASTC format."); + } + + // Initialize astcenc. + + astcenc_profile profile = ASTCENC_PRF_LDR; + if (is_hdr) { + profile = ASTCENC_PRF_HDR; + } + astcenc_config config; + const float quality = ASTCENC_PRE_MEDIUM; + + astcenc_error status = astcenc_config_init(profile, block_x, block_y, block_x, quality, 0, &config); + ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS, + vformat("astcenc: Configuration initialization failed: %s.", astcenc_get_error_string(status))); + + // Context allocation. + + astcenc_context *context = nullptr; + const unsigned int thread_count = OS::get_singleton()->get_processor_count(); + + status = astcenc_context_alloc(&config, thread_count, &context); + ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS, + vformat("astcenc: Context allocation failed: %s.", astcenc_get_error_string(status))); + + // Decompress image. + + const bool mipmaps = r_img->has_mipmaps(); + int width = r_img->get_width(); + int height = r_img->get_height(); + + astcenc_image image; + image.dim_x = width; + image.dim_y = height; + image.dim_z = 1; + image.data_type = ASTCENC_TYPE_U8; + Image::Format target_format = Image::FORMAT_RGBA8; + if (is_hdr) { + target_format = Image::FORMAT_RGBAF; + image.data_type = ASTCENC_TYPE_F32; + } + + Vector<uint8_t> image_data = r_img->get_data(); + + Vector<uint8_t> new_image_data; + new_image_data.resize(Image::get_image_data_size(width, height, target_format, false)); + new_image_data.fill(0); + uint8_t *slices = new_image_data.ptrw(); + image.data = reinterpret_cast<void **>(&slices); + + const astcenc_swizzle swizzle = { + ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A + }; + + status = astcenc_decompress_image(context, image_data.ptr(), image_data.size(), &image, &swizzle, 0); + ERR_FAIL_COND_MSG(status != ASTCENC_SUCCESS, + vformat("astcenc: ASTC decompression failed: %s.", astcenc_get_error_string(status))); + ERR_FAIL_COND_MSG(image.dim_z > 1, + "astcenc: ASTC decompression failed because this is a 3D texture, which is not supported."); + + // Replace original image with decompressed one.
+ + Image::Format image_format = Image::FORMAT_RGBA8; + if (image.data_type == ASTCENC_TYPE_F32) { + image_format = Image::FORMAT_RGBAF; + } else if (image.data_type == ASTCENC_TYPE_U8) { + image_format = Image::FORMAT_RGBA8; + } else if (image.data_type == ASTCENC_TYPE_F16) { + image_format = Image::FORMAT_RGBAH; + } else { + ERR_FAIL_MSG("astcenc: ASTC decompression failed with an unknown format."); + } + + r_img->set_data(image.dim_x, image.dim_y, mipmaps, image_format, new_image_data); + + print_verbose(vformat("astcenc: Decompression took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time))); +} diff --git a/modules/astcenc/image_compress_astcenc.h b/modules/astcenc/image_compress_astcenc.h new file mode 100644 index 0000000000..a197a91e0d --- /dev/null +++ b/modules/astcenc/image_compress_astcenc.h @@ -0,0 +1,39 @@ +/**************************************************************************/ +/* image_compress_astcenc.h */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#ifndef IMAGE_COMPRESS_ASTCENC_H +#define IMAGE_COMPRESS_ASTCENC_H + +#include "core/io/image.h" + +void _compress_astc(Image *r_img, float p_lossy_quality, Image::ASTCFormat p_format); +void _decompress_astc(Image *r_img); + +#endif // IMAGE_COMPRESS_ASTCENC_H diff --git a/modules/astcenc/register_types.cpp b/modules/astcenc/register_types.cpp new file mode 100644 index 0000000000..0bb1c3432f --- /dev/null +++ b/modules/astcenc/register_types.cpp @@ -0,0 +1,48 @@ +/**************************************************************************/ +/* register_types.cpp */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. 
*/ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#include "register_types.h" + +#include "image_compress_astcenc.h" + +void initialize_astcenc_module(ModuleInitializationLevel p_level) { + if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) { + return; + } + + Image::_image_compress_astc_func = _compress_astc; + Image::_image_decompress_astc = _decompress_astc; +} + +void uninitialize_astcenc_module(ModuleInitializationLevel p_level) { + if (p_level != MODULE_INITIALIZATION_LEVEL_SCENE) { + return; + } +} diff --git a/modules/astcenc/register_types.h b/modules/astcenc/register_types.h new file mode 100644 index 0000000000..636da9ff8b --- /dev/null +++ b/modules/astcenc/register_types.h @@ -0,0 +1,39 @@ +/**************************************************************************/ +/* register_types.h */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/**************************************************************************/ + +#ifndef ASTCENC_REGISTER_TYPES_H +#define ASTCENC_REGISTER_TYPES_H + +#include "modules/register_module_types.h" + +void initialize_astcenc_module(ModuleInitializationLevel p_level); +void uninitialize_astcenc_module(ModuleInitializationLevel p_level); + +#endif // ASTCENC_REGISTER_TYPES_H diff --git a/modules/etcpak/image_compress_etcpak.cpp b/modules/etcpak/image_compress_etcpak.cpp index b5192bd664..a6aeec54cc 100644 --- a/modules/etcpak/image_compress_etcpak.cpp +++ b/modules/etcpak/image_compress_etcpak.cpp @@ -33,8 +33,8 @@ #include "core/os/os.h" #include "core/string/print_string.h" -#include "thirdparty/etcpak/ProcessDxtc.hpp" -#include "thirdparty/etcpak/ProcessRGB.hpp" +#include <ProcessDxtc.hpp> +#include <ProcessRGB.hpp> EtcpakType _determine_etc_type(Image::UsedChannels p_channels) { switch (p_channels) { @@ -130,7 +130,7 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua } else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5) { target_format = Image::FORMAT_DXT5; } else { - ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format."); + ERR_FAIL_MSG("Invalid or unsupported etcpak compression format, not ETC or DXT."); } // Compress image data and (if required) mipmaps. @@ -171,7 +171,7 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua const uint8_t *src_read = r_img->get_data().ptr(); - print_verbose(vformat("ETCPAK: Encoding image size %dx%d to format %s.", width, height, Image::get_format_name(target_format))); + print_verbose(vformat("etcpak: Encoding image size %dx%d to format %s%s.", width, height, Image::get_format_name(target_format), mipmaps ? ", with mipmaps" : "")); int dest_size = Image::get_image_data_size(width, height, target_format, mipmaps); Vector<uint8_t> dest_data; @@ -232,12 +232,12 @@ void _compress_etcpak(EtcpakType p_compresstype, Image *r_img, float p_lossy_qua } else if (p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5 || p_compresstype == EtcpakType::ETCPAK_TYPE_DXT5_RA_AS_RG) { CompressDxt5(src_mip_read, dest_mip_write, blocks, mip_w); } else { - ERR_FAIL_MSG("Invalid or unsupported Etcpak compression format."); + ERR_FAIL_MSG("etcpak: Invalid or unsupported compression format."); } } // Replace original image with compressed one. r_img->set_data(width, height, mipmaps, target_format, dest_data); - print_verbose(vformat("ETCPAK encode took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time))); + print_verbose(vformat("etcpak: Encoding took %s ms.", rtos(OS::get_singleton()->get_ticks_msec() - start_time))); } diff --git a/modules/ogg/ogg_packet_sequence.cpp b/modules/ogg/ogg_packet_sequence.cpp index 0acaaf5fc9..d473f3b4a0 100644 --- a/modules/ogg/ogg_packet_sequence.cpp +++ b/modules/ogg/ogg_packet_sequence.cpp @@ -136,6 +136,8 @@ bool OggPacketSequencePlayback::next_ogg_packet(ogg_packet **p_packet) const { ERR_FAIL_COND_V(data_version != ogg_packet_sequence->data_version, false); ERR_FAIL_COND_V(ogg_packet_sequence->page_data.is_empty(), false); ERR_FAIL_COND_V(ogg_packet_sequence->page_granule_positions.is_empty(), false); + ERR_FAIL_COND_V(page_cursor >= ogg_packet_sequence->page_data.size(), false); + // Move on to the next page if need be. This happens first to help simplify seek logic. 
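The `register_types.cpp` hunk above wires the module in through `Image`'s static function pointers rather than a class registration, and `config.py` compiles it for editor builds only. A hypothetical call site (the null guard and the `try_astc_compress` helper are illustrative; only the pointer and enum names come from the patch):

```cpp
// Illustrative consumer of the function pointers set in register_types.cpp.
// In export templates the astcenc module is absent, so the pointer stays null.
#include "core/io/image.h"

Error try_astc_compress(Image *p_image, float p_lossy_quality) {
	if (!Image::_image_compress_astc_func) {
		return ERR_UNAVAILABLE; // Module not compiled in (config.py: editor builds only).
	}
	Image::_image_compress_astc_func(p_image, p_lossy_quality, Image::ASTCFormat::ASTC_FORMAT_4x4);
	return OK;
}
```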
while (packet_cursor >= ogg_packet_sequence->page_data[page_cursor].size()) { packet_cursor = 0; diff --git a/modules/text_server_adv/text_server_adv.cpp b/modules/text_server_adv/text_server_adv.cpp index 8e9ff61ad0..79ca4a7024 100644 --- a/modules/text_server_adv/text_server_adv.cpp +++ b/modules/text_server_adv/text_server_adv.cpp @@ -3654,6 +3654,7 @@ void TextServerAdvanced::full_copy(ShapedTextDataAdvanced *p_shaped) { RID TextServerAdvanced::_create_shaped_text(TextServer::Direction p_direction, TextServer::Orientation p_orientation) { _THREAD_SAFE_METHOD_ + ERR_FAIL_COND_V_MSG(p_direction == DIRECTION_INHERITED, RID(), "Invalid text direction."); ShapedTextDataAdvanced *sd = memnew(ShapedTextDataAdvanced); sd->hb_buffer = hb_buffer_create(); @@ -3679,6 +3680,7 @@ void TextServerAdvanced::_shaped_text_clear(const RID &p_shaped) { void TextServerAdvanced::_shaped_text_set_direction(const RID &p_shaped, TextServer::Direction p_direction) { ShapedTextDataAdvanced *sd = shaped_owner.get_or_null(p_shaped); + ERR_FAIL_COND_MSG(p_direction == DIRECTION_INHERITED, "Invalid text direction."); ERR_FAIL_COND(!sd); MutexLock lock(sd->mutex); @@ -3738,8 +3740,12 @@ void TextServerAdvanced::_shaped_text_set_bidi_override(const RID &p_shaped, con } sd->bidi_override.clear(); for (int i = 0; i < p_override.size(); i++) { - if (p_override[i].get_type() == Variant::VECTOR2I) { - sd->bidi_override.push_back(p_override[i]); + if (p_override[i].get_type() == Variant::VECTOR3I) { + const Vector3i &r = p_override[i]; + sd->bidi_override.push_back(r); + } else if (p_override[i].get_type() == Variant::VECTOR2I) { + const Vector2i &r = p_override[i]; + sd->bidi_override.push_back(Vector3i(r.x, r.y, DIRECTION_INHERITED)); } } invalidate(sd, false); @@ -5544,8 +5550,31 @@ bool TextServerAdvanced::_shaped_text_shape(const RID &p_shaped) { sd->script_iter = memnew(ScriptIterator(sd->text, 0, sd->text.length())); } + int base_para_direction = UBIDI_DEFAULT_LTR; + switch (sd->direction) { + case DIRECTION_LTR: { + sd->para_direction = DIRECTION_LTR; + base_para_direction = UBIDI_LTR; + } break; + case DIRECTION_RTL: { + sd->para_direction = DIRECTION_RTL; + base_para_direction = UBIDI_RTL; + } break; + case DIRECTION_INHERITED: + case DIRECTION_AUTO: { + UBiDiDirection direction = ubidi_getBaseDirection(data, sd->utf16.length()); + if (direction != UBIDI_NEUTRAL) { + sd->para_direction = (direction == UBIDI_RTL) ? 
DIRECTION_RTL : DIRECTION_LTR; + base_para_direction = direction; + } else { + sd->para_direction = DIRECTION_LTR; + base_para_direction = UBIDI_DEFAULT_LTR; + } + } break; + } + if (sd->bidi_override.is_empty()) { - sd->bidi_override.push_back(Vector2i(sd->start, sd->end)); + sd->bidi_override.push_back(Vector3i(sd->start, sd->end, DIRECTION_INHERITED)); } for (int ov = 0; ov < sd->bidi_override.size(); ov++) { @@ -5561,23 +5590,22 @@ bool TextServerAdvanced::_shaped_text_shape(const RID &p_shaped) { UBiDi *bidi_iter = ubidi_openSized(end, 0, &err); ERR_FAIL_COND_V_MSG(U_FAILURE(err), false, u_errorName(err)); - switch (sd->direction) { + switch (static_cast<TextServer::Direction>(sd->bidi_override[ov].z)) { case DIRECTION_LTR: { ubidi_setPara(bidi_iter, data + start, end - start, UBIDI_LTR, nullptr, &err); - sd->para_direction = DIRECTION_LTR; } break; case DIRECTION_RTL: { ubidi_setPara(bidi_iter, data + start, end - start, UBIDI_RTL, nullptr, &err); - sd->para_direction = DIRECTION_RTL; + } break; + case DIRECTION_INHERITED: { + ubidi_setPara(bidi_iter, data + start, end - start, base_para_direction, nullptr, &err); } break; case DIRECTION_AUTO: { UBiDiDirection direction = ubidi_getBaseDirection(data + start, end - start); if (direction != UBIDI_NEUTRAL) { ubidi_setPara(bidi_iter, data + start, end - start, direction, nullptr, &err); - sd->para_direction = (direction == UBIDI_RTL) ? DIRECTION_RTL : DIRECTION_LTR; } else { ubidi_setPara(bidi_iter, data + start, end - start, UBIDI_DEFAULT_LTR, nullptr, &err); - sd->para_direction = DIRECTION_LTR; } } break; } diff --git a/modules/text_server_adv/text_server_adv.h b/modules/text_server_adv/text_server_adv.h index 5920ddaa50..c7fe46d554 100644 --- a/modules/text_server_adv/text_server_adv.h +++ b/modules/text_server_adv/text_server_adv.h @@ -499,7 +499,7 @@ class TextServerAdvanced : public TextServerExtension { /* Intermediate data */ Char16String utf16; Vector<UBiDi *> bidi_iter; - Vector<Vector2i> bidi_override; + Vector<Vector3i> bidi_override; ScriptIterator *script_iter = nullptr; hb_buffer_t *hb_buffer = nullptr; diff --git a/modules/text_server_fb/text_server_fb.cpp b/modules/text_server_fb/text_server_fb.cpp index ece34f56d6..b5d7d3a3cf 100644 --- a/modules/text_server_fb/text_server_fb.cpp +++ b/modules/text_server_fb/text_server_fb.cpp @@ -51,7 +51,6 @@ using namespace godot; #include "core/error/error_macros.h" #include "core/string/print_string.h" #include "core/string/translation.h" -#include "core/string/ucaps.h" #include "modules/modules_enabled.gen.h" // For freetype, msdfgen, svg. 
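The `_shaped_text_set_bidi_override` hunk above keeps old callers working: legacy `Vector2i` ranges are widened with `DIRECTION_INHERITED` (introduced by this patch) so they defer to the paragraph's base direction. The conversion, isolated as a sketch:

```cpp
// Mirror of the compatibility widening above: (start, end) pairs become
// (start, end, DIRECTION_INHERITED) triples, so pre-existing overrides keep
// following the base paragraph direction.
#include "servers/text_server.h"

Vector3i widen_bidi_override(const Variant &p_range) {
	if (p_range.get_type() == Variant::VECTOR3I) {
		return p_range; // Already in the new encoding.
	}
	const Vector2i r = p_range;
	return Vector3i(r.x, r.y, TextServer::DIRECTION_INHERITED);
}
```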
@@ -2665,6 +2664,7 @@ void TextServerFallback::full_copy(ShapedTextDataFallback *p_shaped) { RID TextServerFallback::_create_shaped_text(TextServer::Direction p_direction, TextServer::Orientation p_orientation) { _THREAD_SAFE_METHOD_ + ERR_FAIL_COND_V_MSG(p_direction == DIRECTION_INHERITED, RID(), "Invalid text direction."); ShapedTextDataFallback *sd = memnew(ShapedTextDataFallback); sd->direction = p_direction; @@ -2688,6 +2688,7 @@ void TextServerFallback::_shaped_text_clear(const RID &p_shaped) { } void TextServerFallback::_shaped_text_set_direction(const RID &p_shaped, TextServer::Direction p_direction) { + ERR_FAIL_COND_MSG(p_direction == DIRECTION_INHERITED, "Invalid text direction."); if (p_direction == DIRECTION_RTL) { ERR_PRINT_ONCE("Right-to-left layout is not supported by this text server."); } @@ -4060,31 +4061,11 @@ double TextServerFallback::_shaped_text_get_underline_thickness(const RID &p_sha } String TextServerFallback::_string_to_upper(const String &p_string, const String &p_language) const { - String upper = p_string; - - for (int i = 0; i <= upper.length(); i++) { - const char32_t s = upper[i]; - const char32_t t = _find_upper(s); - if (s != t) { // avoid copy on write - upper[i] = t; - } - } - - return upper; + return p_string.to_upper(); } String TextServerFallback::_string_to_lower(const String &p_string, const String &p_language) const { - String lower = p_string; - - for (int i = 0; i <= lower.length(); i++) { - const char32_t s = lower[i]; - const char32_t t = _find_lower(s); - if (s != t) { // avoid copy on write - lower[i] = t; - } - } - - return lower; + return p_string.to_lower(); } PackedInt32Array TextServerFallback::_string_get_word_breaks(const String &p_string, const String &p_language, int p_chars_per_line) const { diff --git a/scene/3d/label_3d.cpp b/scene/3d/label_3d.cpp index f8c54809da..d0f71768d2 100644 --- a/scene/3d/label_3d.cpp +++ b/scene/3d/label_3d.cpp @@ -442,7 +442,7 @@ void Label3D::_shape() { TS->shaped_text_set_spacing(text_rid, TextServer::SpacingType(i), font->get_spacing(TextServer::SpacingType(i))); } - TypedArray<Vector2i> stt; + TypedArray<Vector3i> stt; if (st_parser == TextServer::STRUCTURED_TEXT_CUSTOM) { GDVIRTUAL_CALL(_structured_text_parser, st_args, txt, stt); } else { diff --git a/scene/3d/label_3d.h b/scene/3d/label_3d.h index 8fc772e4b0..96cc941209 100644 --- a/scene/3d/label_3d.h +++ b/scene/3d/label_3d.h @@ -143,7 +143,7 @@ private: void _generate_glyph_surfaces(const Glyph &p_glyph, Vector2 &r_offset, const Color &p_modulate, int p_priority = 0, int p_outline_size = 0); protected: - GDVIRTUAL2RC(Array, _structured_text_parser, Array, String) + GDVIRTUAL2RC(TypedArray<Vector3i>, _structured_text_parser, Array, String) void _notification(int p_what); diff --git a/scene/animation/animation_blend_tree.cpp b/scene/animation/animation_blend_tree.cpp index 1ef0774828..e074927b9b 100644 --- a/scene/animation/animation_blend_tree.cpp +++ b/scene/animation/animation_blend_tree.cpp @@ -229,15 +229,17 @@ AnimationNodeSync::AnimationNodeSync() { //////////////////////////////////////////////////////// void AnimationNodeOneShot::get_parameter_list(List<PropertyInfo> *r_list) const { - r_list->push_back(PropertyInfo(Variant::BOOL, active)); - r_list->push_back(PropertyInfo(Variant::BOOL, prev_active, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); + r_list->push_back(PropertyInfo(Variant::BOOL, active, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_EDITOR | PROPERTY_USAGE_STORAGE | PROPERTY_USAGE_READ_ONLY)); + 
r_list->push_back(PropertyInfo(Variant::INT, request, PROPERTY_HINT_ENUM, ",Fire,Abort")); r_list->push_back(PropertyInfo(Variant::FLOAT, time, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); r_list->push_back(PropertyInfo(Variant::FLOAT, remaining, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); r_list->push_back(PropertyInfo(Variant::FLOAT, time_to_restart, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); } Variant AnimationNodeOneShot::get_parameter_default_value(const StringName &p_parameter) const { - if (p_parameter == active || p_parameter == prev_active) { + if (p_parameter == request) { + return ONE_SHOT_REQUEST_NONE; + } else if (p_parameter == active) { return false; } else if (p_parameter == time_to_restart) { return -1; @@ -246,6 +248,13 @@ Variant AnimationNodeOneShot::get_parameter_default_value(const StringName &p_pa } } +bool AnimationNodeOneShot::is_parameter_read_only(const StringName &p_parameter) const { + if (p_parameter == active) { + return true; + } + return false; +} + void AnimationNodeOneShot::set_fadein_time(double p_time) { fade_in = p_time; } @@ -303,41 +312,42 @@ bool AnimationNodeOneShot::has_filter() const { } double AnimationNodeOneShot::process(double p_time, bool p_seek, bool p_is_external_seeking) { + OneShotRequest cur_request = static_cast<OneShotRequest>((int)get_parameter(request)); bool cur_active = get_parameter(active); - bool cur_prev_active = get_parameter(prev_active); double cur_time = get_parameter(time); double cur_remaining = get_parameter(remaining); double cur_time_to_restart = get_parameter(time_to_restart); - if (!cur_active) { - //make it as if this node doesn't exist, pass input 0 by. - if (cur_prev_active) { - set_parameter(prev_active, false); - } + set_parameter(request, ONE_SHOT_REQUEST_NONE); + + bool do_start = cur_request == ONE_SHOT_REQUEST_FIRE; + if (cur_request == ONE_SHOT_REQUEST_ABORT) { + set_parameter(active, false); + set_parameter(time_to_restart, -1); + return blend_input(0, p_time, p_seek, p_is_external_seeking, 1.0, FILTER_IGNORE, sync); + } else if (!do_start && !cur_active) { if (cur_time_to_restart >= 0.0 && !p_seek) { cur_time_to_restart -= p_time; if (cur_time_to_restart < 0) { - //restart - set_parameter(active, true); - cur_active = true; + do_start = true; // Restart. 
} set_parameter(time_to_restart, cur_time_to_restart); } - - return blend_input(0, p_time, p_seek, p_is_external_seeking, 1.0, FILTER_IGNORE, sync); + if (!do_start) { + return blend_input(0, p_time, p_seek, p_is_external_seeking, 1.0, FILTER_IGNORE, sync); + } } bool os_seek = p_seek; - if (p_seek) { cur_time = p_time; } - bool do_start = !cur_prev_active; if (do_start) { cur_time = 0; os_seek = true; - set_parameter(prev_active, true); + set_parameter(request, ONE_SHOT_REQUEST_NONE); + set_parameter(active, true); } real_t blend; @@ -375,7 +385,6 @@ double AnimationNodeOneShot::process(double p_time, bool p_seek, bool p_is_exter cur_remaining = os_rem; if (cur_remaining <= 0) { set_parameter(active, false); - set_parameter(prev_active, false); if (autorestart) { double restart_sec = autorestart_delay + Math::randd() * autorestart_random_delay; set_parameter(time_to_restart, restart_sec); @@ -419,6 +428,10 @@ void AnimationNodeOneShot::_bind_methods() { ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "autorestart_delay", PROPERTY_HINT_RANGE, "0,60,0.01,or_greater,suffix:s"), "set_autorestart_delay", "get_autorestart_delay"); ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "autorestart_random_delay", PROPERTY_HINT_RANGE, "0,60,0.01,or_greater,suffix:s"), "set_autorestart_random_delay", "get_autorestart_random_delay"); + BIND_ENUM_CONSTANT(ONE_SHOT_REQUEST_NONE); + BIND_ENUM_CONSTANT(ONE_SHOT_REQUEST_FIRE); + BIND_ENUM_CONSTANT(ONE_SHOT_REQUEST_ABORT); + BIND_ENUM_CONSTANT(MIX_MODE_BLEND); BIND_ENUM_CONSTANT(MIX_MODE_ADD); } @@ -640,9 +653,10 @@ void AnimationNodeTransition::get_parameter_list(List<PropertyInfo> *r_list) con anims += inputs[i].name; } - r_list->push_back(PropertyInfo(Variant::INT, current, PROPERTY_HINT_ENUM, anims)); - r_list->push_back(PropertyInfo(Variant::INT, prev_current, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); - r_list->push_back(PropertyInfo(Variant::INT, prev, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); + r_list->push_back(PropertyInfo(Variant::STRING, current_state, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_READ_ONLY)); // For interface. + r_list->push_back(PropertyInfo(Variant::STRING, transition_request, PROPERTY_HINT_ENUM, anims)); // For transition request. It will be cleared after setting the value immediately. + r_list->push_back(PropertyInfo(Variant::INT, current_index, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_STORAGE | PROPERTY_USAGE_READ_ONLY)); // To avoid finding the index every frame, use this internally. 
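With the hunks above, `AnimationNodeOneShot` is driven by a one-frame `request` parameter instead of external writes to `active`/`prev_active`; the node consumes the request in `process()` and keeps `active` as read-only status. Hypothetical usage against a blend-tree node named "Shot" (the node name is made up; the path follows the usual `parameters/<node>/<property>` convention):

```cpp
// Hypothetical caller of the new request API; "Shot" is an assumed node name.
#include "scene/animation/animation_blend_tree.h"
#include "scene/animation/animation_tree.h"

void fire_one_shot(AnimationTree *p_tree) {
	// Fire the one-shot; the node clears the request on its next process().
	p_tree->set("parameters/Shot/request", AnimationNodeOneShot::ONE_SHOT_REQUEST_FIRE);
	// "active" is now read-only from the outside, but can still be polled.
	const bool shot_active = p_tree->get("parameters/Shot/active");
	(void)shot_active;
}
```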
+ r_list->push_back(PropertyInfo(Variant::INT, prev_index, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); r_list->push_back(PropertyInfo(Variant::FLOAT, time, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); r_list->push_back(PropertyInfo(Variant::FLOAT, prev_xfading, PROPERTY_HINT_NONE, "", PROPERTY_USAGE_NONE)); } @@ -650,13 +664,22 @@ void AnimationNodeTransition::get_parameter_list(List<PropertyInfo> *r_list) con Variant AnimationNodeTransition::get_parameter_default_value(const StringName &p_parameter) const { if (p_parameter == time || p_parameter == prev_xfading) { return 0.0; - } else if (p_parameter == prev || p_parameter == prev_current) { + } else if (p_parameter == prev_index) { return -1; + } else if (p_parameter == transition_request || p_parameter == current_state) { + return String(); } else { return 0; } } +bool AnimationNodeTransition::is_parameter_read_only(const StringName &p_parameter) const { + if (p_parameter == current_state || p_parameter == current_index) { + return true; + } + return false; +} + String AnimationNodeTransition::get_caption() const { return "Transition"; } @@ -702,6 +725,17 @@ String AnimationNodeTransition::get_input_caption(int p_input) const { return inputs[p_input].name; } +int AnimationNodeTransition::find_input_caption(const String &p_name) const { + int idx = -1; + for (int i = 0; i < MAX_INPUTS; i++) { + if (inputs[i].name == p_name) { + idx = i; + break; + } + } + return idx; +} + void AnimationNodeTransition::set_xfade_time(double p_fade) { xfade_time = p_fade; } @@ -718,35 +752,62 @@ Ref<Curve> AnimationNodeTransition::get_xfade_curve() const { return xfade_curve; } -void AnimationNodeTransition::set_from_start(bool p_from_start) { - from_start = p_from_start; +void AnimationNodeTransition::set_reset(bool p_reset) { + reset = p_reset; } -bool AnimationNodeTransition::is_from_start() const { - return from_start; +bool AnimationNodeTransition::is_reset() const { + return reset; } double AnimationNodeTransition::process(double p_time, bool p_seek, bool p_is_external_seeking) { - int cur_current = get_parameter(current); - int cur_prev = get_parameter(prev); - int cur_prev_current = get_parameter(prev_current); + String cur_transition_request = get_parameter(transition_request); + int cur_current_index = get_parameter(current_index); + int cur_prev_index = get_parameter(prev_index); double cur_time = get_parameter(time); double cur_prev_xfading = get_parameter(prev_xfading); - bool switched = cur_current != cur_prev_current; + bool switched = false; + bool restart = false; + + if (!cur_transition_request.is_empty()) { + int new_idx = find_input_caption(cur_transition_request); + if (new_idx >= 0) { + if (cur_current_index == new_idx) { + // Transition to same state. + restart = reset; + cur_prev_xfading = 0; + set_parameter(prev_xfading, 0); + cur_prev_index = -1; + set_parameter(prev_index, -1); + } else { + switched = true; + cur_prev_index = cur_current_index; + set_parameter(prev_index, cur_current_index); + } + cur_current_index = new_idx; + set_parameter(current_index, cur_current_index); + set_parameter(current_state, cur_transition_request); + } else { + ERR_PRINT("No such input: '" + cur_transition_request + "'"); + } + cur_transition_request = String(); + set_parameter(transition_request, cur_transition_request); + } - if (switched) { - set_parameter(prev_current, cur_current); - set_parameter(prev, cur_prev_current); + // Special case for restart. 
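`AnimationNodeTransition` gets the same treatment in the hunks above: writes go through a transient `transition_request` string, matched against input captions via the new `find_input_caption()`, while `current_state` and `current_index` become read-only mirrors maintained by the node. A hypothetical request, assuming a node named "Trans" with an input captioned "run":

```cpp
// Hypothetical caller of the reworked Transition node; "Trans" and the
// caption "run" are assumed names, not from the patch.
#include "scene/animation/animation_tree.h"

void switch_to_run(AnimationTree *p_tree) {
	p_tree->set("parameters/Trans/transition_request", "run");
	// An unknown caption is reported via ERR_PRINT and the request dropped;
	// requesting the current state restarts it when "reset" is enabled.
}
```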
+ if (restart) { + set_parameter(time, 0); + return blend_input(cur_current_index, 0, true, p_is_external_seeking, 1.0, FILTER_IGNORE, true); + } - cur_prev = cur_prev_current; + if (switched) { cur_prev_xfading = xfade_time; cur_time = 0; - switched = true; } - if (cur_current < 0 || cur_current >= enabled_inputs || cur_prev >= enabled_inputs) { + if (cur_current_index < 0 || cur_current_index >= enabled_inputs || cur_prev_index >= enabled_inputs) { return 0; } @@ -754,15 +815,15 @@ double AnimationNodeTransition::process(double p_time, bool p_seek, bool p_is_ex if (sync) { for (int i = 0; i < enabled_inputs; i++) { - if (i != cur_current && i != cur_prev) { + if (i != cur_current_index && i != cur_prev_index) { blend_input(i, p_time, p_seek, p_is_external_seeking, 0, FILTER_IGNORE, true); } } } - if (cur_prev < 0) { // process current animation, check for transition + if (cur_prev_index < 0) { // process current animation, check for transition - rem = blend_input(cur_current, p_time, p_seek, p_is_external_seeking, 1.0, FILTER_IGNORE, true); + rem = blend_input(cur_current_index, p_time, p_seek, p_is_external_seeking, 1.0, FILTER_IGNORE, true); if (p_seek) { cur_time = p_time; @@ -770,8 +831,8 @@ double AnimationNodeTransition::process(double p_time, bool p_seek, bool p_is_ex cur_time += p_time; } - if (inputs[cur_current].auto_advance && rem <= xfade_time) { - set_parameter(current, (cur_current + 1) % enabled_inputs); + if (inputs[cur_current_index].auto_advance && rem <= xfade_time) { + set_parameter(transition_request, get_input_caption((cur_current_index + 1) % enabled_inputs)); } } else { // cross-fading from prev to current @@ -783,21 +844,21 @@ double AnimationNodeTransition::process(double p_time, bool p_seek, bool p_is_ex // Blend values must be more than CMP_EPSILON to process discrete keys in edge. real_t blend_inv = 1.0 - blend; - if (from_start && !p_seek && switched) { //just switched, seek to start of current - rem = blend_input(cur_current, 0, true, p_is_external_seeking, Math::is_zero_approx(blend_inv) ? CMP_EPSILON : blend_inv, FILTER_IGNORE, true); + if (reset && !p_seek && switched) { //just switched, seek to start of current + rem = blend_input(cur_current_index, 0, true, p_is_external_seeking, Math::is_zero_approx(blend_inv) ? CMP_EPSILON : blend_inv, FILTER_IGNORE, true); } else { - rem = blend_input(cur_current, p_time, p_seek, p_is_external_seeking, Math::is_zero_approx(blend_inv) ? CMP_EPSILON : blend_inv, FILTER_IGNORE, true); + rem = blend_input(cur_current_index, p_time, p_seek, p_is_external_seeking, Math::is_zero_approx(blend_inv) ? CMP_EPSILON : blend_inv, FILTER_IGNORE, true); } if (p_seek) { - blend_input(cur_prev, p_time, true, p_is_external_seeking, Math::is_zero_approx(blend) ? CMP_EPSILON : blend, FILTER_IGNORE, true); + blend_input(cur_prev_index, p_time, true, p_is_external_seeking, Math::is_zero_approx(blend) ? CMP_EPSILON : blend, FILTER_IGNORE, true); cur_time = p_time; } else { - blend_input(cur_prev, p_time, false, p_is_external_seeking, Math::is_zero_approx(blend) ? CMP_EPSILON : blend, FILTER_IGNORE, true); + blend_input(cur_prev_index, p_time, false, p_is_external_seeking, Math::is_zero_approx(blend) ? 
CMP_EPSILON : blend, FILTER_IGNORE, true); cur_time += p_time; cur_prev_xfading -= p_time; if (cur_prev_xfading < 0) { - set_parameter(prev, -1); + set_parameter(prev_index, -1); } } } @@ -829,6 +890,7 @@ void AnimationNodeTransition::_bind_methods() { ClassDB::bind_method(D_METHOD("set_input_caption", "input", "caption"), &AnimationNodeTransition::set_input_caption); ClassDB::bind_method(D_METHOD("get_input_caption", "input"), &AnimationNodeTransition::get_input_caption); + ClassDB::bind_method(D_METHOD("find_input_caption", "caption"), &AnimationNodeTransition::find_input_caption); ClassDB::bind_method(D_METHOD("set_xfade_time", "time"), &AnimationNodeTransition::set_xfade_time); ClassDB::bind_method(D_METHOD("get_xfade_time"), &AnimationNodeTransition::get_xfade_time); @@ -836,13 +898,13 @@ void AnimationNodeTransition::_bind_methods() { ClassDB::bind_method(D_METHOD("set_xfade_curve", "curve"), &AnimationNodeTransition::set_xfade_curve); ClassDB::bind_method(D_METHOD("get_xfade_curve"), &AnimationNodeTransition::get_xfade_curve); - ClassDB::bind_method(D_METHOD("set_from_start", "from_start"), &AnimationNodeTransition::set_from_start); - ClassDB::bind_method(D_METHOD("is_from_start"), &AnimationNodeTransition::is_from_start); + ClassDB::bind_method(D_METHOD("set_reset", "reset"), &AnimationNodeTransition::set_reset); + ClassDB::bind_method(D_METHOD("is_reset"), &AnimationNodeTransition::is_reset); ADD_PROPERTY(PropertyInfo(Variant::INT, "enabled_inputs", PROPERTY_HINT_RANGE, "0,64,1", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_UPDATE_ALL_IF_MODIFIED), "set_enabled_inputs", "get_enabled_inputs"); ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "xfade_time", PROPERTY_HINT_RANGE, "0,120,0.01,suffix:s"), "set_xfade_time", "get_xfade_time"); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "xfade_curve", PROPERTY_HINT_RESOURCE_TYPE, "Curve"), "set_xfade_curve", "get_xfade_curve"); - ADD_PROPERTY(PropertyInfo(Variant::BOOL, "from_start"), "set_from_start", "is_from_start"); + ADD_PROPERTY(PropertyInfo(Variant::BOOL, "reset"), "set_reset", "is_reset"); for (int i = 0; i < MAX_INPUTS; i++) { ADD_PROPERTYI(PropertyInfo(Variant::STRING, "input_" + itos(i) + "/name", PROPERTY_HINT_NONE, "", PROPERTY_USAGE_DEFAULT | PROPERTY_USAGE_INTERNAL), "set_input_caption", "get_input_caption", i); diff --git a/scene/animation/animation_blend_tree.h b/scene/animation/animation_blend_tree.h index e72471202e..a1969bb621 100644 --- a/scene/animation/animation_blend_tree.h +++ b/scene/animation/animation_blend_tree.h @@ -96,6 +96,12 @@ class AnimationNodeOneShot : public AnimationNodeSync { GDCLASS(AnimationNodeOneShot, AnimationNodeSync); public: + enum OneShotRequest { + ONE_SHOT_REQUEST_NONE, + ONE_SHOT_REQUEST_FIRE, + ONE_SHOT_REQUEST_ABORT, + }; + enum MixMode { MIX_MODE_BLEND, MIX_MODE_ADD @@ -110,13 +116,8 @@ private: double autorestart_random_delay = 0.0; MixMode mix = MIX_MODE_BLEND; - /* bool active; - bool do_start; - double time; - double remaining;*/ - + StringName request = PNAME("request"); StringName active = PNAME("active"); - StringName prev_active = "prev_active"; StringName time = "time"; StringName remaining = "remaining"; StringName time_to_restart = "time_to_restart"; @@ -127,6 +128,7 @@ protected: public: virtual void get_parameter_list(List<PropertyInfo> *r_list) const override; virtual Variant get_parameter_default_value(const StringName &p_parameter) const override; + virtual bool is_parameter_read_only(const StringName &p_parameter) const override; virtual String get_caption() const override; @@ 
-153,6 +155,7 @@ public: AnimationNodeOneShot(); }; +VARIANT_ENUM_CAST(AnimationNodeOneShot::OneShotRequest) VARIANT_ENUM_CAST(AnimationNodeOneShot::MixMode) class AnimationNodeAdd2 : public AnimationNodeSync { @@ -284,22 +287,19 @@ class AnimationNodeTransition : public AnimationNodeSync { InputData inputs[MAX_INPUTS]; int enabled_inputs = 0; - /* - double prev_xfading; - int prev; - double time; - int current; - int prev_current; */ - - StringName prev_xfading = "prev_xfading"; - StringName prev = "prev"; StringName time = "time"; - StringName current = PNAME("current"); - StringName prev_current = "prev_current"; + StringName prev_xfading = "prev_xfading"; + StringName prev_index = "prev_index"; + StringName current_index = PNAME("current_index"); + StringName current_state = PNAME("current_state"); + StringName transition_request = PNAME("transition_request"); + + StringName prev_frame_current = "pf_current"; + StringName prev_frame_current_idx = "pf_current_idx"; double xfade_time = 0.0; Ref<Curve> xfade_curve; - bool from_start = true; + bool reset = true; void _update_inputs(); @@ -310,6 +310,7 @@ protected: public: virtual void get_parameter_list(List<PropertyInfo> *r_list) const override; virtual Variant get_parameter_default_value(const StringName &p_parameter) const override; + virtual bool is_parameter_read_only(const StringName &p_parameter) const override; virtual String get_caption() const override; @@ -321,6 +322,7 @@ public: void set_input_caption(int p_input, const String &p_name); String get_input_caption(int p_input) const; + int find_input_caption(const String &p_name) const; void set_xfade_time(double p_fade); double get_xfade_time() const; @@ -328,8 +330,8 @@ public: void set_xfade_curve(const Ref<Curve> &p_curve); Ref<Curve> get_xfade_curve() const; - void set_from_start(bool p_from_start); - bool is_from_start() const; + void set_reset(bool p_reset); + bool is_reset() const; double process(double p_time, bool p_seek, bool p_is_external_seeking) override; diff --git a/scene/animation/animation_node_state_machine.cpp b/scene/animation/animation_node_state_machine.cpp index f5df64dbdd..2c79e5fe06 100644 --- a/scene/animation/animation_node_state_machine.cpp +++ b/scene/animation/animation_node_state_machine.cpp @@ -107,6 +107,15 @@ Ref<Curve> AnimationNodeStateMachineTransition::get_xfade_curve() const { return xfade_curve; } +void AnimationNodeStateMachineTransition::set_reset(bool p_reset) { + reset = p_reset; + emit_changed(); +} + +bool AnimationNodeStateMachineTransition::is_reset() const { + return reset; +} + void AnimationNodeStateMachineTransition::set_priority(int p_priority) { priority = p_priority; emit_changed(); @@ -132,6 +141,9 @@ void AnimationNodeStateMachineTransition::_bind_methods() { ClassDB::bind_method(D_METHOD("set_xfade_curve", "curve"), &AnimationNodeStateMachineTransition::set_xfade_curve); ClassDB::bind_method(D_METHOD("get_xfade_curve"), &AnimationNodeStateMachineTransition::get_xfade_curve); + ClassDB::bind_method(D_METHOD("set_reset", "reset"), &AnimationNodeStateMachineTransition::set_reset); + ClassDB::bind_method(D_METHOD("is_reset"), &AnimationNodeStateMachineTransition::is_reset); + ClassDB::bind_method(D_METHOD("set_priority", "priority"), &AnimationNodeStateMachineTransition::set_priority); ClassDB::bind_method(D_METHOD("get_priority"), &AnimationNodeStateMachineTransition::get_priority); @@ -140,6 +152,9 @@ void AnimationNodeStateMachineTransition::_bind_methods() { ADD_PROPERTY(PropertyInfo(Variant::FLOAT, "xfade_time", 
PROPERTY_HINT_RANGE, "0,240,0.01,suffix:s"), "set_xfade_time", "get_xfade_time"); ADD_PROPERTY(PropertyInfo(Variant::OBJECT, "xfade_curve", PROPERTY_HINT_RESOURCE_TYPE, "Curve"), "set_xfade_curve", "get_xfade_curve"); + + ADD_PROPERTY(PropertyInfo(Variant::BOOL, "reset"), "set_reset", "is_reset"); + ADD_PROPERTY(PropertyInfo(Variant::INT, "priority", PROPERTY_HINT_RANGE, "0,32,1"), "set_priority", "get_priority"); ADD_GROUP("Switch", ""); ADD_PROPERTY(PropertyInfo(Variant::INT, "switch_mode", PROPERTY_HINT_ENUM, "Immediate,Sync,At End"), "set_switch_mode", "get_switch_mode"); @@ -164,18 +179,27 @@ AnimationNodeStateMachineTransition::AnimationNodeStateMachineTransition() { //////////////////////////////////////////////////////// -void AnimationNodeStateMachinePlayback::travel(const StringName &p_state) { - start_request_travel = true; - start_request = p_state; +void AnimationNodeStateMachinePlayback::travel(const StringName &p_state, bool p_reset_on_teleport) { + travel_request = p_state; + reset_request_on_teleport = p_reset_on_teleport; stop_request = false; } -void AnimationNodeStateMachinePlayback::start(const StringName &p_state) { - start_request_travel = false; +void AnimationNodeStateMachinePlayback::start(const StringName &p_state, bool p_reset) { + travel_request = StringName(); + reset_request = p_reset; + _start(p_state); +} + +void AnimationNodeStateMachinePlayback::_start(const StringName &p_state) { start_request = p_state; stop_request = false; } +void AnimationNodeStateMachinePlayback::next() { + next_request = true; +} + void AnimationNodeStateMachinePlayback::stop() { stop_request = true; } @@ -212,7 +236,7 @@ bool AnimationNodeStateMachinePlayback::_travel(AnimationNodeStateMachine *p_sta path.clear(); //a new one will be needed if (current == p_travel) { - return true; //nothing to do + return false; // Will teleport oneself (restart). 
} Vector2 current_pos = p_state_machine->states[current].position; @@ -323,6 +347,15 @@ bool AnimationNodeStateMachinePlayback::_travel(AnimationNodeStateMachine *p_sta } double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_state_machine, double p_time, bool p_seek, bool p_is_external_seeking) { + double rem = _process(p_state_machine, p_time, p_seek, p_is_external_seeking); + start_request = StringName(); + next_request = false; + stop_request = false; + reset_request_on_teleport = false; + return rem; +} + +double AnimationNodeStateMachinePlayback::_process(AnimationNodeStateMachine *p_state_machine, double p_time, bool p_seek, bool p_is_external_seeking) { if (p_time == -1) { Ref<AnimationNodeStateMachine> anodesm = p_state_machine->states[current].node; if (anodesm.is_valid()) { @@ -335,14 +368,13 @@ double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_s //if not playing and it can restart, then restart if (!playing && start_request == StringName()) { if (!stop_request && p_state_machine->start_node) { - start(p_state_machine->start_node); + _start(p_state_machine->start_node); } else { return 0; } } if (playing && stop_request) { - stop_request = false; playing = false; return 0; } @@ -350,42 +382,45 @@ double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_s bool play_start = false; if (start_request != StringName()) { - if (start_request_travel) { - if (!playing) { - if (!stop_request && p_state_machine->start_node) { - // can restart, just postpone traveling - path.clear(); - current = p_state_machine->start_node; - playing = true; - play_start = true; - } else { - // stopped, invalid state - String node_name = start_request; - start_request = StringName(); //clear start request - ERR_FAIL_V_MSG(0, "Can't travel to '" + node_name + "' if state machine is not playing. Maybe you need to enable Autoplay on Load for one of the nodes in your state machine or call .start() first?"); - } - } else { - if (!_travel(p_state_machine, start_request)) { - // can't travel, then teleport - path.clear(); - current = start_request; - play_start = true; - } - start_request = StringName(); //clear start request - } + // teleport to start + if (p_state_machine->states.has(start_request)) { + path.clear(); + current = start_request; + playing = true; + play_start = true; } else { - // teleport to start - if (p_state_machine->states.has(start_request)) { + StringName node = start_request; + ERR_FAIL_V_MSG(0, "No such node: '" + node + "'"); + } + } else if (travel_request != StringName()) { + if (!playing) { + if (!stop_request && p_state_machine->start_node) { + // can restart, just postpone traveling path.clear(); - current = start_request; + current = p_state_machine->start_node; playing = true; play_start = true; - start_request = StringName(); //clear start request } else { - StringName node = start_request; - start_request = StringName(); //clear start request - ERR_FAIL_V_MSG(0, "No such node: '" + node + "'"); + // stopped, invalid state + String node_name = travel_request; + travel_request = StringName(); + ERR_FAIL_V_MSG(0, "Can't travel to '" + node_name + "' if state machine is not playing. 
Maybe you need to enable Autoplay on Load for one of the nodes in your state machine or call .start() first?"); } + } else { + if (!_travel(p_state_machine, travel_request)) { + // can't travel, then teleport + if (p_state_machine->states.has(travel_request)) { + path.clear(); + current = travel_request; + play_start = true; + reset_request = reset_request_on_teleport; + } else { + StringName node = travel_request; + travel_request = StringName(); + ERR_FAIL_V_MSG(0, "No such node: '" + node + "'"); + } + } + travel_request = StringName(); } } @@ -396,8 +431,11 @@ double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_s current = p_state_machine->start_node; } - len_current = p_state_machine->blend_node(current, p_state_machine->states[current].node, 0, true, p_is_external_seeking, 1.0, AnimationNode::FILTER_IGNORE, true); - pos_current = 0; + if (reset_request) { + len_current = p_state_machine->blend_node(current, p_state_machine->states[current].node, 0, true, p_is_external_seeking, 1.0, AnimationNode::FILTER_IGNORE, true); + pos_current = 0; + reset_request = false; + } } if (!p_state_machine->states.has(current)) { @@ -421,7 +459,8 @@ double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_s if (current_curve.is_valid()) { fade_blend = current_curve->sample(fade_blend); } - double rem = p_state_machine->blend_node(current, p_state_machine->states[current].node, p_time, p_seek, p_is_external_seeking, Math::is_zero_approx(fade_blend) ? CMP_EPSILON : fade_blend, AnimationNode::FILTER_IGNORE, true); // Blend values must be more than CMP_EPSILON to process discrete keys in edge. + + double rem = do_start ? len_current : p_state_machine->blend_node(current, p_state_machine->states[current].node, p_time, p_seek, p_is_external_seeking, Math::is_zero_approx(fade_blend) ? CMP_EPSILON : fade_blend, AnimationNode::FILTER_IGNORE, true); // Blend values must be more than CMP_EPSILON to process discrete keys in edge. 
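A recurring detail in the blend calls above: a weight of exactly zero would skip discrete-track keys at the edge of a cross-fade, so weights are floored at `CMP_EPSILON` whenever the input still has to be processed, as the inline comments note. Factored out as a sketch:

```cpp
// Restatement of the epsilon floor used by the blend calls above.
#include "core/math/math_funcs.h"

static real_t effective_blend(real_t p_blend) {
	// Keep the input processed (discrete keys included) even at ~zero weight.
	return Math::is_zero_approx(p_blend) ? (real_t)CMP_EPSILON : p_blend;
}
```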
if (fading_from != StringName()) { double fade_blend_inv = 1.0 - fade_blend; @@ -457,6 +496,7 @@ double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_s next_xfade = p_state_machine->transitions[i].transition->get_xfade_time(); current_curve = p_state_machine->transitions[i].transition->get_xfade_curve(); switch_mode = p_state_machine->transitions[i].transition->get_switch_mode(); + reset_request = p_state_machine->transitions[i].transition->is_reset(); next = path[0]; } } @@ -513,6 +553,7 @@ double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_s current_curve = p_state_machine->transitions[auto_advance_to].transition->get_xfade_curve(); next_xfade = p_state_machine->transitions[auto_advance_to].transition->get_xfade_time(); switch_mode = p_state_machine->transitions[auto_advance_to].transition->get_switch_mode(); + reset_request = p_state_machine->transitions[auto_advance_to].transition->is_reset(); } } @@ -567,7 +608,7 @@ double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_s goto_next = fading_from == StringName(); } - if (goto_next) { //end_loop should be used because fade time may be too small or zero and animation may have looped + if (next_request || goto_next) { //end_loop should be used because fade time may be too small or zero and animation may have looped if (next_xfade) { //time to fade, baby fading_from = current; @@ -591,7 +632,9 @@ double AnimationNodeStateMachinePlayback::process(AnimationNodeStateMachine *p_s current = next; - len_current = p_state_machine->blend_node(current, p_state_machine->states[current].node, 0, true, p_is_external_seeking, CMP_EPSILON, AnimationNode::FILTER_IGNORE, true); // Process next node's first key in here. + if (reset_request) { + len_current = p_state_machine->blend_node(current, p_state_machine->states[current].node, 0, true, p_is_external_seeking, CMP_EPSILON, AnimationNode::FILTER_IGNORE, true); // Process next node's first key in here. 
+ } if (switch_mode == AnimationNodeStateMachineTransition::SWITCH_MODE_SYNC) { pos_current = MIN(pos_current, len_current); p_state_machine->blend_node(current, p_state_machine->states[current].node, pos_current, true, p_is_external_seeking, 0, AnimationNode::FILTER_IGNORE, true); @@ -652,8 +695,9 @@ bool AnimationNodeStateMachinePlayback::_check_advance_condition(const Ref<Anima } void AnimationNodeStateMachinePlayback::_bind_methods() { - ClassDB::bind_method(D_METHOD("travel", "to_node"), &AnimationNodeStateMachinePlayback::travel); - ClassDB::bind_method(D_METHOD("start", "node"), &AnimationNodeStateMachinePlayback::start); + ClassDB::bind_method(D_METHOD("travel", "to_node", "reset_on_teleport"), &AnimationNodeStateMachinePlayback::travel, DEFVAL(true)); + ClassDB::bind_method(D_METHOD("start", "node", "reset"), &AnimationNodeStateMachinePlayback::start, DEFVAL(true)); + ClassDB::bind_method(D_METHOD("next"), &AnimationNodeStateMachinePlayback::next); ClassDB::bind_method(D_METHOD("stop"), &AnimationNodeStateMachinePlayback::stop); ClassDB::bind_method(D_METHOD("is_playing"), &AnimationNodeStateMachinePlayback::is_playing); ClassDB::bind_method(D_METHOD("get_current_node"), &AnimationNodeStateMachinePlayback::get_current_node); diff --git a/scene/animation/animation_node_state_machine.h b/scene/animation/animation_node_state_machine.h index 116589eb2f..8183d2025a 100644 --- a/scene/animation/animation_node_state_machine.h +++ b/scene/animation/animation_node_state_machine.h @@ -57,6 +57,7 @@ private: StringName advance_condition_name; float xfade_time = 0.0; Ref<Curve> xfade_curve; + bool reset = true; int priority = 1; String advance_expression; @@ -84,6 +85,9 @@ public: void set_xfade_time(float p_xfade); float get_xfade_time() const; + void set_reset(bool p_reset); + bool is_reset() const; + void set_xfade_curve(const Ref<Curve> &p_curve); Ref<Curve> get_xfade_curve() const; @@ -131,10 +135,15 @@ class AnimationNodeStateMachinePlayback : public Resource { bool playing = false; StringName start_request; - bool start_request_travel = false; + StringName travel_request; + bool reset_request = false; + bool reset_request_on_teleport = false; + bool next_request = false; bool stop_request = false; bool _travel(AnimationNodeStateMachine *p_state_machine, const StringName &p_travel); + void _start(const StringName &p_state); + double _process(AnimationNodeStateMachine *p_state_machine, double p_time, bool p_seek, bool p_is_external_seeking); double process(AnimationNodeStateMachine *p_state_machine, double p_time, bool p_seek, bool p_is_external_seeking); @@ -144,8 +153,9 @@ protected: static void _bind_methods(); public: - void travel(const StringName &p_state); - void start(const StringName &p_state); + void travel(const StringName &p_state, bool p_reset_on_teleport = true); + void start(const StringName &p_state, bool p_reset = true); + void next(); void stop(); bool is_playing() const; StringName get_current_node() const; diff --git a/scene/animation/animation_tree.cpp b/scene/animation/animation_tree.cpp index ab341797c7..dd00897422 100644 --- a/scene/animation/animation_tree.cpp +++ b/scene/animation/animation_tree.cpp @@ -54,13 +54,19 @@ Variant AnimationNode::get_parameter_default_value(const StringName &p_parameter return ret; } +bool AnimationNode::is_parameter_read_only(const StringName &p_parameter) const { + bool ret = false; + GDVIRTUAL_CALL(_is_parameter_read_only, p_parameter, ret); + return ret; +} + void AnimationNode::set_parameter(const StringName &p_name, const 
Variant &p_value) { ERR_FAIL_COND(!state); ERR_FAIL_COND(!state->tree->property_parent_map.has(base_path)); ERR_FAIL_COND(!state->tree->property_parent_map[base_path].has(p_name)); StringName path = state->tree->property_parent_map[base_path][p_name]; - state->tree->property_map[path] = p_value; + state->tree->property_map[path].first = p_value; } Variant AnimationNode::get_parameter(const StringName &p_name) const { @@ -69,7 +75,7 @@ Variant AnimationNode::get_parameter(const StringName &p_name) const { ERR_FAIL_COND_V(!state->tree->property_parent_map[base_path].has(p_name), Variant()); StringName path = state->tree->property_parent_map[base_path][p_name]; - return state->tree->property_map[path]; + return state->tree->property_map[path].first; } void AnimationNode::get_child_nodes(List<ChildNode> *r_child_nodes) { @@ -427,6 +433,7 @@ void AnimationNode::_bind_methods() { GDVIRTUAL_BIND(_get_parameter_list); GDVIRTUAL_BIND(_get_child_by_name, "name"); GDVIRTUAL_BIND(_get_parameter_default_value, "parameter"); + GDVIRTUAL_BIND(_is_parameter_read_only, "parameter"); GDVIRTUAL_BIND(_process, "time", "seek", "is_external_seeking"); GDVIRTUAL_BIND(_get_caption); GDVIRTUAL_BIND(_has_filter); @@ -1889,7 +1896,10 @@ void AnimationTree::_update_properties_for_node(const String &p_base_path, Ref<A StringName key = pinfo.name; if (!property_map.has(p_base_path + key)) { - property_map[p_base_path + key] = node->get_parameter_default_value(key); + Pair<Variant, bool> param; + param.first = node->get_parameter_default_value(key); + param.second = node->is_parameter_read_only(key); + property_map[p_base_path + key] = param; } property_parent_map[p_base_path][key] = p_base_path + key; @@ -1931,7 +1941,10 @@ bool AnimationTree::_set(const StringName &p_name, const Variant &p_value) { } if (property_map.has(p_name)) { - property_map[p_name] = p_value; + if (is_inside_tree() && property_map[p_name].second) { + return false; // Prevent the user from setting the property. + } + property_map[p_name].first = p_value; return true; } @@ -1944,7 +1957,7 @@ bool AnimationTree::_get(const StringName &p_name, Variant &r_ret) const { } if (property_map.has(p_name)) { - r_ret = property_map[p_name]; + r_ret = property_map[p_name].first; return true; } diff --git a/scene/animation/animation_tree.h b/scene/animation/animation_tree.h index 2c1be6199c..52d3e1bd41 100644 --- a/scene/animation/animation_tree.h +++ b/scene/animation/animation_tree.h @@ -117,6 +117,7 @@ protected: GDVIRTUAL0RC(Array, _get_parameter_list) GDVIRTUAL1RC(Ref<AnimationNode>, _get_child_by_name, StringName) GDVIRTUAL1RC(Variant, _get_parameter_default_value, StringName) + GDVIRTUAL1RC(bool, _is_parameter_read_only, StringName) GDVIRTUAL3RC(double, _process, double, bool, bool) GDVIRTUAL0RC(String, _get_caption) GDVIRTUAL0RC(bool, _has_filter) @@ -124,6 +125,7 @@ protected: public: virtual void get_parameter_list(List<PropertyInfo> *r_list) const; virtual Variant get_parameter_default_value(const StringName &p_parameter) const; + virtual bool is_parameter_read_only(const StringName &p_parameter) const; void set_parameter(const StringName &p_name, const Variant &p_value); Variant get_parameter(const StringName &p_name) const; @@ -304,7 +306,7 @@ private: void _update_properties(); List<PropertyInfo> properties; HashMap<StringName, HashMap<StringName, StringName>> property_parent_map; - HashMap<StringName, Variant> property_map; + HashMap<StringName, Pair<Variant, bool>> property_map; // Property value and read-only flag.
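The read-only flag stored alongside each value in `property_map` comes from the new `_is_parameter_read_only()` hook. A minimal sketch of a custom node opting in (the class name and the "elapsed" parameter are assumptions for illustration, not part of this commit):

```cpp
#include "scene/animation/animation_tree.h"

// Hypothetical node exposing one display-only parameter.
class AnimationNodeElapsed : public AnimationNode {
	GDCLASS(AnimationNodeElapsed, AnimationNode);

public:
	virtual void get_parameter_list(List<PropertyInfo> *r_list) const override {
		r_list->push_back(PropertyInfo(Variant::FLOAT, "elapsed"));
	}

	virtual Variant get_parameter_default_value(const StringName &p_parameter) const override {
		return 0.0;
	}

	virtual bool is_parameter_read_only(const StringName &p_parameter) const override {
		// AnimationTree::_set() now rejects user writes to this parameter while
		// the tree is inside the scene tree; set_parameter() still works because
		// it writes property_map[path].first directly.
		return p_parameter == StringName("elapsed");
	}
};
```

Note that the guard lives only in `_set()`, so internal updates through `set_parameter()` bypass it by design.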
struct Activity { uint64_t last_pass = 0; diff --git a/scene/gui/control.cpp b/scene/gui/control.cpp index 5930818763..fead0878fb 100644 --- a/scene/gui/control.cpp +++ b/scene/gui/control.cpp @@ -1582,10 +1582,6 @@ void Control::set_block_minimum_size_adjust(bool p_block) { data.block_minimum_size_adjust = p_block; } -bool Control::is_minimum_size_adjust_blocked() const { - return data.block_minimum_size_adjust; -} - Size2 Control::get_minimum_size() const { Vector2 ms; GDVIRTUAL_CALL(_get_minimum_size, ms); @@ -2769,9 +2765,9 @@ void Control::end_bulk_theme_override() { // Internationalization. -TypedArray<Vector2i> Control::structured_text_parser(TextServer::StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const { +TypedArray<Vector3i> Control::structured_text_parser(TextServer::StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const { if (p_parser_type == TextServer::STRUCTURED_TEXT_CUSTOM) { - TypedArray<Vector2i> ret; + TypedArray<Vector3i> ret; GDVIRTUAL_CALL(_structured_text_parser, p_args, p_text, ret); return ret; } else { diff --git a/scene/gui/control.h b/scene/gui/control.h index aaab9f530c..a93a88e5b4 100644 --- a/scene/gui/control.h +++ b/scene/gui/control.h @@ -145,7 +145,7 @@ public: TEXT_DIRECTION_AUTO = TextServer::DIRECTION_AUTO, TEXT_DIRECTION_LTR = TextServer::DIRECTION_LTR, TEXT_DIRECTION_RTL = TextServer::DIRECTION_RTL, - TEXT_DIRECTION_INHERITED, + TEXT_DIRECTION_INHERITED = TextServer::DIRECTION_INHERITED, }; private: @@ -330,7 +330,7 @@ protected: // Internationalization. - virtual TypedArray<Vector2i> structured_text_parser(TextServer::StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const; + virtual TypedArray<Vector3i> structured_text_parser(TextServer::StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const; // Base object overrides. @@ -340,7 +340,7 @@ protected: // Exposed virtual methods. 
GDVIRTUAL1RC(bool, _has_point, Vector2) - GDVIRTUAL2RC(TypedArray<Vector2i>, _structured_text_parser, Array, String) + GDVIRTUAL2RC(TypedArray<Vector3i>, _structured_text_parser, Array, String) GDVIRTUAL0RC(Vector2, _get_minimum_size) GDVIRTUAL1RC(Variant, _get_drag_data, Vector2) @@ -464,7 +464,6 @@ public: void update_minimum_size(); void set_block_minimum_size_adjust(bool p_block); - bool is_minimum_size_adjust_blocked() const; virtual Size2 get_minimum_size() const; virtual Size2 get_combined_minimum_size() const; diff --git a/scene/gui/item_list.cpp b/scene/gui/item_list.cpp index 372baeadae..25a27d5e1a 100644 --- a/scene/gui/item_list.cpp +++ b/scene/gui/item_list.cpp @@ -309,12 +309,6 @@ void ItemList::set_item_tag_icon(int p_idx, const Ref<Texture2D> &p_tag_icon) { shape_changed = true; } -Ref<Texture2D> ItemList::get_item_tag_icon(int p_idx) const { - ERR_FAIL_INDEX_V(p_idx, items.size(), Ref<Texture2D>()); - - return items[p_idx].tag_icon; -} - void ItemList::set_item_selectable(int p_idx, bool p_selectable) { if (p_idx < 0) { p_idx += get_item_count(); diff --git a/scene/gui/item_list.h b/scene/gui/item_list.h index 848f9a2ba9..934318dbb4 100644 --- a/scene/gui/item_list.h +++ b/scene/gui/item_list.h @@ -191,7 +191,6 @@ public: Variant get_item_metadata(int p_idx) const; void set_item_tag_icon(int p_idx, const Ref<Texture2D> &p_tag_icon); - Ref<Texture2D> get_item_tag_icon(int p_idx) const; void set_item_tooltip_enabled(int p_idx, const bool p_enabled); bool is_item_tooltip_enabled(int p_idx) const; diff --git a/scene/gui/rich_text_label.cpp b/scene/gui/rich_text_label.cpp index 5ab64b35fd..a7e50a765e 100644 --- a/scene/gui/rich_text_label.cpp +++ b/scene/gui/rich_text_label.cpp @@ -4074,8 +4074,8 @@ void RichTextLabel::append_text(const String &p_bbcode) { st_parser_type = TextServer::STRUCTURED_TEXT_EMAIL; } else if (subtag_a[1] == "l" || subtag_a[1] == "list") { st_parser_type = TextServer::STRUCTURED_TEXT_LIST; - } else if (subtag_a[1] == "n" || subtag_a[1] == "none") { - st_parser_type = TextServer::STRUCTURED_TEXT_NONE; + } else if (subtag_a[1] == "n" || subtag_a[1] == "gdscript") { + st_parser_type = TextServer::STRUCTURED_TEXT_GDSCRIPT; } else if (subtag_a[1] == "c" || subtag_a[1] == "custom") { st_parser_type = TextServer::STRUCTURED_TEXT_CUSTOM; } diff --git a/scene/gui/text_edit.cpp b/scene/gui/text_edit.cpp index 108a533a74..8ffaa9e81f 100644 --- a/scene/gui/text_edit.cpp +++ b/scene/gui/text_edit.cpp @@ -1208,7 +1208,15 @@ void TextEdit::_notification(int p_what) { char_ofs = 0; } for (int j = 0; j < gl_size; j++) { - const Variant *color_data = color_map.getptr(glyphs[j].start); + int64_t color_start = -1; + for (const Variant *key = color_map.next(nullptr); key; key = color_map.next(key)) { + if (int64_t(*key) <= glyphs[j].start) { + color_start = *key; + } else { + break; + } + } + const Variant *color_data = (color_start >= 0) ? 
color_map.getptr(color_start) : nullptr; if (color_data != nullptr) { current_color = (color_data->operator Dictionary()).get("color", font_color); if (!editable && current_color.a > font_readonly_color.a) { diff --git a/scene/gui/tree.cpp b/scene/gui/tree.cpp index 2138f10ad0..2d985c2324 100644 --- a/scene/gui/tree.cpp +++ b/scene/gui/tree.cpp @@ -332,7 +332,7 @@ void TreeItem::set_structured_text_bidi_override(int p_column, TextServer::Struc } TextServer::StructuredTextParser TreeItem::get_structured_text_bidi_override(int p_column) const { - ERR_FAIL_INDEX_V(p_column, cells.size(), TextServer::STRUCTURED_TEXT_NONE); + ERR_FAIL_INDEX_V(p_column, cells.size(), TextServer::STRUCTURED_TEXT_DEFAULT); return cells[p_column].st_parser; } diff --git a/scene/main/canvas_item.cpp b/scene/main/canvas_item.cpp index a4af7988c6..9285969356 100644 --- a/scene/main/canvas_item.cpp +++ b/scene/main/canvas_item.cpp @@ -560,44 +560,47 @@ void CanvasItem::draw_multiline_colors(const Vector<Point2> &p_points, const Vec void CanvasItem::draw_rect(const Rect2 &p_rect, const Color &p_color, bool p_filled, real_t p_width) { ERR_FAIL_COND_MSG(!drawing, "Drawing is only allowed inside NOTIFICATION_DRAW, _draw() function or 'draw' signal."); + Rect2 rect = p_rect.abs(); + if (p_filled) { if (p_width != -1.0) { WARN_PRINT("The draw_rect() \"width\" argument has no effect when \"filled\" is \"true\"."); } - RenderingServer::get_singleton()->canvas_item_add_rect(canvas_item, p_rect, p_color); + RenderingServer::get_singleton()->canvas_item_add_rect(canvas_item, rect, p_color); + } else if (p_width >= rect.size.width || p_width >= rect.size.height) { + RenderingServer::get_singleton()->canvas_item_add_rect(canvas_item, rect.grow(0.5f * p_width), p_color); } else { // Thick lines are offset depending on their width to avoid partial overlapping. - // Thin lines don't require an offset, so don't apply one in this case - real_t offset; - if (p_width >= 0) { - offset = p_width / 2.0; - } else { - offset = 0.0; - } + // Thin lines are drawn without offset. The result may not be perfect. + real_t offset = (p_width >= 0) ? 0.5f * p_width : 0.0f; + // Top line. RenderingServer::get_singleton()->canvas_item_add_line( canvas_item, - p_rect.position + Size2(-offset, 0), - p_rect.position + Size2(p_rect.size.width + offset, 0), + rect.position + Size2(-offset, 0), + rect.position + Size2(-offset + rect.size.width, 0), p_color, p_width); + // Right line. RenderingServer::get_singleton()->canvas_item_add_line( canvas_item, - p_rect.position + Size2(p_rect.size.width, offset), - p_rect.position + Size2(p_rect.size.width, p_rect.size.height - offset), + rect.position + Size2(rect.size.width, -offset), + rect.position + Size2(rect.size.width, -offset + rect.size.height), p_color, p_width); + // Bottom line. RenderingServer::get_singleton()->canvas_item_add_line( canvas_item, - p_rect.position + Size2(p_rect.size.width + offset, p_rect.size.height), - p_rect.position + Size2(-offset, p_rect.size.height), + rect.position + Size2(offset + rect.size.width, rect.size.height), + rect.position + Size2(offset, rect.size.height), p_color, p_width); + // Left line. 
RenderingServer::get_singleton()->canvas_item_add_line( canvas_item, - p_rect.position + Size2(0, p_rect.size.height - offset), - p_rect.position + Size2(0, offset), + rect.position + Size2(0, offset + rect.size.height), + rect.position + Size2(0, offset), p_color, p_width); } diff --git a/scene/resources/environment.cpp b/scene/resources/environment.cpp index 746f2f8f9b..8b4656414d 100644 --- a/scene/resources/environment.cpp +++ b/scene/resources/environment.cpp @@ -1043,6 +1043,18 @@ void Environment::_validate_property(PropertyInfo &p_property) const { } } + if (p_property.name == "ambient_light_color" || p_property.name == "ambient_light_energy") { + if (ambient_source == AMBIENT_SOURCE_DISABLED) { + p_property.usage = PROPERTY_USAGE_NO_EDITOR; + } + } + + if (p_property.name == "ambient_light_sky_contribution") { + if (ambient_source == AMBIENT_SOURCE_DISABLED || ambient_source == AMBIENT_SOURCE_COLOR) { + p_property.usage = PROPERTY_USAGE_NO_EDITOR; + } + } + if (p_property.name == "fog_aerial_perspective") { if (bg_mode != BG_SKY) { p_property.usage = PROPERTY_USAGE_NO_EDITOR; diff --git a/scene/resources/mesh.cpp b/scene/resources/mesh.cpp index 5e18b5df37..cedf4319f8 100644 --- a/scene/resources/mesh.cpp +++ b/scene/resources/mesh.cpp @@ -687,6 +687,7 @@ void Mesh::_bind_methods() { BIND_BITFIELD_FLAG(ARRAY_FLAG_USE_2D_VERTICES); BIND_BITFIELD_FLAG(ARRAY_FLAG_USE_DYNAMIC_UPDATE); BIND_BITFIELD_FLAG(ARRAY_FLAG_USE_8_BONE_WEIGHTS); + BIND_BITFIELD_FLAG(ARRAY_FLAG_USES_EMPTY_VERTEX_ARRAY); BIND_ENUM_CONSTANT(BLEND_SHAPE_MODE_NORMALIZED); BIND_ENUM_CONSTANT(BLEND_SHAPE_MODE_RELATIVE); @@ -1555,6 +1556,7 @@ void ArrayMesh::_recompute_aabb() { // TODO: Need to add binding to add_surface using future MeshSurfaceData object. void ArrayMesh::add_surface(BitField<ArrayFormat> p_format, PrimitiveType p_primitive, const Vector<uint8_t> &p_array, const Vector<uint8_t> &p_attribute_array, const Vector<uint8_t> &p_skin_array, int p_vertex_count, const Vector<uint8_t> &p_index_array, int p_index_count, const AABB &p_aabb, const Vector<uint8_t> &p_blend_shape_data, const Vector<AABB> &p_bone_aabbs, const Vector<RS::SurfaceData::LOD> &p_lods) { + ERR_FAIL_COND(surfaces.size() == RS::MAX_MESH_SURFACES); _create_if_empty(); Surface s; @@ -1590,6 +1592,7 @@ void ArrayMesh::add_surface(BitField<ArrayFormat> p_format, PrimitiveType p_prim } void ArrayMesh::add_surface_from_arrays(PrimitiveType p_primitive, const Array &p_arrays, const TypedArray<Array> &p_blend_shapes, const Dictionary &p_lods, BitField<ArrayFormat> p_flags) { + ERR_FAIL_COND(p_blend_shapes.size() != blend_shapes.size()); ERR_FAIL_COND(p_arrays.size() != ARRAY_MAX); RS::SurfaceData surface; @@ -2058,7 +2061,7 @@ void ArrayMesh::_bind_methods() { ClassDB::bind_method(D_METHOD("set_blend_shape_mode", "mode"), &ArrayMesh::set_blend_shape_mode); ClassDB::bind_method(D_METHOD("get_blend_shape_mode"), &ArrayMesh::get_blend_shape_mode); - ClassDB::bind_method(D_METHOD("add_surface_from_arrays", "primitive", "arrays", "blend_shapes", "lods", "compress_flags"), &ArrayMesh::add_surface_from_arrays, DEFVAL(Array()), DEFVAL(Dictionary()), DEFVAL(0)); + ClassDB::bind_method(D_METHOD("add_surface_from_arrays", "primitive", "arrays", "blend_shapes", "lods", "flags"), &ArrayMesh::add_surface_from_arrays, DEFVAL(Array()), DEFVAL(Dictionary()), DEFVAL(0)); ClassDB::bind_method(D_METHOD("clear_surfaces"), &ArrayMesh::clear_surfaces); ClassDB::bind_method(D_METHOD("surface_update_vertex_region", "surf_idx", "offset", "data"), 
&ArrayMesh::surface_update_vertex_region); ClassDB::bind_method(D_METHOD("surface_update_attribute_region", "surf_idx", "offset", "data"), &ArrayMesh::surface_update_attribute_region); diff --git a/scene/resources/primitive_meshes.cpp b/scene/resources/primitive_meshes.cpp index 5ef66a22b6..86ed0001dd 100644 --- a/scene/resources/primitive_meshes.cpp +++ b/scene/resources/primitive_meshes.cpp @@ -2901,7 +2901,7 @@ void TextMesh::_create_mesh_array(Array &p_arr) const { TS->shaped_text_set_spacing(text_rid, TextServer::SpacingType(i), font->get_spacing(TextServer::SpacingType(i))); } - Array stt; + TypedArray<Vector3i> stt; if (st_parser == TextServer::STRUCTURED_TEXT_CUSTOM) { GDVIRTUAL_CALL(_structured_text_parser, st_args, txt, stt); } else { diff --git a/scene/resources/primitive_meshes.h b/scene/resources/primitive_meshes.h index 22cd12b004..e62f26b17c 100644 --- a/scene/resources/primitive_meshes.h +++ b/scene/resources/primitive_meshes.h @@ -622,7 +622,7 @@ protected: virtual void _create_mesh_array(Array &p_arr) const override; public: - GDVIRTUAL2RC(Array, _structured_text_parser, Array, String) + GDVIRTUAL2RC(TypedArray<Vector3i>, _structured_text_parser, Array, String) TextMesh(); ~TextMesh(); diff --git a/scene/resources/resource_format_text.cpp b/scene/resources/resource_format_text.cpp index c85c213c5d..2e8b4f93be 100644 --- a/scene/resources/resource_format_text.cpp +++ b/scene/resources/resource_format_text.cpp @@ -49,10 +49,6 @@ /// -void ResourceLoaderText::set_local_path(const String &p_local_path) { - res_path = p_local_path; -} - Ref<Resource> ResourceLoaderText::get_resource() { return resource; } diff --git a/scene/resources/resource_format_text.h b/scene/resources/resource_format_text.h index 0f95e2fbfd..0cced3d20c 100644 --- a/scene/resources/resource_format_text.h +++ b/scene/resources/resource_format_text.h @@ -115,7 +115,6 @@ class ResourceLoaderText { Ref<PackedScene> _parse_node_tag(VariantParser::ResourceParser &parser); public: - void set_local_path(const String &p_local_path); Ref<Resource> get_resource(); Error load(); Error set_uid(Ref<FileAccess> p_f, ResourceUID::ID p_uid); diff --git a/servers/physics_3d/gjk_epa.cpp b/servers/physics_3d/gjk_epa.cpp index 88f2040d17..e5678914fe 100644 --- a/servers/physics_3d/gjk_epa.cpp +++ b/servers/physics_3d/gjk_epa.cpp @@ -1011,9 +1011,11 @@ bool gjk_epa_calculate_penetration(const GodotShape3D *p_shape_A, const Transfor if (GjkEpa2::Penetration(p_shape_A, p_transform_A, p_margin_A, p_shape_B, p_transform_B, p_margin_B, p_transform_B.origin - p_transform_A.origin, res)) { if (p_result_callback) { if (p_swap) { - p_result_callback(res.witnesses[1], 0, res.witnesses[0], 0, p_userdata); + Vector3 normal = (res.witnesses[1] - res.witnesses[0]).normalized(); + p_result_callback(res.witnesses[1], 0, res.witnesses[0], 0, normal, p_userdata); } else { - p_result_callback(res.witnesses[0], 0, res.witnesses[1], 0, p_userdata); + Vector3 normal = (res.witnesses[0] - res.witnesses[1]).normalized(); + p_result_callback(res.witnesses[0], 0, res.witnesses[1], 0, normal, p_userdata); } } return true; diff --git a/servers/physics_3d/godot_body_pair_3d.cpp b/servers/physics_3d/godot_body_pair_3d.cpp index ce3da390cb..619e6c00be 100644 --- a/servers/physics_3d/godot_body_pair_3d.cpp +++ b/servers/physics_3d/godot_body_pair_3d.cpp @@ -38,12 +38,12 @@ #define MIN_VELOCITY 0.0001 #define MAX_BIAS_ROTATION (Math_PI / 8) -void GodotBodyPair3D::_contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int 
p_index_B, void *p_userdata) { +void GodotBodyPair3D::_contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata) { GodotBodyPair3D *pair = static_cast<GodotBodyPair3D *>(p_userdata); - pair->contact_added_callback(p_point_A, p_index_A, p_point_B, p_index_B); + pair->contact_added_callback(p_point_A, p_index_A, p_point_B, p_index_B, normal); } -void GodotBodyPair3D::contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B) { +void GodotBodyPair3D::contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal) { Vector3 local_A = A->get_inv_transform().basis.xform(p_point_A); Vector3 local_B = B->get_inv_transform().basis.xform(p_point_B - offset_B); @@ -577,12 +577,12 @@ GodotBodyPair3D::~GodotBodyPair3D() { B->remove_constraint(this); } -void GodotBodySoftBodyPair3D::_contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata) { +void GodotBodySoftBodyPair3D::_contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata) { GodotBodySoftBodyPair3D *pair = static_cast<GodotBodySoftBodyPair3D *>(p_userdata); - pair->contact_added_callback(p_point_A, p_index_A, p_point_B, p_index_B); + pair->contact_added_callback(p_point_A, p_index_A, p_point_B, p_index_B, normal); } -void GodotBodySoftBodyPair3D::contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B) { +void GodotBodySoftBodyPair3D::contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal) { Vector3 local_A = body->get_inv_transform().xform(p_point_A); Vector3 local_B = p_point_B - soft_body->get_node_position(p_index_B); @@ -591,7 +591,7 @@ void GodotBodySoftBodyPair3D::contact_added_callback(const Vector3 &p_point_A, i contact.index_B = p_index_B; contact.local_A = local_A; contact.local_B = local_B; - contact.normal = (p_point_A - p_point_B).normalized(); + contact.normal = (normal.dot((p_point_A - p_point_B)) < 0 ? -normal : normal); contact.used = true; // Attempt to determine if the contact will be reused. 
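Every contact callback now receives the collision normal computed by the solver instead of re-deriving it from the witness points, which degenerates when the points coincide. A minimal receiver matching the new `CallbackResult` signature; the accumulator struct and function names are hypothetical:

```cpp
#include "servers/physics_3d/godot_collision_solver_3d.h"

// Illustrative accumulator, not part of this commit.
struct DemoDeepestContact {
	Vector3 normal;
	real_t depth = 0.0;
};

static void demo_contact_cbk(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata) {
	DemoDeepestContact *deepest = static_cast<DemoDeepestContact *>(p_userdata);
	real_t depth = (p_point_B - p_point_A).length();
	if (depth > deepest->depth) {
		deepest->depth = depth;
		// The solver supplies the surface normal directly, so callers no longer
		// need a fallback for coincident witness points.
		deepest->normal = normal;
	}
}
```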
diff --git a/servers/physics_3d/godot_body_pair_3d.h b/servers/physics_3d/godot_body_pair_3d.h index c3165c7fcf..a8f5180dd5 100644 --- a/servers/physics_3d/godot_body_pair_3d.h +++ b/servers/physics_3d/godot_body_pair_3d.h @@ -97,9 +97,9 @@ class GodotBodyPair3D : public GodotBodyContact3D { Contact contacts[MAX_CONTACTS]; int contact_count = 0; - static void _contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata); + static void _contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata); - void contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B); + void contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal); void validate_contacts(); bool _test_ccd(real_t p_step, GodotBody3D *p_A, int p_shape_A, const Transform3D &p_xform_A, GodotBody3D *p_B, int p_shape_B, const Transform3D &p_xform_B); @@ -126,9 +126,9 @@ class GodotBodySoftBodyPair3D : public GodotBodyContact3D { LocalVector<Contact> contacts; - static void _contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata); + static void _contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata); - void contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B); + void contact_added_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal); void validate_contacts(); diff --git a/servers/physics_3d/godot_collision_solver_3d.cpp b/servers/physics_3d/godot_collision_solver_3d.cpp index 0b92b745fe..fb5a67c008 100644 --- a/servers/physics_3d/godot_collision_solver_3d.cpp +++ b/servers/physics_3d/godot_collision_solver_3d.cpp @@ -81,9 +81,11 @@ bool GodotCollisionSolver3D::solve_static_world_boundary(const GodotShape3D *p_s if (p_result_callback) { if (p_swap_result) { - p_result_callback(supports[i], 0, support_A, 0, p_userdata); + Vector3 normal = (support_A - supports[i]).normalized(); + p_result_callback(supports[i], 0, support_A, 0, normal, p_userdata); } else { - p_result_callback(support_A, 0, supports[i], 0, p_userdata); + Vector3 normal = (supports[i] - support_A).normalized(); + p_result_callback(support_A, 0, supports[i], 0, normal, p_userdata); } } } @@ -126,9 +128,11 @@ bool GodotCollisionSolver3D::solve_separation_ray(const GodotShape3D *p_shape_A, if (p_result_callback) { if (p_swap_result) { - p_result_callback(support_B, 0, support_A, 0, p_userdata); + Vector3 normal = (support_B - support_A).normalized(); + p_result_callback(support_B, 0, support_A, 0, normal, p_userdata); } else { - p_result_callback(support_A, 0, support_B, 0, p_userdata); + Vector3 normal = (support_A - support_B).normalized(); + p_result_callback(support_A, 0, support_B, 0, normal, p_userdata); } } return true; @@ -142,7 +146,7 @@ struct _SoftBodyContactCollisionInfo { int contact_count = 0; }; -void GodotCollisionSolver3D::soft_body_contact_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata) { +void GodotCollisionSolver3D::soft_body_contact_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata) { 
_SoftBodyContactCollisionInfo &cinfo = *(static_cast<_SoftBodyContactCollisionInfo *>(p_userdata)); ++cinfo.contact_count; @@ -152,9 +156,9 @@ void GodotCollisionSolver3D::soft_body_contact_callback(const Vector3 &p_point_A } if (cinfo.swap_result) { - cinfo.result_callback(p_point_B, cinfo.node_index, p_point_A, p_index_A, cinfo.userdata); + cinfo.result_callback(p_point_B, cinfo.node_index, p_point_A, p_index_A, -normal, cinfo.userdata); } else { - cinfo.result_callback(p_point_A, p_index_A, p_point_B, cinfo.node_index, cinfo.userdata); + cinfo.result_callback(p_point_A, p_index_A, p_point_B, cinfo.node_index, normal, cinfo.userdata); } } diff --git a/servers/physics_3d/godot_collision_solver_3d.h b/servers/physics_3d/godot_collision_solver_3d.h index 7ef0dc97ac..36ea79576e 100644 --- a/servers/physics_3d/godot_collision_solver_3d.h +++ b/servers/physics_3d/godot_collision_solver_3d.h @@ -35,11 +35,11 @@ class GodotCollisionSolver3D { public: - typedef void (*CallbackResult)(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata); + typedef void (*CallbackResult)(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata); private: static bool soft_body_query_callback(uint32_t p_node_index, void *p_userdata); - static void soft_body_contact_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata); + static void soft_body_contact_callback(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata); static bool soft_body_concave_callback(void *p_userdata, GodotShape3D *p_convex); static bool concave_callback(void *p_userdata, GodotShape3D *p_convex); static bool solve_static_world_boundary(const GodotShape3D *p_shape_A, const Transform3D &p_transform_A, const GodotShape3D *p_shape_B, const Transform3D &p_transform_B, CallbackResult p_result_callback, void *p_userdata, bool p_swap_result, real_t p_margin = 0); diff --git a/servers/physics_3d/godot_collision_solver_3d_sat.cpp b/servers/physics_3d/godot_collision_solver_3d_sat.cpp index d13f4ee801..66d1811abb 100644 --- a/servers/physics_3d/godot_collision_solver_3d_sat.cpp +++ b/servers/physics_3d/godot_collision_solver_3d_sat.cpp @@ -75,11 +75,13 @@ struct _CollectorCallback { Vector3 normal; Vector3 *prev_axis = nullptr; - _FORCE_INLINE_ void call(const Vector3 &p_point_A, const Vector3 &p_point_B) { + _FORCE_INLINE_ void call(const Vector3 &p_point_A, const Vector3 &p_point_B, Vector3 p_normal) { + if (p_normal.dot(p_point_B - p_point_A) < 0) + p_normal = -p_normal; if (swap) { - callback(p_point_B, 0, p_point_A, 0, userdata); + callback(p_point_B, 0, p_point_A, 0, -p_normal, userdata); } else { - callback(p_point_A, 0, p_point_B, 0, userdata); + callback(p_point_A, 0, p_point_B, 0, p_normal, userdata); } } }; @@ -92,7 +94,7 @@ static void _generate_contacts_point_point(const Vector3 *p_points_A, int p_poin ERR_FAIL_COND(p_point_count_B != 1); #endif - p_callback->call(*p_points_A, *p_points_B); + p_callback->call(*p_points_A, *p_points_B, p_callback->normal); } static void _generate_contacts_point_edge(const Vector3 *p_points_A, int p_point_count_A, const Vector3 *p_points_B, int p_point_count_B, _CollectorCallback *p_callback) { @@ -102,7 +104,7 @@ static void _generate_contacts_point_edge(const Vector3 *p_points_A, int p_point #endif Vector3 closest_B = Geometry3D::get_closest_point_to_segment_uncapped(*p_points_A, 
p_points_B); - p_callback->call(*p_points_A, closest_B); + p_callback->call(*p_points_A, closest_B, p_callback->normal); } static void _generate_contacts_point_face(const Vector3 *p_points_A, int p_point_count_A, const Vector3 *p_points_B, int p_point_count_B, _CollectorCallback *p_callback) { @@ -111,9 +113,9 @@ static void _generate_contacts_point_face(const Vector3 *p_points_A, int p_point ERR_FAIL_COND(p_point_count_B < 3); #endif - Vector3 closest_B = Plane(p_points_B[0], p_points_B[1], p_points_B[2]).project(*p_points_A); - - p_callback->call(*p_points_A, closest_B); + Plane plane(p_points_B[0], p_points_B[1], p_points_B[2]); + Vector3 closest_B = plane.project(*p_points_A); + p_callback->call(*p_points_A, closest_B, plane.get_normal()); } static void _generate_contacts_point_circle(const Vector3 *p_points_A, int p_point_count_A, const Vector3 *p_points_B, int p_point_count_B, _CollectorCallback *p_callback) { @@ -122,9 +124,9 @@ static void _generate_contacts_point_circle(const Vector3 *p_points_A, int p_poi ERR_FAIL_COND(p_point_count_B != 3); #endif - Vector3 closest_B = Plane(p_points_B[0], p_points_B[1], p_points_B[2]).project(*p_points_A); - - p_callback->call(*p_points_A, closest_B); + Plane plane(p_points_B[0], p_points_B[1], p_points_B[2]); + Vector3 closest_B = plane.project(*p_points_A); + p_callback->call(*p_points_A, closest_B, plane.get_normal()); } static void _generate_contacts_edge_edge(const Vector3 *p_points_A, int p_point_count_A, const Vector3 *p_points_B, int p_point_count_B, _CollectorCallback *p_callback) { @@ -154,8 +156,8 @@ static void _generate_contacts_edge_edge(const Vector3 *p_points_A, int p_point_ sa.sort(dvec, 4); //use the middle ones as contacts - p_callback->call(base_A + axis * dvec[1], base_B + axis * dvec[1]); - p_callback->call(base_A + axis * dvec[2], base_B + axis * dvec[2]); + p_callback->call(base_A + axis * dvec[1], base_B + axis * dvec[1], p_callback->normal); + p_callback->call(base_A + axis * dvec[2], base_B + axis * dvec[2], p_callback->normal); return; } @@ -170,7 +172,14 @@ static void _generate_contacts_edge_edge(const Vector3 *p_points_A, int p_point_ Vector3 closest_A = p_points_A[0] + rel_A * d; Vector3 closest_B = Geometry3D::get_closest_point_to_segment_uncapped(closest_A, p_points_B); - p_callback->call(closest_A, closest_B); + // The normal should be perpendicular to both edges. 
+ Vector3 normal = rel_A.cross(rel_B); + real_t normal_len = normal.length(); + if (normal_len > 1e-3) + normal /= normal_len; + else + normal = p_callback->normal; + p_callback->call(closest_A, closest_B, normal); } static void _generate_contacts_edge_circle(const Vector3 *p_points_A, int p_point_count_A, const Vector3 *p_points_B, int p_point_count_B, _CollectorCallback *p_callback) { @@ -267,7 +276,7 @@ static void _generate_contacts_edge_circle(const Vector3 *p_points_A, int p_poin continue; } - p_callback->call(contact_point_A, closest_B); + p_callback->call(contact_point_A, closest_B, circle_plane.get_normal()); } } @@ -352,7 +361,7 @@ static void _generate_contacts_face_face(const Vector3 *p_points_A, int p_point_ continue; } - p_callback->call(clipbuf_src[i], closest_B); + p_callback->call(clipbuf_src[i], closest_B, plane_B.get_normal()); } } @@ -431,7 +440,7 @@ static void _generate_contacts_face_circle(const Vector3 *p_points_A, int p_poin continue; } - p_callback->call(contact_point_A, closest_B); + p_callback->call(contact_point_A, closest_B, circle_plane.get_normal()); } } @@ -534,7 +543,7 @@ static void _generate_contacts_circle_circle(const Vector3 *p_points_A, int p_po continue; } - p_callback->call(contact_point_A, closest_B); + p_callback->call(contact_point_A, closest_B, circle_B_plane.get_normal()); } } @@ -678,7 +687,7 @@ public: return true; } - static _FORCE_INLINE_ void test_contact_points(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata) { + static _FORCE_INLINE_ void test_contact_points(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata) { SeparatorAxisTest<ShapeA, ShapeB, withMargin> *separator = (SeparatorAxisTest<ShapeA, ShapeB, withMargin> *)p_userdata; Vector3 axis = (p_point_B - p_point_A); real_t depth = axis.length(); @@ -802,11 +811,11 @@ static void analytic_sphere_collision(const Vector3 &p_origin_a, real_t p_radius if (p_radius_a < p_radius_b) { Vector3 point_a = p_origin_a - b_to_a * p_radius_a; Vector3 point_b = point_a + b_to_a * overlap; - p_collector->call(point_a, point_b); // Consider adding b_to_a vector + p_collector->call(point_a, point_b, b_to_a); // Consider adding b_to_a vector } else { Vector3 point_b = p_origin_b + b_to_a * p_radius_b; Vector3 point_a = point_b - b_to_a * overlap; - p_collector->call(point_a, point_b); // Consider adding b_to_a vector + p_collector->call(point_a, point_b, b_to_a); // Consider adding b_to_a vector } } @@ -859,8 +868,8 @@ static void _collision_sphere_box(const GodotShape3D *p_a, const Transform3D &p_ axis = delta / length; } Vector3 point_a = p_transform_a.origin + (sphere_A->get_radius() + p_margin_a) * axis; - Vector3 point_b = (withMargin ? nearest + p_margin_b * axis : nearest); - p_collector->call(point_a, point_b); + Vector3 point_b = (withMargin ? nearest - p_margin_b * axis : nearest); + p_collector->call(point_a, point_b, axis); } template <bool withMargin> @@ -926,8 +935,8 @@ static void analytic_sphere_cylinder_collision(real_t p_radius_a, real_t p_radiu axis = delta / length; } Vector3 point_a = p_transform_a.origin + (p_radius_a + p_margin_a) * axis; - Vector3 point_b = (withMargin ? nearest + p_margin_b * axis : nearest); - p_collector->call(point_a, point_b); + Vector3 point_b = (withMargin ? 
nearest - p_margin_b * axis : nearest); + p_collector->call(point_a, point_b, axis); } template <bool withMargin> diff --git a/servers/physics_3d/godot_physics_server_3d.cpp b/servers/physics_3d/godot_physics_server_3d.cpp index e8250acb45..b6d8acfbf3 100644 --- a/servers/physics_3d/godot_physics_server_3d.cpp +++ b/servers/physics_3d/godot_physics_server_3d.cpp @@ -1737,7 +1737,7 @@ void GodotPhysicsServer3D::_update_shapes() { } } -void GodotPhysicsServer3D::_shape_col_cbk(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata) { +void GodotPhysicsServer3D::_shape_col_cbk(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata) { CollCbkData *cbk = static_cast<CollCbkData *>(p_userdata); if (cbk->max == 0) { diff --git a/servers/physics_3d/godot_physics_server_3d.h b/servers/physics_3d/godot_physics_server_3d.h index 3da0c6debe..040e673dcd 100644 --- a/servers/physics_3d/godot_physics_server_3d.h +++ b/servers/physics_3d/godot_physics_server_3d.h @@ -77,7 +77,7 @@ public: Vector3 *ptr = nullptr; }; - static void _shape_col_cbk(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata); + static void _shape_col_cbk(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata); virtual RID world_boundary_shape_create() override; virtual RID separation_ray_shape_create() override; diff --git a/servers/physics_3d/godot_space_3d.cpp b/servers/physics_3d/godot_space_3d.cpp index c3aad22932..93572965d2 100644 --- a/servers/physics_3d/godot_space_3d.cpp +++ b/servers/physics_3d/godot_space_3d.cpp @@ -445,7 +445,7 @@ struct _RestCallbackData { _RestResultData *other_results = nullptr; }; -static void _rest_cbk_result(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, void *p_userdata) { +static void _rest_cbk_result(const Vector3 &p_point_A, int p_index_A, const Vector3 &p_point_B, int p_index_B, const Vector3 &normal, void *p_userdata) { _RestCallbackData *rd = static_cast<_RestCallbackData *>(p_userdata); Vector3 contact_rel = p_point_B - p_point_A; @@ -480,7 +480,7 @@ static void _rest_cbk_result(const Vector3 &p_point_A, int p_index_A, const Vect // Keep this result as separate result. 
result.len = len; result.contact = p_point_B; - result.normal = contact_rel / len; + result.normal = normal; result.object = rd->object; result.shape = rd->shape; result.local_shape = rd->local_shape; @@ -499,7 +499,7 @@ static void _rest_cbk_result(const Vector3 &p_point_A, int p_index_A, const Vect rd->best_result.len = len; rd->best_result.contact = p_point_B; - rd->best_result.normal = contact_rel / len; + rd->best_result.normal = normal; rd->best_result.object = rd->object; rd->best_result.shape = rd->shape; rd->best_result.local_shape = rd->local_shape; diff --git a/servers/rendering/shader_preprocessor.cpp b/servers/rendering/shader_preprocessor.cpp index 40c8acffe5..ccbf5defa2 100644 --- a/servers/rendering/shader_preprocessor.cpp +++ b/servers/rendering/shader_preprocessor.cpp @@ -1081,21 +1081,17 @@ ShaderPreprocessor::Define *ShaderPreprocessor::create_define(const String &p_bo return define; } -void ShaderPreprocessor::clear() { - if (state_owner && state != nullptr) { +void ShaderPreprocessor::clear_state() { + if (state != nullptr) { for (const RBMap<String, Define *>::Element *E = state->defines.front(); E; E = E->next()) { memdelete(E->get()); } - - memdelete(state); + state->defines.clear(); } - state_owner = false; state = nullptr; } Error ShaderPreprocessor::preprocess(State *p_state, const String &p_code, String &r_result) { - clear(); - output.clear(); state = p_state; @@ -1242,6 +1238,9 @@ Error ShaderPreprocessor::preprocess(const String &p_code, const String &p_filen } } } + + clear_state(); + return err; } @@ -1273,5 +1272,4 @@ ShaderPreprocessor::ShaderPreprocessor() { } ShaderPreprocessor::~ShaderPreprocessor() { - clear(); } diff --git a/servers/rendering/shader_preprocessor.h b/servers/rendering/shader_preprocessor.h index f5902c64ca..6e5533c575 100644 --- a/servers/rendering/shader_preprocessor.h +++ b/servers/rendering/shader_preprocessor.h @@ -167,7 +167,6 @@ private: private: LocalVector<char32_t> output; State *state = nullptr; - bool state_owner = false; private: static bool is_char_word(char32_t p_char); @@ -211,7 +210,7 @@ private: static Define *create_define(const String &p_body); - void clear(); + void clear_state(); Error preprocess(State *p_state, const String &p_code, String &r_result); diff --git a/servers/rendering_server.cpp b/servers/rendering_server.cpp index e24b2af976..c3bd3d277f 100644 --- a/servers/rendering_server.cpp +++ b/servers/rendering_server.cpp @@ -996,6 +996,7 @@ Error RenderingServer::mesh_create_surface_data_from_arrays(SurfaceData *r_surfa if (index_array_len) { List<Variant> keys; p_lods.get_key_list(&keys); + keys.sort(); // otherwise lod levels may get skipped for (const Variant &E : keys) { float distance = E; ERR_CONTINUE(distance <= 0.0); @@ -1826,6 +1827,7 @@ void RenderingServer::_bind_methods() { BIND_BITFIELD_FLAG(ARRAY_FLAG_USE_2D_VERTICES); BIND_BITFIELD_FLAG(ARRAY_FLAG_USE_DYNAMIC_UPDATE); BIND_BITFIELD_FLAG(ARRAY_FLAG_USE_8_BONE_WEIGHTS); + BIND_BITFIELD_FLAG(ARRAY_FLAG_USES_EMPTY_VERTEX_ARRAY); BIND_ENUM_CONSTANT(PRIMITIVE_POINTS); BIND_ENUM_CONSTANT(PRIMITIVE_LINES); diff --git a/servers/text/text_server_extension.cpp b/servers/text/text_server_extension.cpp index 997b83e32d..cbf37f25d6 100644 --- a/servers/text/text_server_extension.cpp +++ b/servers/text/text_server_extension.cpp @@ -1373,8 +1373,8 @@ String TextServerExtension::string_to_lower(const String &p_string, const String return p_string; } -TypedArray<Vector2i> TextServerExtension::parse_structured_text(StructuredTextParser p_parser_type, const 
Array &p_args, const String &p_text) const { - TypedArray<Vector2i> ret; +TypedArray<Vector3i> TextServerExtension::parse_structured_text(StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const { + TypedArray<Vector3i> ret; GDVIRTUAL_CALL(_parse_structured_text, p_parser_type, p_args, p_text, ret); return ret; } diff --git a/servers/text/text_server_extension.h b/servers/text/text_server_extension.h index fb784f5471..8536836983 100644 --- a/servers/text/text_server_extension.h +++ b/servers/text/text_server_extension.h @@ -521,8 +521,8 @@ public: GDVIRTUAL2RC(String, _string_to_upper, const String &, const String &); GDVIRTUAL2RC(String, _string_to_lower, const String &, const String &); - TypedArray<Vector2i> parse_structured_text(StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const; - GDVIRTUAL3RC(TypedArray<Vector2i>, _parse_structured_text, StructuredTextParser, const Array &, const String &); + TypedArray<Vector3i> parse_structured_text(StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const; + GDVIRTUAL3RC(TypedArray<Vector3i>, _parse_structured_text, StructuredTextParser, const Array &, const String &); virtual int64_t is_confusable(const String &p_string, const PackedStringArray &p_dict) const override; virtual bool spoof_check(const String &p_string) const override; diff --git a/servers/text_server.cpp b/servers/text_server.cpp index d339533688..027109b67d 100644 --- a/servers/text_server.cpp +++ b/servers/text_server.cpp @@ -483,6 +483,7 @@ void TextServer::_bind_methods() { BIND_ENUM_CONSTANT(DIRECTION_AUTO); BIND_ENUM_CONSTANT(DIRECTION_LTR); BIND_ENUM_CONSTANT(DIRECTION_RTL); + BIND_ENUM_CONSTANT(DIRECTION_INHERITED); /* Orientation */ BIND_ENUM_CONSTANT(ORIENTATION_HORIZONTAL); @@ -599,7 +600,7 @@ void TextServer::_bind_methods() { BIND_ENUM_CONSTANT(STRUCTURED_TEXT_FILE); BIND_ENUM_CONSTANT(STRUCTURED_TEXT_EMAIL); BIND_ENUM_CONSTANT(STRUCTURED_TEXT_LIST); - BIND_ENUM_CONSTANT(STRUCTURED_TEXT_NONE); + BIND_ENUM_CONSTANT(STRUCTURED_TEXT_GDSCRIPT); BIND_ENUM_CONSTANT(STRUCTURED_TEXT_CUSTOM); } @@ -1692,22 +1693,22 @@ String TextServer::strip_diacritics(const String &p_string) const { return result; } -TypedArray<Vector2i> TextServer::parse_structured_text(StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const { - TypedArray<Vector2i> ret; +TypedArray<Vector3i> TextServer::parse_structured_text(StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const { + TypedArray<Vector3i> ret; switch (p_parser_type) { case STRUCTURED_TEXT_URI: { int prev = 0; for (int i = 0; i < p_text.length(); i++) { if ((p_text[i] == '\\') || (p_text[i] == '/') || (p_text[i] == '.') || (p_text[i] == ':') || (p_text[i] == '&') || (p_text[i] == '=') || (p_text[i] == '@') || (p_text[i] == '?') || (p_text[i] == '#')) { if (prev != i) { - ret.push_back(Vector2i(prev, i)); + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); } - ret.push_back(Vector2i(i, i + 1)); + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); prev = i + 1; } } if (prev != p_text.length()) { - ret.push_back(Vector2i(prev, p_text.length())); + ret.push_back(Vector3i(prev, p_text.length(), TextServer::DIRECTION_AUTO)); } } break; case STRUCTURED_TEXT_FILE: { @@ -1715,14 +1716,14 @@ TypedArray<Vector2i> TextServer::parse_structured_text(StructuredTextParser p_pa for (int i = 0; i < p_text.length(); i++) { if ((p_text[i] == '\\') || (p_text[i] == '/') || (p_text[i] == ':')) { 
if (prev != i) { - ret.push_back(Vector2i(prev, i)); + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); } - ret.push_back(Vector2i(i, i + 1)); + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); prev = i + 1; } } if (prev != p_text.length()) { - ret.push_back(Vector2i(prev, p_text.length())); + ret.push_back(Vector3i(prev, p_text.length(), TextServer::DIRECTION_AUTO)); } } break; case STRUCTURED_TEXT_EMAIL: { @@ -1731,19 +1732,19 @@ TypedArray<Vector2i> TextServer::parse_structured_text(StructuredTextParser p_pa for (int i = 0; i < p_text.length(); i++) { if ((p_text[i] == '@') && local) { // Add full "local" as single context. local = false; - ret.push_back(Vector2i(prev, i)); - ret.push_back(Vector2i(i, i + 1)); + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); prev = i + 1; } else if (!local && (p_text[i] == '.')) { // Add each dot separated "domain" part as context. if (prev != i) { - ret.push_back(Vector2i(prev, i)); + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); } - ret.push_back(Vector2i(i, i + 1)); + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); prev = i + 1; } } if (prev != p_text.length()) { - ret.push_back(Vector2i(prev, p_text.length())); + ret.push_back(Vector3i(prev, p_text.length(), TextServer::DIRECTION_AUTO)); } } break; case STRUCTURED_TEXT_LIST: { @@ -1752,18 +1753,97 @@ TypedArray<Vector2i> TextServer::parse_structured_text(StructuredTextParser p_pa int prev = 0; for (int i = 0; i < tags.size(); i++) { if (prev != i) { - ret.push_back(Vector2i(prev, prev + tags[i].length())); + ret.push_back(Vector3i(prev, prev + tags[i].length(), TextServer::DIRECTION_INHERITED)); } - ret.push_back(Vector2i(prev + tags[i].length(), prev + tags[i].length() + 1)); + ret.push_back(Vector3i(prev + tags[i].length(), prev + tags[i].length() + 1, TextServer::DIRECTION_INHERITED)); prev = prev + tags[i].length() + 1; } } } break; + case STRUCTURED_TEXT_GDSCRIPT: { + bool in_string_literal = false; + bool in_string_literal_single = false; + bool in_id = false; + + int prev = 0; + for (int i = 0; i < p_text.length(); i++) { + char32_t c = p_text[i]; + if (in_string_literal) { + if (c == '\\') { + i++; + continue; // Skip escaped chars. + } else if (c == '\"') { + // String literal end, push string and ". + if (prev != i) { + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); + } + prev = i + 1; + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); + in_string_literal = false; + } + } else if (in_string_literal_single) { + if (c == '\\') { + i++; + continue; // Skip escaped chars. + } else if (c == '\'') { + // String literal end, push string and '. + if (prev != i) { + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); + } + prev = i + 1; + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); + in_string_literal_single = false; + } + } else if (in_id) { + if (!is_unicode_identifier_continue(c)) { + // End of id, push id. + if (prev != i) { + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); + } + prev = i; + in_id = false; + } + } else if (is_unicode_identifier_start(c)) { + // Start of new id, push prev element. + if (prev != i) { + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); + } + prev = i; + in_id = true; + } else if (c == '\"') { + // String literal start, push prev element and ". 
+ if (prev != i) { + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); + } + prev = i + 1; + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); + in_string_literal = true; + } else if (c == '\'') { + // String literal start, push prev element and '. + if (prev != i) { + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); + } + prev = i + 1; + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); + in_string_literal_single = true; + } else if (c == '#') { + // Start of comment, push prev element and #, skip the rest of the text. + if (prev != i) { + ret.push_back(Vector3i(prev, i, TextServer::DIRECTION_AUTO)); + } + prev = i + 1; + ret.push_back(Vector3i(i, i + 1, TextServer::DIRECTION_LTR)); + break; + } + } + if (prev < p_text.length()) { + ret.push_back(Vector3i(prev, p_text.length(), TextServer::DIRECTION_AUTO)); + } + } break; case STRUCTURED_TEXT_CUSTOM: - case STRUCTURED_TEXT_NONE: case STRUCTURED_TEXT_DEFAULT: default: { - ret.push_back(Vector2i(0, p_text.length())); + ret.push_back(Vector3i(0, p_text.length(), TextServer::DIRECTION_INHERITED)); } } return ret; diff --git a/servers/text_server.h b/servers/text_server.h index a56c7d8b23..a91d367e97 100644 --- a/servers/text_server.h +++ b/servers/text_server.h @@ -65,7 +65,8 @@ public: enum Direction { DIRECTION_AUTO, DIRECTION_LTR, - DIRECTION_RTL + DIRECTION_RTL, + DIRECTION_INHERITED, }; enum Orientation { @@ -198,7 +199,7 @@ public: STRUCTURED_TEXT_FILE, STRUCTURED_TEXT_EMAIL, STRUCTURED_TEXT_LIST, - STRUCTURED_TEXT_NONE, + STRUCTURED_TEXT_GDSCRIPT, STRUCTURED_TEXT_CUSTOM }; @@ -505,7 +506,7 @@ public: virtual String string_to_upper(const String &p_string, const String &p_language = "") const = 0; virtual String string_to_lower(const String &p_string, const String &p_language = "") const = 0; - TypedArray<Vector2i> parse_structured_text(StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const; + TypedArray<Vector3i> parse_structured_text(StructuredTextParser p_parser_type, const Array &p_args, const String &p_text) const; virtual void cleanup() {} diff --git a/tests/scene/test_arraymesh.h b/tests/scene/test_arraymesh.h index 4d9feeb4fa..b2a2ecc3bf 100644 --- a/tests/scene/test_arraymesh.h +++ b/tests/scene/test_arraymesh.h @@ -114,6 +114,17 @@ TEST_CASE("[SceneTree][ArrayMesh] Adding and modifying blendshapes.") { CHECK(mesh->get_blend_shape_count() == 0); } + SUBCASE("Can't add surface with incorrect number of blend shapes.") { + mesh->add_blend_shape(name_a); + mesh->add_blend_shape(name_b); + Ref<CylinderMesh> cylinder = memnew(CylinderMesh); + Array cylinder_array{}; + ERR_PRINT_OFF + mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, cylinder_array); + ERR_PRINT_ON + CHECK(mesh->get_surface_count() == 0); + } + SUBCASE("Can't clear blend shapes after surface had been added.") { mesh->add_blend_shape(name_a); mesh->add_blend_shape(name_b); @@ -121,7 +132,15 @@ TEST_CASE("[SceneTree][ArrayMesh] Adding and modifying blendshapes.") { Array cylinder_array{}; cylinder_array.resize(Mesh::ARRAY_MAX); cylinder->create_mesh_array(cylinder_array, 3.f, 3.f, 5.f); - mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, cylinder_array); + Array blend_shape{}; + blend_shape.resize(Mesh::ARRAY_MAX); + blend_shape[Mesh::ARRAY_VERTEX] = cylinder_array[Mesh::ARRAY_VERTEX]; + blend_shape[Mesh::ARRAY_NORMAL] = cylinder_array[Mesh::ARRAY_NORMAL]; + blend_shape[Mesh::ARRAY_TANGENT] = cylinder_array[Mesh::ARRAY_TANGENT]; + Array blend_shapes{}; + 
blend_shapes.push_back(blend_shape); + blend_shapes.push_back(blend_shape); + mesh->add_surface_from_arrays(Mesh::PRIMITIVE_TRIANGLES, cylinder_array, blend_shapes); ERR_PRINT_OFF mesh->clear_blend_shapes(); diff --git a/tests/scene/test_primitives.h b/tests/scene/test_primitives.h index 6cdb5fb0a5..9232a3020d 100644 --- a/tests/scene/test_primitives.h +++ b/tests/scene/test_primitives.h @@ -734,7 +734,7 @@ TEST_CASE("[SceneTree][Primitive][Text] Text Primitive") { text->get_structured_text_bidi_override() == TextServer::STRUCTURED_TEXT_FILE || text->get_structured_text_bidi_override() == TextServer::STRUCTURED_TEXT_EMAIL || text->get_structured_text_bidi_override() == TextServer::STRUCTURED_TEXT_LIST || - text->get_structured_text_bidi_override() == TextServer::STRUCTURED_TEXT_NONE || + text->get_structured_text_bidi_override() == TextServer::STRUCTURED_TEXT_GDSCRIPT || text->get_structured_text_bidi_override() == TextServer::STRUCTURED_TEXT_CUSTOM)); CHECK(text->get_structured_text_bidi_override_options().size() >= 0); CHECK(text->get_width() > 0); diff --git a/thirdparty/README.md b/thirdparty/README.md index 33f835cbcd..38001b8782 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -17,6 +17,18 @@ Files extracted from upstream source: - `license.txt` +## astcenc + +- Upstream: https://github.com/ARM-software/astc-encoder +- Version: 4.3.0 (ec83dda79fcefe07f69cdae7ed980d169bf2c4d4, 2023) +- License: Apache 2.0 + +Files extracted from upstream source: + +- `astcenc_*` and `astcenc.h` files from `Source` +- `LICENSE.txt` + + ## basis_universal - Upstream: https://github.com/BinomialLLC/basis_universal diff --git a/thirdparty/astcenc/LICENSE.txt b/thirdparty/astcenc/LICENSE.txt new file mode 100644 index 0000000000..b82735a310 --- /dev/null +++ b/thirdparty/astcenc/LICENSE.txt @@ -0,0 +1,175 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. diff --git a/thirdparty/astcenc/astcenc.h b/thirdparty/astcenc/astcenc.h new file mode 100644 index 0000000000..70ae783373 --- /dev/null +++ b/thirdparty/astcenc/astcenc.h @@ -0,0 +1,815 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2020-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief The core astcenc codec library interface. + * + * This interface is the entry point to the core astcenc codec. It aims to be easy to use for + * non-experts, but also to allow experts to have fine control over the compressor heuristics if + * needed. The core codec only handles compression and decompression, transferring all inputs and + * outputs via memory buffers. To catch obvious input/output buffer sizing issues, which can cause + * security and stability problems, all transfer buffers are explicitly sized. + * + * While the aim is that we keep this interface mostly stable, it should be viewed as a mutable + * interface tied to a specific source version. We are not trying to maintain backwards + * compatibility across codec versions. + * + * The API state management is based around an explicit context object, which is the context for all + * allocated memory resources needed to compress and decompress a single image. A context can be + * used to sequentially compress multiple images using the same configuration, allowing setup + * overheads to be amortized over multiple images, which is particularly important when images are + * small. 
+ *
+ * Multi-threading can be used two ways.
+ *
+ *     * An application wishing to process multiple images in parallel can allocate multiple
+ *       contexts and assign each context to a thread.
+ *     * An application wishing to process a single image using multiple threads can configure
+ *       contexts for multi-threaded use, and invoke astcenc_compress/decompress() once per thread
+ *       for faster processing. The caller is responsible for creating the worker threads, and
+ *       synchronizing between images.
+ *
+ * Threading
+ * =========
+ *
+ * In pseudo-code, the usage for manual user threading looks like this:
+ *
+ *     // Configure the compressor run
+ *     astcenc_config my_config;
+ *     astcenc_config_init(..., &my_config);
+ *
+ *     // Power users can tweak <my_config> settings here ...
+ *
+ *     // Allocate working state given config and thread_count
+ *     astcenc_context* my_context;
+ *     astcenc_context_alloc(&my_config, thread_count, &my_context);
+ *
+ *     // Compress each image using these config settings
+ *     foreach image:
+ *         // For each thread in the thread pool
+ *         for i in range(0, thread_count):
+ *             astcenc_compress_image(my_context, &my_input, my_output, i);
+ *
+ *         astcenc_compress_reset(my_context);
+ *
+ *     // Clean up
+ *     astcenc_context_free(my_context);
+ *
+ * Images
+ * ======
+ *
+ * The codec supports compressing single images, which can be either 2D images or volumetric 3D
+ * images. Calling code is responsible for any handling of aggregate types, such as mipmap chains,
+ * texture arrays, or sliced 3D textures.
+ *
+ * Images are passed in as an astcenc_image structure. Inputs can be either 8-bit unorm, 16-bit
+ * half-float, or 32-bit float, as indicated by the data_type field.
+ *
+ * Images can be any dimension; there is no requirement to be a multiple of the ASTC block size.
+ *
+ * Data is always passed in as 4 color components, and accessed as an array of 2D image slices. Data
+ * within an image slice is always tightly packed without padding. Addressing looks like this:
+ *
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4    ] // Red
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 1] // Green
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 2] // Blue
+ *     data[z_coord][y_coord * x_dim * 4 + x_coord * 4 + 3] // Alpha
+ *
+ * Common compressor usage
+ * =======================
+ *
+ * One of the most important things for coding image quality is to align the input data component
+ * count with the ASTC color endpoint mode. This avoids wasting bits encoding components you don't
+ * actually need in the endpoint colors.
+ *
+ *     | Input data   | Encoding swizzle | Sampling swizzle |
+ *     | ------------ | ---------------- | ---------------- |
+ *     | 1 component  | RRR1             | .[rgb]           |
+ *     | 2 components | RRRG             | .[rgb]a          |
+ *     | 3 components | RGB1             | .rgb             |
+ *     | 4 components | RGBA             | .rgba            |
+ *
+ * The 1 and 2 component modes recommend sampling from "g" to recover the luminance value as this
+ * provides best compatibility with other texture formats where the green component may be stored
+ * at higher precision than the others, such as RGB565. For ASTC any of the RGB components can be
+ * used; the luminance endpoint component will be returned for all three.
+ *
+ * When using the normal map compression mode ASTC will store normals as a two component X+Y map.
+ * Input images must contain unit-length normalized vectors and should be passed in using a two
+ * component swizzle. The astcenc command line tool defaults to an RRRG swizzle, but some developers
+ * prefer to use GGGR for compatibility with BC5n which will work just as well. The Z component can
+ * be recovered programmatically in shader code, using knowledge that the vector is unit length and
+ * that Z must be positive for a tangent-space normal map.
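+ *
+ * As a minimal sketch (an illustration, not part of the codec API), that reconstruction in C++
+ * terms, assuming x and y are the sampled components remapped from [0,1] to [-1,1]:
+ *
+ *     float z = std::sqrt(std::max(1.0f - x * x - y * y, 0.0f));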
+ *
+ * Decompress-only usage
+ * =====================
+ *
+ * For some use cases it is useful to have a cut-down context and/or library which supports
+ * decompression but not compression.
+ *
+ * A context can be made decompress-only using the ASTCENC_FLG_DECOMPRESS_ONLY flag when the context
+ * is allocated. These contexts have lower dynamic memory footprint than a full context.
+ *
+ * The entire library can be made decompress-only by building the files with the define
+ * ASTCENC_DECOMPRESS_ONLY set. In this build the context will be smaller, and the library will
+ * exclude the functionality which is only needed for compression. This reduces the binary size by
+ * ~180KB. For these builds contexts must be created with the ASTCENC_FLG_DECOMPRESS_ONLY flag.
+ *
+ * Note that context structures returned by a library built as decompress-only are incompatible with
+ * a library built with compression included, and vice versa, as they have different sizes and
+ * memory layout.
+ *
+ * Self-decompress-only usage
+ * ==========================
+ *
+ * ASTC is a complex format with a large search space. The parts of this search space that are
+ * searched are determined by heuristics that are, in part, tied to the quality level used when
+ * creating the context.
+ *
+ * A normal context is capable of decompressing any ASTC texture, including those generated by other
+ * compressors with unknown heuristics. This is the most flexible implementation, but forces the
+ * data tables used by the codec to include entries that are not needed during compression. This
+ * can slow down context creation by a significant amount, especially for the faster compression
+ * modes where few data table entries are actually used. To optimize this use case the context can
+ * be created with the ASTCENC_FLG_SELF_DECOMPRESS_ONLY flag. This tells the compressor that it will
+ * only be asked to decompress images that it compressed itself, allowing the data tables to
+ * exclude entries that are not needed by the current compression configuration. This reduces the
+ * size of the context data tables in memory and improves context creation performance. Note that,
+ * as of the 3.6 release, this flag no longer affects compression performance.
+ *
+ * Using this flag while attempting to decompress a valid image which was created by another
+ * compressor, or even another astcenc compressor version or configuration, may result in blocks
+ * returning as solid magenta or NaN value error blocks.
+ */
+
+#ifndef ASTCENC_INCLUDED
+#define ASTCENC_INCLUDED
+
+#include <cstddef>
+#include <cstdint>
+
+#if defined(ASTCENC_DYNAMIC_LIBRARY)
+	#if defined(_MSC_VER)
+		#define ASTCENC_PUBLIC extern "C" __declspec(dllexport)
+	#else
+		#define ASTCENC_PUBLIC extern "C" __attribute__ ((visibility ("default")))
+	#endif
+#else
+	#define ASTCENC_PUBLIC
+#endif
+
+/* ============================================================================
+    Data declarations
+============================================================================ */
+
+/**
+ * @brief An opaque structure; see astcenc_internal.h for definition.
+ */
+struct astcenc_context;
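+
+// For illustration, a minimal sketch of allocating a decompress-only context for 6x6 LDR data
+// using the flags described above; error checking is elided and the names are placeholders:
+//
+//     astcenc_config config;
+//     astcenc_config_init(ASTCENC_PRF_LDR, 6, 6, 1, ASTCENC_PRE_MEDIUM,
+//                         ASTCENC_FLG_DECOMPRESS_ONLY, &config);
+//
+//     astcenc_context* context = nullptr;
+//     astcenc_context_alloc(&config, 1, &context);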
+
+/**
+ * @brief A codec API error code.
+ */
+enum astcenc_error {
+	/** @brief The call was successful. */
+	ASTCENC_SUCCESS = 0,
+	/** @brief The call failed due to low memory, or undersized I/O buffers. */
+	ASTCENC_ERR_OUT_OF_MEM,
+	/** @brief The call failed due to the build using fast math. */
+	ASTCENC_ERR_BAD_CPU_FLOAT,
+	/** @brief The call failed due to the build using an unsupported ISA. */
+	ASTCENC_ERR_BAD_CPU_ISA,
+	/** @brief The call failed due to an out-of-spec parameter. */
+	ASTCENC_ERR_BAD_PARAM,
+	/** @brief The call failed due to an out-of-spec block size. */
+	ASTCENC_ERR_BAD_BLOCK_SIZE,
+	/** @brief The call failed due to an out-of-spec color profile. */
+	ASTCENC_ERR_BAD_PROFILE,
+	/** @brief The call failed due to an out-of-spec quality value. */
+	ASTCENC_ERR_BAD_QUALITY,
+	/** @brief The call failed due to an out-of-spec component swizzle. */
+	ASTCENC_ERR_BAD_SWIZZLE,
+	/** @brief The call failed due to an out-of-spec flag set. */
+	ASTCENC_ERR_BAD_FLAGS,
+	/** @brief The call failed due to the context not supporting the operation. */
+	ASTCENC_ERR_BAD_CONTEXT,
+	/** @brief The call failed due to unimplemented functionality. */
+	ASTCENC_ERR_NOT_IMPLEMENTED,
+#if defined(ASTCENC_DIAGNOSTICS)
+	/** @brief The call failed due to an issue with diagnostic tracing. */
+	ASTCENC_ERR_DTRACE_FAILURE,
+#endif
+};
+
+/**
+ * @brief A codec color profile.
+ */
+enum astcenc_profile {
+	/** @brief The LDR sRGB color profile. */
+	ASTCENC_PRF_LDR_SRGB = 0,
+	/** @brief The LDR linear color profile. */
+	ASTCENC_PRF_LDR,
+	/** @brief The HDR RGB with LDR alpha color profile. */
+	ASTCENC_PRF_HDR_RGB_LDR_A,
+	/** @brief The HDR RGBA color profile. */
+	ASTCENC_PRF_HDR
+};
+
+/** @brief The fastest, lowest quality, search preset. */
+static const float ASTCENC_PRE_FASTEST = 0.0f;
+
+/** @brief The fast search preset. */
+static const float ASTCENC_PRE_FAST = 10.0f;
+
+/** @brief The medium quality search preset. */
+static const float ASTCENC_PRE_MEDIUM = 60.0f;
+
+/** @brief The thorough quality search preset. */
+static const float ASTCENC_PRE_THOROUGH = 98.0f;
+
+/** @brief The very thorough quality search preset. */
+static const float ASTCENC_PRE_VERYTHOROUGH = 99.0f;
+
+/** @brief The exhaustive, highest quality, search preset. */
+static const float ASTCENC_PRE_EXHAUSTIVE = 100.0f;
+
+/**
+ * @brief A codec component swizzle selector.
+ */
+enum astcenc_swz
+{
+	/** @brief Select the red component. */
+	ASTCENC_SWZ_R = 0,
+	/** @brief Select the green component. */
+	ASTCENC_SWZ_G = 1,
+	/** @brief Select the blue component. */
+	ASTCENC_SWZ_B = 2,
+	/** @brief Select the alpha component. */
+	ASTCENC_SWZ_A = 3,
+	/** @brief Use a constant zero component. */
+	ASTCENC_SWZ_0 = 4,
+	/** @brief Use a constant one component. */
+	ASTCENC_SWZ_1 = 5,
+	/** @brief Use a reconstructed normal vector Z component. */
+	ASTCENC_SWZ_Z = 6
+};
+
+/**
+ * @brief A texel component swizzle.
+ */
+struct astcenc_swizzle
+{
+	/** @brief The red component selector. */
+	astcenc_swz r;
+	/** @brief The green component selector. */
+	astcenc_swz g;
+	/** @brief The blue component selector. */
+	astcenc_swz b;
+	/** @brief The alpha component selector. */
+	astcenc_swz a;
+};
+
+/**
+ * @brief A texel component data format.
+ */
+enum astcenc_type
+{
+	/** @brief Unorm 8-bit data per component. */
+	ASTCENC_TYPE_U8 = 0,
+	/** @brief 16-bit float per component. */
+	ASTCENC_TYPE_F16 = 1,
+	/** @brief 32-bit float per component. */
+	ASTCENC_TYPE_F32 = 2
+};
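+
+// For illustration, sketches of the encoding swizzles from the table in the header comment,
+// expressed with this struct (the variable names are placeholders):
+//
+//     astcenc_swizzle swz_rrrg { ASTCENC_SWZ_R, ASTCENC_SWZ_R, ASTCENC_SWZ_R, ASTCENC_SWZ_G };
+//     astcenc_swizzle swz_rgba { ASTCENC_SWZ_R, ASTCENC_SWZ_G, ASTCENC_SWZ_B, ASTCENC_SWZ_A };
+//
+// A decompression swizzle can also rebuild normal map Z via ASTCENC_SWZ_Z, e.g.
+// { ASTCENC_SWZ_R, ASTCENC_SWZ_A, ASTCENC_SWZ_Z, ASTCENC_SWZ_1 }.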
+
+/**
+ * @brief Enable normal map compression.
+ *
+ * Input data will be treated as a two component normal map, storing X and Y, and the codec will
+ * optimize for angular error rather than simple linear PSNR. In this mode the input swizzle should
+ * be e.g. rrrg (the default ordering for ASTC normals on the command line) or gggr (the ordering
+ * used by BC5n).
+ */
+static const unsigned int ASTCENC_FLG_MAP_NORMAL = 1 << 0;
+
+/**
+ * @brief Enable alpha weighting.
+ *
+ * The input alpha value is used for transparency, so errors in the RGB components are weighted by
+ * the transparency level. This allows the codec to more accurately encode the alpha value in areas
+ * where the color value is less significant.
+ */
+static const unsigned int ASTCENC_FLG_USE_ALPHA_WEIGHT = 1 << 2;
+
+/**
+ * @brief Enable perceptual error metrics.
+ *
+ * This mode enables perceptual compression mode, which will optimize for perceptual error rather
+ * than best PSNR. Only some input modes support perceptual error metrics.
+ */
+static const unsigned int ASTCENC_FLG_USE_PERCEPTUAL = 1 << 3;
+
+/**
+ * @brief Create a decompression-only context.
+ *
+ * This mode disables support for compression. This enables context allocation to skip some
+ * transient buffer allocation, resulting in lower memory usage.
+ */
+static const unsigned int ASTCENC_FLG_DECOMPRESS_ONLY = 1 << 4;
+
+/**
+ * @brief Create a self-decompression context.
+ *
+ * This mode configures the compressor so that it is only guaranteed to be able to decompress images
+ * that were actually created using the current context. This is the common case for compression use
+ * cases, and setting this flag enables additional optimizations, but does mean that the context
+ * cannot reliably decompress arbitrary ASTC images.
+ */
+static const unsigned int ASTCENC_FLG_SELF_DECOMPRESS_ONLY = 1 << 5;
+
+/**
+ * @brief Enable RGBM map compression.
+ *
+ * Input data will be treated as HDR data that has been stored in an LDR RGBM-encoded wrapper
+ * format. Data must be preprocessed by the user to be in LDR RGBM format before calling the
+ * compression function; this flag is only used to control the use of RGBM-specific heuristics and
+ * error metrics.
+ *
+ * IMPORTANT: The ASTC format is prone to bad failure modes with unconstrained RGBM data; very small
+ * M values can round to zero due to quantization and result in black or white pixels. It is highly
+ * recommended that the minimum value of M used in the encoding is kept above a lower threshold (try
+ * 16 or 32). Applying this threshold reduces the number of very dark colors that can be
+ * represented, but is still higher precision than 8-bit LDR.
+ *
+ * When this flag is set the value of @c rgbm_m_scale in the context must be set to the RGBM scale
+ * factor used during reconstruction. This defaults to 5 when in RGBM mode.
+ *
+ * It is recommended that the value of @c cw_a_weight is set to twice the value of the multiplier
+ * scale, ensuring that the M value is accurately encoded. This defaults to 10 when in RGBM mode,
+ * matching the default scale factor.
+ */
+static const unsigned int ASTCENC_FLG_MAP_RGBM = 1 << 6;
+
+/**
+ * @brief The bit mask of all valid flags.
+ */
+static const unsigned int ASTCENC_ALL_FLAGS =
+	ASTCENC_FLG_MAP_NORMAL |
+	ASTCENC_FLG_MAP_RGBM |
+	ASTCENC_FLG_USE_ALPHA_WEIGHT |
+	ASTCENC_FLG_USE_PERCEPTUAL |
+	ASTCENC_FLG_DECOMPRESS_ONLY |
+	ASTCENC_FLG_SELF_DECOMPRESS_ONLY;
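+
+// For illustration, a sketch of configuring RGBM compression as described above; the values
+// mirror the documented RGBM-mode defaults, and the names are placeholders:
+//
+//     astcenc_config config;
+//     astcenc_config_init(ASTCENC_PRF_LDR, 4, 4, 1, ASTCENC_PRE_MEDIUM,
+//                         ASTCENC_FLG_MAP_RGBM, &config);
+//
+//     config.rgbm_m_scale = 5.0f;                       // RGBM reconstruction scale factor
+//     config.cw_a_weight = 2.0f * config.rgbm_m_scale;  // weight M at twice the scale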
+
+/**
+ * @brief The config structure.
+ *
+ * This structure will initially be populated by a call to astcenc_config_init, but power users may
+ * modify it before calling astcenc_context_alloc. See astcenccli_toplevel_help.cpp for full user
+ * documentation of the power-user settings.
+ *
+ * Note for any settings which are associated with a specific color component, the value in the
+ * config applies to the component that exists after any compression data swizzle is applied.
+ */
+struct astcenc_config
+{
+	/** @brief The color profile. */
+	astcenc_profile profile;
+
+	/** @brief The set of enabled flags. */
+	unsigned int flags;
+
+	/** @brief The ASTC block size X dimension. */
+	unsigned int block_x;
+
+	/** @brief The ASTC block size Y dimension. */
+	unsigned int block_y;
+
+	/** @brief The ASTC block size Z dimension. */
+	unsigned int block_z;
+
+	/** @brief The red component weight scale for error weighting (-cw). */
+	float cw_r_weight;
+
+	/** @brief The green component weight scale for error weighting (-cw). */
+	float cw_g_weight;
+
+	/** @brief The blue component weight scale for error weighting (-cw). */
+	float cw_b_weight;
+
+	/** @brief The alpha component weight scale for error weighting (-cw). */
+	float cw_a_weight;
+
+	/**
+	 * @brief The radius for any alpha-weight scaling (-a).
+	 *
+	 * It is recommended that this is set to 1 when using FLG_USE_ALPHA_WEIGHT on a texture that
+	 * will be sampled using linear texture filtering to minimize color bleed out of transparent
+	 * texels that are adjacent to non-transparent texels.
+	 */
+	unsigned int a_scale_radius;
+
+	/** @brief The RGBM scale factor for the shared multiplier (-rgbm). */
+	float rgbm_m_scale;
+
+	/**
+	 * @brief The maximum number of partitions searched (-partitioncountlimit).
+	 *
+	 * Valid values are between 1 and 4.
+	 */
+	unsigned int tune_partition_count_limit;
+
+	/**
+	 * @brief The maximum number of partition indices searched (-2partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_2partition_index_limit;
+
+	/**
+	 * @brief The maximum number of partition indices searched (-3partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_3partition_index_limit;
+
+	/**
+	 * @brief The maximum number of partition indices searched (-4partitionindexlimit).
+	 *
+	 * Valid values are between 1 and 1024.
+	 */
+	unsigned int tune_4partition_index_limit;
+
+	/**
+	 * @brief The maximum centile for block modes searched (-blockmodelimit).
+	 *
+	 * Valid values are between 1 and 100.
+	 */
+	unsigned int tune_block_mode_limit;
+
+	/**
+	 * @brief The maximum iterative refinements applied (-refinementlimit).
+	 *
+	 * Valid values are between 1 and N; there is no technical upper limit
+	 * but little benefit is expected after N=4.
+	 */
+	unsigned int tune_refinement_limit;
+
+	/**
+	 * @brief The number of trial candidates per mode search (-candidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_TRIAL_CANDIDATES (default 4).
+	 */
+	unsigned int tune_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-2partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_2partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-3partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_3partitioning_candidate_limit;
+
+	/**
+	 * @brief The number of trial partitionings per search (-4partitioncandidatelimit).
+	 *
+	 * Valid values are between 1 and TUNE_MAX_PARTITIONING_CANDIDATES.
+	 */
+	unsigned int tune_4partitioning_candidate_limit;
+
+	/**
+	 * @brief The dB threshold for stopping block search (-dblimit).
+	 *
+	 * This option is ineffective for HDR textures.
+	 */
+	float tune_db_limit;
+
+	/**
+	 * @brief The amount of MSE overshoot needed to early-out trials.
+	 *
+	 * The first early-out is for 1 partition, 1 plane trials, where we try a minimal encode using
+	 * the high probability block modes. This can short-cut compression for simple blocks.
+	 *
+	 * The second early-out is for refinement trials, where we can exit refinement once quality is
+	 * reached.
+	 */
+	float tune_mse_overshoot;
+
+	/**
+	 * @brief The threshold for skipping 3.1/4.1 trials (-2partitionlimitfactor).
+	 *
+	 * This option is further scaled for normal maps, so it skips less often.
+	 */
+	float tune_2_partition_early_out_limit_factor;
+
+	/**
+	 * @brief The threshold for skipping 4.1 trials (-3partitionlimitfactor).
+	 *
+	 * This option is further scaled for normal maps, so it skips less often.
+	 */
+	float tune_3_partition_early_out_limit_factor;
+
+	/**
+	 * @brief The threshold for skipping two weight planes (-2planelimitcorrelation).
+	 *
+	 * This option is ineffective for normal maps.
+	 */
+	float tune_2_plane_early_out_limit_correlation;
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	/**
+	 * @brief The path to save the diagnostic trace data to.
+	 *
+	 * This option is not part of the public API, and requires special builds
+	 * of the library.
+	 */
+	const char* trace_file_path;
+#endif
+};
+
+/**
+ * @brief An uncompressed 2D or 3D image.
+ *
+ * 3D images are passed in as an array of 2D slices. Each slice has identical
+ * size and color format.
+ */
+struct astcenc_image
+{
+	/** @brief The X dimension of the image, in texels. */
+	unsigned int dim_x;
+
+	/** @brief The Y dimension of the image, in texels. */
+	unsigned int dim_y;
+
+	/** @brief The Z dimension of the image, in texels. */
+	unsigned int dim_z;
+
+	/** @brief The data type per component. */
+	astcenc_type data_type;
+
+	/** @brief The array of 2D slices, of length @c dim_z. */
+	void** data;
+};
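+
+// For illustration, a minimal sketch of wrapping one tightly packed 8-bit RGBA slice in an
+// astcenc_image; the pixel buffer and the dim_x/dim_y values are assumed to exist elsewhere:
+//
+//     uint8_t* slice = rgba_pixels;     // dim_x * dim_y * 4 bytes, hypothetical buffer
+//     void* slices[1] = { slice };
+//
+//     astcenc_image image;
+//     image.dim_x = dim_x;
+//     image.dim_y = dim_y;
+//     image.dim_z = 1;
+//     image.data_type = ASTCENC_TYPE_U8;
+//     image.data = slices;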
+
+/**
+ * @brief A block encoding metadata query result.
+ *
+ * If the block is an error block or a constant color block, all fields other than the profile,
+ * block dimensions, and error/constant indicator will be zero.
+ */
+struct astcenc_block_info
+{
+	/** @brief The block encoding color profile. */
+	astcenc_profile profile;
+
+	/** @brief The number of texels in the X dimension. */
+	unsigned int block_x;
+
+	/** @brief The number of texels in the Y dimension. */
+	unsigned int block_y;
+
+	/** @brief The number of texels in the Z dimension. */
+	unsigned int block_z;
+
+	/** @brief The number of texels in the block. */
+	unsigned int texel_count;
+
+	/** @brief True if this block is an error block. */
+	bool is_error_block;
+
+	/** @brief True if this block is a constant color block. */
+	bool is_constant_block;
+
+	/** @brief True if this block is an HDR block. */
+	bool is_hdr_block;
+
+	/** @brief True if this block uses two weight planes. */
+	bool is_dual_plane_block;
+
+	/** @brief The number of partitions if not constant color. */
+	unsigned int partition_count;
+
+	/** @brief The partition index if 2 - 4 partitions used. */
+	unsigned int partition_index;
+
+	/** @brief The component index of the second plane if dual plane. */
+	unsigned int dual_plane_component;
+
+	/** @brief The color endpoint encoding mode for each partition. */
+	unsigned int color_endpoint_modes[4];
+
+	/** @brief The number of color endpoint quantization levels. */
+	unsigned int color_level_count;
+
+	/** @brief The number of weight quantization levels. */
+	unsigned int weight_level_count;
+
+	/** @brief The number of weights in the X dimension. */
+	unsigned int weight_x;
+
+	/** @brief The number of weights in the Y dimension. */
+	unsigned int weight_y;
+
+	/** @brief The number of weights in the Z dimension. */
+	unsigned int weight_z;
+
+	/** @brief The unpacked color endpoints for each partition. */
+	float color_endpoints[4][2][4];
+
+	/** @brief The per-texel interpolation weights for the block. */
+	float weight_values_plane1[216];
+
+	/** @brief The per-texel interpolation weights for the block. */
+	float weight_values_plane2[216];
+
+	/** @brief The per-texel partition assignments for the block. */
+	uint8_t partition_assignment[216];
+};
+
+/**
+ * @brief Populate a codec config based on default settings.
+ *
+ * Power users can edit the returned config struct to fine tune before allocating the context.
+ *
+ * @param      profile  Color profile.
+ * @param      block_x  ASTC block size X dimension.
+ * @param      block_y  ASTC block size Y dimension.
+ * @param      block_z  ASTC block size Z dimension.
+ * @param      quality  Search quality preset / effort level. Either an
+ *                      @c ASTCENC_PRE_* value, or an effort level between 0
+ *                      and 100. Performance is not linear between 0 and 100.
+ * @param      flags    A valid set of @c ASTCENC_FLG_* flag bits.
+ * @param[out] config   Output config struct to populate.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if the inputs are invalid
+ *         either individually, or in combination.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_config_init(
+	astcenc_profile profile,
+	unsigned int block_x,
+	unsigned int block_y,
+	unsigned int block_z,
+	float quality,
+	unsigned int flags,
+	astcenc_config* config);
+
+/**
+ * @brief Allocate a new codec context based on a config.
+ *
+ * This function allocates all of the memory resources and threads needed by the codec. This can be
+ * slow, so it is recommended that contexts are reused to serially compress or decompress multiple
+ * images to amortize setup cost.
+ *
+ * Contexts can be allocated to support only decompression using the @c ASTCENC_FLG_DECOMPRESS_ONLY
+ * flag when creating the configuration. The compression functions will fail if invoked. For a
+ * decompress-only library build the @c ASTCENC_FLG_DECOMPRESS_ONLY flag must be set when creating
+ * any context.
+ *
+ * @param[in]  config        Codec config.
+ * @param      thread_count  Thread count to configure for.
+ * @param[out] context       Location to store an opaque context pointer.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if context creation failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_context_alloc(
+	const astcenc_config* config,
+	unsigned int thread_count,
+	astcenc_context** context);
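+
+// For illustration, sizing the output buffer for astcenc_compress_image() below. Every ASTC
+// block compresses to exactly 16 bytes, so the buffer length is the rounded-up block count in
+// each dimension multiplied together, times 16 (a sketch; `image` and `config` are assumed to
+// be set up as shown earlier):
+//
+//     size_t blocks_x = (image.dim_x + config.block_x - 1) / config.block_x;
+//     size_t blocks_y = (image.dim_y + config.block_y - 1) / config.block_y;
+//     size_t blocks_z = (image.dim_z + config.block_z - 1) / config.block_z;
+//     size_t data_len = blocks_x * blocks_y * blocks_z * 16;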
+
+/**
+ * @brief Compress an image.
+ *
+ * A single context can only compress or decompress a single image at a time.
+ *
+ * For a context configured for multi-threading, any set of the N threads can call this function.
+ * Work will be dynamically scheduled across the threads available. Each thread must have a unique
+ * @c thread_index.
+ *
+ * @param         context       Codec context.
+ * @param[in,out] image         An input image, in 2D slices.
+ * @param         swizzle       Compression data swizzle, applied before compression.
+ * @param[out]    data_out      Pointer to output data array.
+ * @param         data_len      Length of the output data array.
+ * @param         thread_index  Thread index [0..N-1] of calling thread.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if compression failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_image(
+	astcenc_context* context,
+	astcenc_image* image,
+	const astcenc_swizzle* swizzle,
+	uint8_t* data_out,
+	size_t data_len,
+	unsigned int thread_index);
+
+/**
+ * @brief Reset the codec state for a new compression.
+ *
+ * The caller is responsible for synchronizing threads in the worker thread pool. This function must
+ * only be called when all threads have exited the @c astcenc_compress_image() function for image N,
+ * but before any thread enters it for image N + 1.
+ *
+ * Calling this is not required (but won't hurt) if the context is created for single threaded use.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_compress_reset(
+	astcenc_context* context);
+
+/**
+ * @brief Decompress an image.
+ *
+ * @param         context       Codec context.
+ * @param[in]     data          Pointer to compressed data.
+ * @param         data_len      Length of the compressed data, in bytes.
+ * @param[in,out] image_out     Output image.
+ * @param         swizzle       Decompression data swizzle, applied after decompression.
+ * @param         thread_index  Thread index [0..N-1] of calling thread.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if decompression failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_decompress_image(
+	astcenc_context* context,
+	const uint8_t* data,
+	size_t data_len,
+	astcenc_image* image_out,
+	const astcenc_swizzle* swizzle,
+	unsigned int thread_index);
+
+/**
+ * @brief Reset the codec state for a new decompression.
+ *
+ * The caller is responsible for synchronizing threads in the worker thread pool. This function must
+ * only be called when all threads have exited the @c astcenc_decompress_image() function for image
+ * N, but before any thread enters it for image N + 1.
+ *
+ * Calling this is not required (but won't hurt) if the context is created for single threaded use.
+ *
+ * @param context   Codec context.
+ *
+ * @return @c ASTCENC_SUCCESS on success, or an error if reset failed.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_decompress_reset(
+	astcenc_context* context);
+
+/**
+ * @brief Free the compressor context.
+ *
+ * @param context   The codec context.
+ */
+ASTCENC_PUBLIC void astcenc_context_free(
+	astcenc_context* context);
+
+/**
+ * @brief Provide a high level summary of a block's encoding.
+ *
+ * This feature is primarily useful for codec developers but may be useful for developers building
+ * advanced content packaging pipelines.
+ *
+ * @param context   Codec context.
+ * @param data      One block of compressed ASTC data.
+ * @param info      The output info structure to populate.
+ *
+ * @return @c ASTCENC_SUCCESS if the block was decoded, or an error otherwise. Note that this
+ *         function will return success even if the block itself was an error block encoding, as the
+ *         decode was correctly handled.
+ */
+ASTCENC_PUBLIC astcenc_error astcenc_get_block_info(
+	astcenc_context* context,
+	const uint8_t data[16],
+	astcenc_block_info* info);
+
+/**
+ * @brief Get a printable string for a specific status code.
+ *
+ * @param status   The status value.
+ *
+ * @return A human readable nul-terminated string.
+ */
+ASTCENC_PUBLIC const char* astcenc_get_error_string(
+	astcenc_error status);
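+
+// For illustration, a minimal multi-threaded driver matching the threading contract described
+// in the header comment; a sketch only, assuming <thread>, <vector>, and <cstdio>, with the
+// context, image, swizzle, and output buffer set up as in the earlier sketches:
+//
+//     std::vector<std::thread> pool;
+//     for (unsigned int i = 0; i < thread_count; i++)
+//     {
+//         pool.emplace_back([&, i]() {
+//             astcenc_error err = astcenc_compress_image(
+//                 context, &image, &swizzle, data_out, data_len, i);
+//             if (err != ASTCENC_SUCCESS)
+//             {
+//                 printf("Compress failed: %s\n", astcenc_get_error_string(err));
+//             }
+//         });
+//     }
+//
+//     for (auto& t : pool)
+//     {
+//         t.join();
+//     }
+//
+//     astcenc_compress_reset(context);  // before reusing the context for the next image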
+
+#endif
diff --git a/thirdparty/astcenc/astcenc_averages_and_directions.cpp b/thirdparty/astcenc/astcenc_averages_and_directions.cpp
new file mode 100644
index 0000000000..d1f003844a
--- /dev/null
+++ b/thirdparty/astcenc/astcenc_averages_and_directions.cpp
@@ -0,0 +1,995 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for finding dominant direction of a set of colors.
+ */
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Compute the average RGB color of each partition.
+ *
+ * The algorithm here uses a vectorized sequential scan and per-partition
+ * color accumulators, using select() to mask texel lanes in other partitions.
+ *
+ * We only accumulate sums for N-1 partitions during the scan; the value for
+ * the last partition can be computed given that we know the block-wide average
+ * already.
+ *
+ * Because of this we could reduce the loop iteration count so it "just" spans
+ * the max texel index needed for the N-1 partitions, which could need fewer
+ * iterations than the full block texel count. However, this makes the loop
+ * count erratic and causes more branch mispredictions so is a net loss.
+ *
+ * @param      pi        The partitioning to use.
+ * @param      blk       The block data to process.
+ * @param[out] averages  The output averages. Unused partition indices will
+ *                       not be initialized, and lane<3> will be zero.
+ */ +static void compute_partition_averages_rgb( + const partition_info& pi, + const image_block& blk, + vfloat4 averages[BLOCK_MAX_PARTITIONS] +) { + unsigned int partition_count = pi.partition_count; + unsigned int texel_count = blk.texel_count; + promise(texel_count > 0); + + // For 1 partition just use the precomputed mean + if (partition_count == 1) + { + averages[0] = blk.data_mean.swz<0, 1, 2>(); + } + // For 2 partitions scan results for partition 0, compute partition 1 + else if (partition_count == 2) + { + vfloatacc pp_avg_rgb[3] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgb[0], data_r, p0_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgb[1], data_g, p0_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgb[2], data_b, p0_mask); + } + + vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); + + vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]), + hadd_s(pp_avg_rgb[1]), + hadd_s(pp_avg_rgb[2])); + + vfloat4 p1_total = block_total - p0_total; + + averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); + } + // For 3 partitions scan results for partition 0/1, compute partition 2 + else if (partition_count == 3) + { + vfloatacc pp_avg_rgb[2][3] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + vmask p1_mask = lane_mask & (texel_partition == vint(1)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgb[0][0], data_r, p0_mask); + haccumulate(pp_avg_rgb[1][0], data_r, p1_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgb[0][1], data_g, p0_mask); + haccumulate(pp_avg_rgb[1][1], data_g, p1_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgb[0][2], data_b, p0_mask); + haccumulate(pp_avg_rgb[1][2], data_b, p1_mask); + } + + vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); + + vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]), + hadd_s(pp_avg_rgb[0][1]), + hadd_s(pp_avg_rgb[0][2])); + + vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]), + hadd_s(pp_avg_rgb[1][1]), + hadd_s(pp_avg_rgb[1][2])); + + vfloat4 p2_total = block_total - p0_total - p1_total; + + averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); + averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); + } + else + { + // For 4 partitions scan results for partition 0/1/2, compute partition 3 + vfloatacc pp_avg_rgb[3][3] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + vmask p1_mask = 
lane_mask & (texel_partition == vint(1)); + vmask p2_mask = lane_mask & (texel_partition == vint(2)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgb[0][0], data_r, p0_mask); + haccumulate(pp_avg_rgb[1][0], data_r, p1_mask); + haccumulate(pp_avg_rgb[2][0], data_r, p2_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgb[0][1], data_g, p0_mask); + haccumulate(pp_avg_rgb[1][1], data_g, p1_mask); + haccumulate(pp_avg_rgb[2][1], data_g, p2_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgb[0][2], data_b, p0_mask); + haccumulate(pp_avg_rgb[1][2], data_b, p1_mask); + haccumulate(pp_avg_rgb[2][2], data_b, p2_mask); + } + + vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); + + vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]), + hadd_s(pp_avg_rgb[0][1]), + hadd_s(pp_avg_rgb[0][2])); + + vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]), + hadd_s(pp_avg_rgb[1][1]), + hadd_s(pp_avg_rgb[1][2])); + + vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]), + hadd_s(pp_avg_rgb[2][1]), + hadd_s(pp_avg_rgb[2][2])); + + vfloat4 p3_total = block_total - p0_total - p1_total- p2_total; + + averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); + averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); + averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]); + } +} + +/** + * @brief Compute the average RGBA color of each partition. + * + * The algorithm here uses a vectorized sequential scan and per-partition + * color accumulators, using select() to mask texel lanes in other partitions. + * + * We only accumulate sums for N-1 partitions during the scan; the value for + * the last partition can be computed given that we know the block-wide average + * already. + * + * Because of this we could reduce the loop iteration count so it "just" spans + * the max texel index needed for the N-1 partitions, which could need fewer + * iterations than the full block texel count. However, this makes the loop + * count erratic and causes more branch mispredictions so is a net loss. + * + * @param pi The partitioning to use. + * @param blk The block data to process. + * @param[out] averages The output averages. Unused partition indices will + * not be initialized. 
+ */ +static void compute_partition_averages_rgba( + const partition_info& pi, + const image_block& blk, + vfloat4 averages[BLOCK_MAX_PARTITIONS] +) { + unsigned int partition_count = pi.partition_count; + unsigned int texel_count = blk.texel_count; + promise(texel_count > 0); + + // For 1 partition just use the precomputed mean + if (partition_count == 1) + { + averages[0] = blk.data_mean; + } + // For 2 partitions scan results for partition 0, compute partition 1 + else if (partition_count == 2) + { + vfloat4 pp_avg_rgba[4] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgba[0], data_r, p0_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgba[1], data_g, p0_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgba[2], data_b, p0_mask); + + vfloat data_a = loada(blk.data_a + i); + haccumulate(pp_avg_rgba[3], data_a, p0_mask); + } + + vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); + + vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]), + hadd_s(pp_avg_rgba[1]), + hadd_s(pp_avg_rgba[2]), + hadd_s(pp_avg_rgba[3])); + + vfloat4 p1_total = block_total - p0_total; + + averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); + } + // For 3 partitions scan results for partition 0/1, compute partition 2 + else if (partition_count == 3) + { + vfloat4 pp_avg_rgba[2][4] {}; + + vint lane_id = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + vmask p1_mask = lane_mask & (texel_partition == vint(1)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgba[0][0], data_r, p0_mask); + haccumulate(pp_avg_rgba[1][0], data_r, p1_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgba[0][1], data_g, p0_mask); + haccumulate(pp_avg_rgba[1][1], data_g, p1_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgba[0][2], data_b, p0_mask); + haccumulate(pp_avg_rgba[1][2], data_b, p1_mask); + + vfloat data_a = loada(blk.data_a + i); + haccumulate(pp_avg_rgba[0][3], data_a, p0_mask); + haccumulate(pp_avg_rgba[1][3], data_a, p1_mask); + } + + vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); + + vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]), + hadd_s(pp_avg_rgba[0][1]), + hadd_s(pp_avg_rgba[0][2]), + hadd_s(pp_avg_rgba[0][3])); + + vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]), + hadd_s(pp_avg_rgba[1][1]), + hadd_s(pp_avg_rgba[1][2]), + hadd_s(pp_avg_rgba[1][3])); + + vfloat4 p2_total = block_total - p0_total - p1_total; + + averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); + averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); + } + else + { + // For 4 partitions scan results for partition 0/1/2, compute partition 3 + vfloat4 pp_avg_rgba[3][4] {}; + + vint lane_id = vint::lane_id(); + 
for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint texel_partition(pi.partition_of_texel + i); + + vmask lane_mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + + vmask p0_mask = lane_mask & (texel_partition == vint(0)); + vmask p1_mask = lane_mask & (texel_partition == vint(1)); + vmask p2_mask = lane_mask & (texel_partition == vint(2)); + + vfloat data_r = loada(blk.data_r + i); + haccumulate(pp_avg_rgba[0][0], data_r, p0_mask); + haccumulate(pp_avg_rgba[1][0], data_r, p1_mask); + haccumulate(pp_avg_rgba[2][0], data_r, p2_mask); + + vfloat data_g = loada(blk.data_g + i); + haccumulate(pp_avg_rgba[0][1], data_g, p0_mask); + haccumulate(pp_avg_rgba[1][1], data_g, p1_mask); + haccumulate(pp_avg_rgba[2][1], data_g, p2_mask); + + vfloat data_b = loada(blk.data_b + i); + haccumulate(pp_avg_rgba[0][2], data_b, p0_mask); + haccumulate(pp_avg_rgba[1][2], data_b, p1_mask); + haccumulate(pp_avg_rgba[2][2], data_b, p2_mask); + + vfloat data_a = loada(blk.data_a + i); + haccumulate(pp_avg_rgba[0][3], data_a, p0_mask); + haccumulate(pp_avg_rgba[1][3], data_a, p1_mask); + haccumulate(pp_avg_rgba[2][3], data_a, p2_mask); + } + + vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); + + vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]), + hadd_s(pp_avg_rgba[0][1]), + hadd_s(pp_avg_rgba[0][2]), + hadd_s(pp_avg_rgba[0][3])); + + vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]), + hadd_s(pp_avg_rgba[1][1]), + hadd_s(pp_avg_rgba[1][2]), + hadd_s(pp_avg_rgba[1][3])); + + vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]), + hadd_s(pp_avg_rgba[2][1]), + hadd_s(pp_avg_rgba[2][2]), + hadd_s(pp_avg_rgba[2][3])); + + vfloat4 p3_total = block_total - p0_total - p1_total- p2_total; + + averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); + averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); + averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); + averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]); + } +} + +/* See header for documentation. 
*/ +void compute_avgs_and_dirs_4_comp( + const partition_info& pi, + const image_block& blk, + partition_metrics pm[BLOCK_MAX_PARTITIONS] +) { + int partition_count = pi.partition_count; + promise(partition_count > 0); + + // Pre-compute partition_averages + vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; + compute_partition_averages_rgba(pi, blk, partition_averages); + + for (int partition = 0; partition < partition_count; partition++) + { + const uint8_t *texel_indexes = pi.texels_of_partition[partition]; + unsigned int texel_count = pi.partition_texel_count[partition]; + promise(texel_count > 0); + + vfloat4 average = partition_averages[partition]; + pm[partition].avg = average; + + vfloat4 sum_xp = vfloat4::zero(); + vfloat4 sum_yp = vfloat4::zero(); + vfloat4 sum_zp = vfloat4::zero(); + vfloat4 sum_wp = vfloat4::zero(); + + for (unsigned int i = 0; i < texel_count; i++) + { + unsigned int iwt = texel_indexes[i]; + vfloat4 texel_datum = blk.texel(iwt); + texel_datum = texel_datum - average; + + vfloat4 zero = vfloat4::zero(); + + vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; + sum_xp += select(zero, texel_datum, tdm0); + + vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; + sum_yp += select(zero, texel_datum, tdm1); + + vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; + sum_zp += select(zero, texel_datum, tdm2); + + vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero; + sum_wp += select(zero, texel_datum, tdm3); + } + + vfloat4 prod_xp = dot(sum_xp, sum_xp); + vfloat4 prod_yp = dot(sum_yp, sum_yp); + vfloat4 prod_zp = dot(sum_zp, sum_zp); + vfloat4 prod_wp = dot(sum_wp, sum_wp); + + vfloat4 best_vector = sum_xp; + vfloat4 best_sum = prod_xp; + + vmask4 mask = prod_yp > best_sum; + best_vector = select(best_vector, sum_yp, mask); + best_sum = select(best_sum, prod_yp, mask); + + mask = prod_zp > best_sum; + best_vector = select(best_vector, sum_zp, mask); + best_sum = select(best_sum, prod_zp, mask); + + mask = prod_wp > best_sum; + best_vector = select(best_vector, sum_wp, mask); + + pm[partition].dir = best_vector; + } +} + +/* See header for documentation. */ +void compute_avgs_and_dirs_3_comp( + const partition_info& pi, + const image_block& blk, + unsigned int omitted_component, + partition_metrics pm[BLOCK_MAX_PARTITIONS] +) { + // Pre-compute partition_averages + vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; + compute_partition_averages_rgba(pi, blk, partition_averages); + + const float* data_vr = blk.data_r; + const float* data_vg = blk.data_g; + const float* data_vb = blk.data_b; + + // TODO: Data-driven permute would be useful to avoid this ... 
+ if (omitted_component == 0) + { + partition_averages[0] = partition_averages[0].swz<1, 2, 3>(); + partition_averages[1] = partition_averages[1].swz<1, 2, 3>(); + partition_averages[2] = partition_averages[2].swz<1, 2, 3>(); + partition_averages[3] = partition_averages[3].swz<1, 2, 3>(); + + data_vr = blk.data_g; + data_vg = blk.data_b; + data_vb = blk.data_a; + } + else if (omitted_component == 1) + { + partition_averages[0] = partition_averages[0].swz<0, 2, 3>(); + partition_averages[1] = partition_averages[1].swz<0, 2, 3>(); + partition_averages[2] = partition_averages[2].swz<0, 2, 3>(); + partition_averages[3] = partition_averages[3].swz<0, 2, 3>(); + + data_vg = blk.data_b; + data_vb = blk.data_a; + } + else if (omitted_component == 2) + { + partition_averages[0] = partition_averages[0].swz<0, 1, 3>(); + partition_averages[1] = partition_averages[1].swz<0, 1, 3>(); + partition_averages[2] = partition_averages[2].swz<0, 1, 3>(); + partition_averages[3] = partition_averages[3].swz<0, 1, 3>(); + + data_vb = blk.data_a; + } + else + { + partition_averages[0] = partition_averages[0].swz<0, 1, 2>(); + partition_averages[1] = partition_averages[1].swz<0, 1, 2>(); + partition_averages[2] = partition_averages[2].swz<0, 1, 2>(); + partition_averages[3] = partition_averages[3].swz<0, 1, 2>(); + } + + unsigned int partition_count = pi.partition_count; + promise(partition_count > 0); + + for (unsigned int partition = 0; partition < partition_count; partition++) + { + const uint8_t *texel_indexes = pi.texels_of_partition[partition]; + unsigned int texel_count = pi.partition_texel_count[partition]; + promise(texel_count > 0); + + vfloat4 average = partition_averages[partition]; + pm[partition].avg = average; + + vfloat4 sum_xp = vfloat4::zero(); + vfloat4 sum_yp = vfloat4::zero(); + vfloat4 sum_zp = vfloat4::zero(); + + for (unsigned int i = 0; i < texel_count; i++) + { + unsigned int iwt = texel_indexes[i]; + + vfloat4 texel_datum = vfloat3(data_vr[iwt], + data_vg[iwt], + data_vb[iwt]); + texel_datum = texel_datum - average; + + vfloat4 zero = vfloat4::zero(); + + vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; + sum_xp += select(zero, texel_datum, tdm0); + + vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; + sum_yp += select(zero, texel_datum, tdm1); + + vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; + sum_zp += select(zero, texel_datum, tdm2); + } + + vfloat4 prod_xp = dot(sum_xp, sum_xp); + vfloat4 prod_yp = dot(sum_yp, sum_yp); + vfloat4 prod_zp = dot(sum_zp, sum_zp); + + vfloat4 best_vector = sum_xp; + vfloat4 best_sum = prod_xp; + + vmask4 mask = prod_yp > best_sum; + best_vector = select(best_vector, sum_yp, mask); + best_sum = select(best_sum, prod_yp, mask); + + mask = prod_zp > best_sum; + best_vector = select(best_vector, sum_zp, mask); + + pm[partition].dir = best_vector; + } +} + +/* See header for documentation. 
*/ +void compute_avgs_and_dirs_3_comp_rgb( + const partition_info& pi, + const image_block& blk, + partition_metrics pm[BLOCK_MAX_PARTITIONS] +) { + unsigned int partition_count = pi.partition_count; + promise(partition_count > 0); + + // Pre-compute partition_averages + vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; + compute_partition_averages_rgb(pi, blk, partition_averages); + + for (unsigned int partition = 0; partition < partition_count; partition++) + { + const uint8_t *texel_indexes = pi.texels_of_partition[partition]; + unsigned int texel_count = pi.partition_texel_count[partition]; + promise(texel_count > 0); + + vfloat4 average = partition_averages[partition]; + pm[partition].avg = average; + + vfloat4 sum_xp = vfloat4::zero(); + vfloat4 sum_yp = vfloat4::zero(); + vfloat4 sum_zp = vfloat4::zero(); + + for (unsigned int i = 0; i < texel_count; i++) + { + unsigned int iwt = texel_indexes[i]; + + vfloat4 texel_datum = blk.texel3(iwt); + texel_datum = texel_datum - average; + + vfloat4 zero = vfloat4::zero(); + + vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; + sum_xp += select(zero, texel_datum, tdm0); + + vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; + sum_yp += select(zero, texel_datum, tdm1); + + vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; + sum_zp += select(zero, texel_datum, tdm2); + } + + vfloat4 prod_xp = dot(sum_xp, sum_xp); + vfloat4 prod_yp = dot(sum_yp, sum_yp); + vfloat4 prod_zp = dot(sum_zp, sum_zp); + + vfloat4 best_vector = sum_xp; + vfloat4 best_sum = prod_xp; + + vmask4 mask = prod_yp > best_sum; + best_vector = select(best_vector, sum_yp, mask); + best_sum = select(best_sum, prod_yp, mask); + + mask = prod_zp > best_sum; + best_vector = select(best_vector, sum_zp, mask); + + pm[partition].dir = best_vector; + } +} + +/* See header for documentation. 
*/ +void compute_avgs_and_dirs_2_comp( + const partition_info& pt, + const image_block& blk, + unsigned int component1, + unsigned int component2, + partition_metrics pm[BLOCK_MAX_PARTITIONS] +) { + vfloat4 average; + + const float* data_vr = nullptr; + const float* data_vg = nullptr; + + if (component1 == 0 && component2 == 1) + { + average = blk.data_mean.swz<0, 1>(); + + data_vr = blk.data_r; + data_vg = blk.data_g; + } + else if (component1 == 0 && component2 == 2) + { + average = blk.data_mean.swz<0, 2>(); + + data_vr = blk.data_r; + data_vg = blk.data_b; + } + else // (component1 == 1 && component2 == 2) + { + assert(component1 == 1 && component2 == 2); + + average = blk.data_mean.swz<1, 2>(); + + data_vr = blk.data_g; + data_vg = blk.data_b; + } + + unsigned int partition_count = pt.partition_count; + promise(partition_count > 0); + + for (unsigned int partition = 0; partition < partition_count; partition++) + { + const uint8_t *texel_indexes = pt.texels_of_partition[partition]; + unsigned int texel_count = pt.partition_texel_count[partition]; + promise(texel_count > 0); + + // Only compute a partition mean if more than one partition + if (partition_count > 1) + { + average = vfloat4::zero(); + for (unsigned int i = 0; i < texel_count; i++) + { + unsigned int iwt = texel_indexes[i]; + average += vfloat2(data_vr[iwt], data_vg[iwt]); + } + + average = average / static_cast<float>(texel_count); + } + + pm[partition].avg = average; + + vfloat4 sum_xp = vfloat4::zero(); + vfloat4 sum_yp = vfloat4::zero(); + + for (unsigned int i = 0; i < texel_count; i++) + { + unsigned int iwt = texel_indexes[i]; + vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]); + texel_datum = texel_datum - average; + + vfloat4 zero = vfloat4::zero(); + + vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; + sum_xp += select(zero, texel_datum, tdm0); + + vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; + sum_yp += select(zero, texel_datum, tdm1); + } + + vfloat4 prod_xp = dot(sum_xp, sum_xp); + vfloat4 prod_yp = dot(sum_yp, sum_yp); + + vfloat4 best_vector = sum_xp; + vfloat4 best_sum = prod_xp; + + vmask4 mask = prod_yp > best_sum; + best_vector = select(best_vector, sum_yp, mask); + + pm[partition].dir = best_vector; + } +} + +/* See header for documentation. 
*/ +void compute_error_squared_rgba( + const partition_info& pi, + const image_block& blk, + const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS], + const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS], + float uncor_lengths[BLOCK_MAX_PARTITIONS], + float samec_lengths[BLOCK_MAX_PARTITIONS], + float& uncor_error, + float& samec_error +) { + unsigned int partition_count = pi.partition_count; + promise(partition_count > 0); + + vfloatacc uncor_errorsumv = vfloatacc::zero(); + vfloatacc samec_errorsumv = vfloatacc::zero(); + + for (unsigned int partition = 0; partition < partition_count; partition++) + { + const uint8_t *texel_indexes = pi.texels_of_partition[partition]; + + float uncor_loparam = 1e10f; + float uncor_hiparam = -1e10f; + + float samec_loparam = 1e10f; + float samec_hiparam = -1e10f; + + processed_line4 l_uncor = uncor_plines[partition]; + processed_line4 l_samec = samec_plines[partition]; + + unsigned int texel_count = pi.partition_texel_count[partition]; + promise(texel_count > 0); + + // Vectorize some useful scalar inputs + vfloat l_uncor_bs0(l_uncor.bs.lane<0>()); + vfloat l_uncor_bs1(l_uncor.bs.lane<1>()); + vfloat l_uncor_bs2(l_uncor.bs.lane<2>()); + vfloat l_uncor_bs3(l_uncor.bs.lane<3>()); + + vfloat l_uncor_amod0(l_uncor.amod.lane<0>()); + vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); + vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); + vfloat l_uncor_amod3(l_uncor.amod.lane<3>()); + + vfloat l_samec_bs0(l_samec.bs.lane<0>()); + vfloat l_samec_bs1(l_samec.bs.lane<1>()); + vfloat l_samec_bs2(l_samec.bs.lane<2>()); + vfloat l_samec_bs3(l_samec.bs.lane<3>()); + + assert(all(l_samec.amod == vfloat4(0.0f))); + + vfloat uncor_loparamv(1e10f); + vfloat uncor_hiparamv(-1e10f); + + vfloat samec_loparamv(1e10f); + vfloat samec_hiparamv(-1e10f); + + vfloat ew_r(blk.channel_weight.lane<0>()); + vfloat ew_g(blk.channel_weight.lane<1>()); + vfloat ew_b(blk.channel_weight.lane<2>()); + vfloat ew_a(blk.channel_weight.lane<3>()); + + // This implementation over-shoots, but this is safe as we initialize the texel_indexes + // array to extend the last value. This means min/max are not impacted, but we need to mask + // out the dummy values when we compute the line weighting. 
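+ // Illustrative scalar equivalent of one lane of the loop below; a sketch for + // readers, not part of the upstream sources (bracket indexing is shorthand for + // the lane accessors): + // + // float param = dot(texel, line.bs); // project texel onto line + // float dist_c = (line.amod[c] - texel[c]) + param * line.bs[c]; // distance, channel c + // err += blk.channel_weight[c] * dist_c * dist_c; // weighted squared error + // + // Lanes past texel_count still execute, but the mask keeps them out of the error sum.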
+ vint lane_ids = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vmask mask = lane_ids < vint(texel_count); + vint texel_idxs(texel_indexes + i); + + vfloat data_r = gatherf(blk.data_r, texel_idxs); + vfloat data_g = gatherf(blk.data_g, texel_idxs); + vfloat data_b = gatherf(blk.data_b, texel_idxs); + vfloat data_a = gatherf(blk.data_a, texel_idxs); + + vfloat uncor_param = (data_r * l_uncor_bs0) + + (data_g * l_uncor_bs1) + + (data_b * l_uncor_bs2) + + (data_a * l_uncor_bs3); + + uncor_loparamv = min(uncor_param, uncor_loparamv); + uncor_hiparamv = max(uncor_param, uncor_hiparamv); + + vfloat uncor_dist0 = (l_uncor_amod0 - data_r) + + (uncor_param * l_uncor_bs0); + vfloat uncor_dist1 = (l_uncor_amod1 - data_g) + + (uncor_param * l_uncor_bs1); + vfloat uncor_dist2 = (l_uncor_amod2 - data_b) + + (uncor_param * l_uncor_bs2); + vfloat uncor_dist3 = (l_uncor_amod3 - data_a) + + (uncor_param * l_uncor_bs3); + + vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) + + (ew_g * uncor_dist1 * uncor_dist1) + + (ew_b * uncor_dist2 * uncor_dist2) + + (ew_a * uncor_dist3 * uncor_dist3); + + haccumulate(uncor_errorsumv, uncor_err, mask); + + // Process samechroma data + vfloat samec_param = (data_r * l_samec_bs0) + + (data_g * l_samec_bs1) + + (data_b * l_samec_bs2) + + (data_a * l_samec_bs3); + + samec_loparamv = min(samec_param, samec_loparamv); + samec_hiparamv = max(samec_param, samec_hiparamv); + + vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; + vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; + vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; + vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a; + + vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) + + (ew_g * samec_dist1 * samec_dist1) + + (ew_b * samec_dist2 * samec_dist2) + + (ew_a * samec_dist3 * samec_dist3); + + haccumulate(samec_errorsumv, samec_err, mask); + + lane_ids += vint(ASTCENC_SIMD_WIDTH); + } + + uncor_loparam = hmin_s(uncor_loparamv); + uncor_hiparam = hmax_s(uncor_hiparamv); + + samec_loparam = hmin_s(samec_loparamv); + samec_hiparam = hmax_s(samec_hiparamv); + + float uncor_linelen = uncor_hiparam - uncor_loparam; + float samec_linelen = samec_hiparam - samec_loparam; + + // Turn very small numbers and NaNs into a small number + uncor_lengths[partition] = astc::max(uncor_linelen, 1e-7f); + samec_lengths[partition] = astc::max(samec_linelen, 1e-7f); + } + + uncor_error = hadd_s(uncor_errorsumv); + samec_error = hadd_s(samec_errorsumv); +} + +/* See header for documentation. */ +void compute_error_squared_rgb( + const partition_info& pi, + const image_block& blk, + partition_lines3 plines[BLOCK_MAX_PARTITIONS], + float& uncor_error, + float& samec_error +) { + unsigned int partition_count = pi.partition_count; + promise(partition_count > 0); + + vfloatacc uncor_errorsumv = vfloatacc::zero(); + vfloatacc samec_errorsumv = vfloatacc::zero(); + + for (unsigned int partition = 0; partition < partition_count; partition++) + { + partition_lines3& pl = plines[partition]; + const uint8_t *texel_indexes = pi.texels_of_partition[partition]; + unsigned int texel_count = pi.partition_texel_count[partition]; + promise(texel_count > 0); + + float uncor_loparam = 1e10f; + float uncor_hiparam = -1e10f; + + float samec_loparam = 1e10f; + float samec_hiparam = -1e10f; + + processed_line3 l_uncor = pl.uncor_pline; + processed_line3 l_samec = pl.samec_pline; + + // This implementation is an example vectorization of this function. 
+ // It works - the codec is 2-4% faster than not vectorizing - but + // the benefit is limited by the use of gathers and register pressure + + // Vectorize some useful scalar inputs + vfloat l_uncor_bs0(l_uncor.bs.lane<0>()); + vfloat l_uncor_bs1(l_uncor.bs.lane<1>()); + vfloat l_uncor_bs2(l_uncor.bs.lane<2>()); + + vfloat l_uncor_amod0(l_uncor.amod.lane<0>()); + vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); + vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); + + vfloat l_samec_bs0(l_samec.bs.lane<0>()); + vfloat l_samec_bs1(l_samec.bs.lane<1>()); + vfloat l_samec_bs2(l_samec.bs.lane<2>()); + + assert(all(l_samec.amod == vfloat4(0.0f))); + + vfloat uncor_loparamv(1e10f); + vfloat uncor_hiparamv(-1e10f); + + vfloat samec_loparamv(1e10f); + vfloat samec_hiparamv(-1e10f); + + vfloat ew_r(blk.channel_weight.lane<0>()); + vfloat ew_g(blk.channel_weight.lane<1>()); + vfloat ew_b(blk.channel_weight.lane<2>()); + + // This implementation over-shoots, but this is safe as we initialize the weights array + // to extend the last value. This means min/max are not impacted, but we need to mask + // out the dummy values when we compute the line weighting. + vint lane_ids = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vmask mask = lane_ids < vint(texel_count); + vint texel_idxs(texel_indexes + i); + + vfloat data_r = gatherf(blk.data_r, texel_idxs); + vfloat data_g = gatherf(blk.data_g, texel_idxs); + vfloat data_b = gatherf(blk.data_b, texel_idxs); + + vfloat uncor_param = (data_r * l_uncor_bs0) + + (data_g * l_uncor_bs1) + + (data_b * l_uncor_bs2); + + uncor_loparamv = min(uncor_param, uncor_loparamv); + uncor_hiparamv = max(uncor_param, uncor_hiparamv); + + vfloat uncor_dist0 = (l_uncor_amod0 - data_r) + + (uncor_param * l_uncor_bs0); + vfloat uncor_dist1 = (l_uncor_amod1 - data_g) + + (uncor_param * l_uncor_bs1); + vfloat uncor_dist2 = (l_uncor_amod2 - data_b) + + (uncor_param * l_uncor_bs2); + + vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) + + (ew_g * uncor_dist1 * uncor_dist1) + + (ew_b * uncor_dist2 * uncor_dist2); + + haccumulate(uncor_errorsumv, uncor_err, mask); + + // Process samechroma data + vfloat samec_param = (data_r * l_samec_bs0) + + (data_g * l_samec_bs1) + + (data_b * l_samec_bs2); + + samec_loparamv = min(samec_param, samec_loparamv); + samec_hiparamv = max(samec_param, samec_hiparamv); + + vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; + vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; + vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; + + vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) + + (ew_g * samec_dist1 * samec_dist1) + + (ew_b * samec_dist2 * samec_dist2); + + haccumulate(samec_errorsumv, samec_err, mask); + + lane_ids += vint(ASTCENC_SIMD_WIDTH); + } + + uncor_loparam = hmin_s(uncor_loparamv); + uncor_hiparam = hmax_s(uncor_hiparamv); + + samec_loparam = hmin_s(samec_loparamv); + samec_hiparam = hmax_s(samec_hiparamv); + + float uncor_linelen = uncor_hiparam - uncor_loparam; + float samec_linelen = samec_hiparam - samec_loparam; + + // Turn very small numbers and NaNs into a small number + pl.uncor_line_len = astc::max(uncor_linelen, 1e-7f); + pl.samec_line_len = astc::max(samec_linelen, 1e-7f); + } + + uncor_error = hadd_s(uncor_errorsumv); + samec_error = hadd_s(samec_errorsumv); +} + +#endif diff --git a/thirdparty/astcenc/astcenc_block_sizes.cpp b/thirdparty/astcenc/astcenc_block_sizes.cpp new file mode 100644 index 0000000000..1c22d06a5c --- /dev/null +++ 
b/thirdparty/astcenc/astcenc_block_sizes.cpp @@ -0,0 +1,1184 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions to generate block size descriptor and decimation tables. + */ + +#include "astcenc_internal.h" + +/** + * @brief Decode the properties of an encoded 2D block mode. + * + * @param block_mode The encoded block mode. + * @param[out] x_weights The number of weights in the X dimension. + * @param[out] y_weights The number of weights in the Y dimension. + * @param[out] is_dual_plane True if this block mode has two weight planes. + * @param[out] quant_mode The quantization level for the weights. + * @param[out] weight_bits The storage bit count for the weights. + * + * @return Returns true if a valid mode, false otherwise. + */ +static bool decode_block_mode_2d( + unsigned int block_mode, + unsigned int& x_weights, + unsigned int& y_weights, + bool& is_dual_plane, + unsigned int& quant_mode, + unsigned int& weight_bits +) { + unsigned int base_quant_mode = (block_mode >> 4) & 1; + unsigned int H = (block_mode >> 9) & 1; + unsigned int D = (block_mode >> 10) & 1; + unsigned int A = (block_mode >> 5) & 0x3; + + x_weights = 0; + y_weights = 0; + + if ((block_mode & 3) != 0) + { + base_quant_mode |= (block_mode & 3) << 1; + unsigned int B = (block_mode >> 7) & 3; + switch ((block_mode >> 2) & 3) + { + case 0: + x_weights = B + 4; + y_weights = A + 2; + break; + case 1: + x_weights = B + 8; + y_weights = A + 2; + break; + case 2: + x_weights = A + 2; + y_weights = B + 8; + break; + case 3: + B &= 1; + if (block_mode & 0x100) + { + x_weights = B + 2; + y_weights = A + 2; + } + else + { + x_weights = A + 2; + y_weights = B + 6; + } + break; + } + } + else + { + base_quant_mode |= ((block_mode >> 2) & 3) << 1; + if (((block_mode >> 2) & 3) == 0) + { + return false; + } + + unsigned int B = (block_mode >> 9) & 3; + switch ((block_mode >> 7) & 3) + { + case 0: + x_weights = 12; + y_weights = A + 2; + break; + case 1: + x_weights = A + 2; + y_weights = 12; + break; + case 2: + x_weights = A + 6; + y_weights = B + 6; + D = 0; + H = 0; + break; + case 3: + switch ((block_mode >> 5) & 3) + { + case 0: + x_weights = 6; + y_weights = 10; + break; + case 1: + x_weights = 10; + y_weights = 6; + break; + case 2: + case 3: + return false; + } + break; + } + } + + unsigned int weight_count = x_weights * y_weights * (D + 1); + quant_mode = (base_quant_mode - 2) + 6 * H; + is_dual_plane = D != 0; + + weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode)); + return (weight_count <= BLOCK_MAX_WEIGHTS && + weight_bits >= BLOCK_MIN_WEIGHT_BITS && + weight_bits <= BLOCK_MAX_WEIGHT_BITS); +} + +/** + * @brief Decode the properties of an encoded 3D block mode. + * + * @param block_mode The encoded block mode. 
+ * @param[out] x_weights The number of weights in the X dimension. + * @param[out] y_weights The number of weights in the Y dimension. + * @param[out] z_weights The number of weights in the Z dimension. + * @param[out] is_dual_plane True if this block mode has two weight planes. + * @param[out] quant_mode The quantization level for the weights. + * @param[out] weight_bits The storage bit count for the weights. + * + * @return Returns true if a valid mode, false otherwise. + */ +static bool decode_block_mode_3d( + unsigned int block_mode, + unsigned int& x_weights, + unsigned int& y_weights, + unsigned int& z_weights, + bool& is_dual_plane, + unsigned int& quant_mode, + unsigned int& weight_bits +) { + unsigned int base_quant_mode = (block_mode >> 4) & 1; + unsigned int H = (block_mode >> 9) & 1; + unsigned int D = (block_mode >> 10) & 1; + unsigned int A = (block_mode >> 5) & 0x3; + + x_weights = 0; + y_weights = 0; + z_weights = 0; + + if ((block_mode & 3) != 0) + { + base_quant_mode |= (block_mode & 3) << 1; + unsigned int B = (block_mode >> 7) & 3; + unsigned int C = (block_mode >> 2) & 0x3; + x_weights = A + 2; + y_weights = B + 2; + z_weights = C + 2; + } + else + { + base_quant_mode |= ((block_mode >> 2) & 3) << 1; + if (((block_mode >> 2) & 3) == 0) + { + return false; + } + + int B = (block_mode >> 9) & 3; + if (((block_mode >> 7) & 3) != 3) + { + D = 0; + H = 0; + } + switch ((block_mode >> 7) & 3) + { + case 0: + x_weights = 6; + y_weights = B + 2; + z_weights = A + 2; + break; + case 1: + x_weights = A + 2; + y_weights = 6; + z_weights = B + 2; + break; + case 2: + x_weights = A + 2; + y_weights = B + 2; + z_weights = 6; + break; + case 3: + x_weights = 2; + y_weights = 2; + z_weights = 2; + switch ((block_mode >> 5) & 3) + { + case 0: + x_weights = 6; + break; + case 1: + y_weights = 6; + break; + case 2: + z_weights = 6; + break; + case 3: + return false; + } + break; + } + } + + unsigned int weight_count = x_weights * y_weights * z_weights * (D + 1); + quant_mode = (base_quant_mode - 2) + 6 * H; + is_dual_plane = D != 0; + + weight_bits = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(quant_mode)); + return (weight_count <= BLOCK_MAX_WEIGHTS && + weight_bits >= BLOCK_MIN_WEIGHT_BITS && + weight_bits <= BLOCK_MAX_WEIGHT_BITS); +} + +/** + * @brief Create a 2D decimation entry for a block-size and weight-decimation pair. + * + * @param x_texels The number of texels in the X dimension. + * @param y_texels The number of texels in the Y dimension. + * @param x_weights The number of weights in the X dimension. + * @param y_weights The number of weights in the Y dimension. + * @param[out] di The decimation info structure to populate. + * @param[out] wb The decimation table init scratch working buffers. 
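+ * + * Worked example of the fixed point mapping used below: for an 8-texel axis with + * 4 weight columns, texel x=5 gives x_weight = (((1024 + 4) / 7) * 5 * 3 + 32) >> 6 = 34, + * i.e. integer part 2 with fraction 2/16, close to the ideal weight position 5/7 * 3 = 2.14.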
+ */ +static void init_decimation_info_2d( + unsigned int x_texels, + unsigned int y_texels, + unsigned int x_weights, + unsigned int y_weights, + decimation_info& di, + dt_init_working_buffers& wb +) { + unsigned int texels_per_block = x_texels * y_texels; + unsigned int weights_per_block = x_weights * y_weights; + + uint8_t max_texel_count_of_weight = 0; + + promise(weights_per_block > 0); + promise(texels_per_block > 0); + promise(x_texels > 0); + promise(y_texels > 0); + + for (unsigned int i = 0; i < weights_per_block; i++) + { + wb.texel_count_of_weight[i] = 0; + } + + for (unsigned int i = 0; i < texels_per_block; i++) + { + wb.weight_count_of_texel[i] = 0; + } + + for (unsigned int y = 0; y < y_texels; y++) + { + for (unsigned int x = 0; x < x_texels; x++) + { + unsigned int texel = y * x_texels + x; + + unsigned int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6; + unsigned int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6; + + unsigned int x_weight_frac = x_weight & 0xF; + unsigned int y_weight_frac = y_weight & 0xF; + unsigned int x_weight_int = x_weight >> 4; + unsigned int y_weight_int = y_weight >> 4; + + unsigned int qweight[4]; + qweight[0] = x_weight_int + y_weight_int * x_weights; + qweight[1] = qweight[0] + 1; + qweight[2] = qweight[0] + x_weights; + qweight[3] = qweight[2] + 1; + + // Truncated-precision bilinear interpolation + unsigned int prod = x_weight_frac * y_weight_frac; + + unsigned int weight[4]; + weight[3] = (prod + 8) >> 4; + weight[1] = x_weight_frac - weight[3]; + weight[2] = y_weight_frac - weight[3]; + weight[0] = 16 - x_weight_frac - y_weight_frac + weight[3]; + + for (unsigned int i = 0; i < 4; i++) + { + if (weight[i] != 0) + { + wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]); + wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]); + wb.weight_count_of_texel[texel]++; + wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel); + wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]); + wb.texel_count_of_weight[qweight[i]]++; + max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]); + } + } + } + } + + uint8_t max_texel_weight_count = 0; + for (unsigned int i = 0; i < texels_per_block; i++) + { + di.texel_weight_count[i] = wb.weight_count_of_texel[i]; + max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]); + + for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++) + { + di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j]; + di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); + di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j]; + } + + // Init all 4 entries so we can rely on zeros for vectorization + for (unsigned int j = wb.weight_count_of_texel[i]; j < 4; j++) + { + di.texel_weight_contribs_int_tr[j][i] = 0; + di.texel_weight_contribs_float_tr[j][i] = 0.0f; + di.texel_weights_tr[j][i] = 0; + } + } + + di.max_texel_weight_count = max_texel_weight_count; + + for (unsigned int i = 0; i < weights_per_block; i++) + { + unsigned int texel_count_wt = wb.texel_count_of_weight[i]; + di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt); + + for (unsigned int j = 0; j < texel_count_wt; j++) + { + uint8_t texel 
= wb.texels_of_weight[i][j]; + + // Create transposed versions of these for better vectorization + di.weight_texels_tr[j][i] = texel; + di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]); + + // Store the per-texel contribution of this weight for each texel it contributes to + di.texel_contrib_for_weight[j][i] = 0.0f; + for (unsigned int k = 0; k < 4; k++) + { + uint8_t dttw = di.texel_weights_tr[k][texel]; + float dttwf = di.texel_weight_contribs_float_tr[k][texel]; + if (dttw == i && dttwf != 0.0f) + { + di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel]; + break; + } + } + } + + // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails + // Match last texel in active lane in SIMD group, for better gathers + uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i]; + for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++) + { + di.weight_texels_tr[j][i] = last_texel; + di.weights_texel_contribs_tr[j][i] = 0.0f; + } + } + + // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails + unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block); + for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++) + { + di.texel_weight_count[i] = 0; + + for (unsigned int j = 0; j < 4; j++) + { + di.texel_weight_contribs_float_tr[j][i] = 0; + di.texel_weights_tr[j][i] = 0; + di.texel_weight_contribs_int_tr[j][i] = 0; + } + } + + // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails + // Match last texel in active lane in SIMD group, for better gathers + unsigned int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1]; + uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1]; + + unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block); + for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++) + { + di.weight_texel_count[i] = 0; + + for (unsigned int j = 0; j < max_texel_count_of_weight; j++) + { + di.weight_texels_tr[j][i] = last_texel; + di.weights_texel_contribs_tr[j][i] = 0.0f; + } + } + + di.texel_count = static_cast<uint8_t>(texels_per_block); + di.weight_count = static_cast<uint8_t>(weights_per_block); + di.weight_x = static_cast<uint8_t>(x_weights); + di.weight_y = static_cast<uint8_t>(y_weights); + di.weight_z = 1; +} + +/** + * @brief Create a 3D decimation entry for a block-size and weight-decimation pair. + * + * @param x_texels The number of texels in the X dimension. + * @param y_texels The number of texels in the Y dimension. + * @param z_texels The number of texels in the Z dimension. + * @param x_weights The number of weights in the X dimension. + * @param y_weights The number of weights in the Y dimension. + * @param z_weights The number of weights in the Z dimension. + * @param[out] di The decimation info structure to populate. + * @param[out] wb The decimation table init scratch working buffers.
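+ * + * Unlike the bilinear taps of the 2D path, the 3D mapping below uses simplex + * (tetrahedral) interpolation: the ordering of the three fractional coordinates + * selects one of six tetrahedra, and the four taps step from the base grid weight + * along the corresponding axis strides.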
+ */ +static void init_decimation_info_3d( + unsigned int x_texels, + unsigned int y_texels, + unsigned int z_texels, + unsigned int x_weights, + unsigned int y_weights, + unsigned int z_weights, + decimation_info& di, + dt_init_working_buffers& wb +) { + unsigned int texels_per_block = x_texels * y_texels * z_texels; + unsigned int weights_per_block = x_weights * y_weights * z_weights; + + uint8_t max_texel_count_of_weight = 0; + + promise(weights_per_block > 0); + promise(texels_per_block > 0); + + for (unsigned int i = 0; i < weights_per_block; i++) + { + wb.texel_count_of_weight[i] = 0; + } + + for (unsigned int i = 0; i < texels_per_block; i++) + { + wb.weight_count_of_texel[i] = 0; + } + + for (unsigned int z = 0; z < z_texels; z++) + { + for (unsigned int y = 0; y < y_texels; y++) + { + for (unsigned int x = 0; x < x_texels; x++) + { + int texel = (z * y_texels + y) * x_texels + x; + + int x_weight = (((1024 + x_texels / 2) / (x_texels - 1)) * x * (x_weights - 1) + 32) >> 6; + int y_weight = (((1024 + y_texels / 2) / (y_texels - 1)) * y * (y_weights - 1) + 32) >> 6; + int z_weight = (((1024 + z_texels / 2) / (z_texels - 1)) * z * (z_weights - 1) + 32) >> 6; + + int x_weight_frac = x_weight & 0xF; + int y_weight_frac = y_weight & 0xF; + int z_weight_frac = z_weight & 0xF; + int x_weight_int = x_weight >> 4; + int y_weight_int = y_weight >> 4; + int z_weight_int = z_weight >> 4; + int qweight[4]; + int weight[4]; + qweight[0] = (z_weight_int * y_weights + y_weight_int) * x_weights + x_weight_int; + qweight[3] = ((z_weight_int + 1) * y_weights + (y_weight_int + 1)) * x_weights + (x_weight_int + 1); + + // simplex interpolation + int fs = x_weight_frac; + int ft = y_weight_frac; + int fp = z_weight_frac; + + int cas = ((fs > ft) << 2) + ((ft > fp) << 1) + ((fs > fp)); + int N = x_weights; + int NM = x_weights * y_weights; + + int s1, s2, w0, w1, w2, w3; + switch (cas) + { + case 7: + s1 = 1; + s2 = N; + w0 = 16 - fs; + w1 = fs - ft; + w2 = ft - fp; + w3 = fp; + break; + case 3: + s1 = N; + s2 = 1; + w0 = 16 - ft; + w1 = ft - fs; + w2 = fs - fp; + w3 = fp; + break; + case 5: + s1 = 1; + s2 = NM; + w0 = 16 - fs; + w1 = fs - fp; + w2 = fp - ft; + w3 = ft; + break; + case 4: + s1 = NM; + s2 = 1; + w0 = 16 - fp; + w1 = fp - fs; + w2 = fs - ft; + w3 = ft; + break; + case 2: + s1 = N; + s2 = NM; + w0 = 16 - ft; + w1 = ft - fp; + w2 = fp - fs; + w3 = fs; + break; + case 0: + s1 = NM; + s2 = N; + w0 = 16 - fp; + w1 = fp - ft; + w2 = ft - fs; + w3 = fs; + break; + default: + s1 = NM; + s2 = N; + w0 = 16 - fp; + w1 = fp - ft; + w2 = ft - fs; + w3 = fs; + break; + } + + qweight[1] = qweight[0] + s1; + qweight[2] = qweight[1] + s2; + weight[0] = w0; + weight[1] = w1; + weight[2] = w2; + weight[3] = w3; + + for (unsigned int i = 0; i < 4; i++) + { + if (weight[i] != 0) + { + wb.grid_weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(qweight[i]); + wb.weights_of_texel[texel][wb.weight_count_of_texel[texel]] = static_cast<uint8_t>(weight[i]); + wb.weight_count_of_texel[texel]++; + wb.texels_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(texel); + wb.texel_weights_of_weight[qweight[i]][wb.texel_count_of_weight[qweight[i]]] = static_cast<uint8_t>(weight[i]); + wb.texel_count_of_weight[qweight[i]]++; + max_texel_count_of_weight = astc::max(max_texel_count_of_weight, wb.texel_count_of_weight[qweight[i]]); + } + } + } + } + } + + uint8_t max_texel_weight_count = 0; + for (unsigned int i = 0; i < texels_per_block; i++) + { + 
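+ // Fold the scratch per-texel weight lists into the transposed layout used by the + // vectorized kernels, zero-filling unused slots so SIMD loads see benign data.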
di.texel_weight_count[i] = wb.weight_count_of_texel[i]; + max_texel_weight_count = astc::max(max_texel_weight_count, di.texel_weight_count[i]); + + // Init all 4 entries so we can rely on zeros for vectorization + for (unsigned int j = 0; j < 4; j++) + { + di.texel_weight_contribs_int_tr[j][i] = 0; + di.texel_weight_contribs_float_tr[j][i] = 0.0f; + di.texel_weights_tr[j][i] = 0; + } + + for (unsigned int j = 0; j < wb.weight_count_of_texel[i]; j++) + { + di.texel_weight_contribs_int_tr[j][i] = wb.weights_of_texel[i][j]; + di.texel_weight_contribs_float_tr[j][i] = static_cast<float>(wb.weights_of_texel[i][j]) * (1.0f / WEIGHTS_TEXEL_SUM); + di.texel_weights_tr[j][i] = wb.grid_weights_of_texel[i][j]; + } + } + + di.max_texel_weight_count = max_texel_weight_count; + + for (unsigned int i = 0; i < weights_per_block; i++) + { + unsigned int texel_count_wt = wb.texel_count_of_weight[i]; + di.weight_texel_count[i] = static_cast<uint8_t>(texel_count_wt); + + for (unsigned int j = 0; j < texel_count_wt; j++) + { + unsigned int texel = wb.texels_of_weight[i][j]; + + // Create transposed versions of these for better vectorization + di.weight_texels_tr[j][i] = static_cast<uint8_t>(texel); + di.weights_texel_contribs_tr[j][i] = static_cast<float>(wb.texel_weights_of_weight[i][j]); + + // Store the per-texel contribution of this weight for each texel it contributes to + di.texel_contrib_for_weight[j][i] = 0.0f; + for (unsigned int k = 0; k < 4; k++) + { + uint8_t dttw = di.texel_weights_tr[k][texel]; + float dttwf = di.texel_weight_contribs_float_tr[k][texel]; + if (dttw == i && dttwf != 0.0f) + { + di.texel_contrib_for_weight[j][i] = di.texel_weight_contribs_float_tr[k][texel]; + break; + } + } + } + + // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails + // Match last texel in active lane in SIMD group, for better gathers + uint8_t last_texel = di.weight_texels_tr[texel_count_wt - 1][i]; + for (unsigned int j = texel_count_wt; j < max_texel_count_of_weight; j++) + { + di.weight_texels_tr[j][i] = last_texel; + di.weights_texel_contribs_tr[j][i] = 0.0f; + } + } + + // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails + unsigned int texels_per_block_simd = round_up_to_simd_multiple_vla(texels_per_block); + for (unsigned int i = texels_per_block; i < texels_per_block_simd; i++) + { + di.texel_weight_count[i] = 0; + + for (unsigned int j = 0; j < 4; j++) + { + di.texel_weight_contribs_float_tr[j][i] = 0; + di.texel_weights_tr[j][i] = 0; + di.texel_weight_contribs_int_tr[j][i] = 0; + } + } + + // Initialize array tail so we can over-fetch with SIMD later to avoid loop tails + // Match last texel in active lane in SIMD group, for better gathers + int last_texel_count_wt = wb.texel_count_of_weight[weights_per_block - 1]; + uint8_t last_texel = di.weight_texels_tr[last_texel_count_wt - 1][weights_per_block - 1]; + + unsigned int weights_per_block_simd = round_up_to_simd_multiple_vla(weights_per_block); + for (unsigned int i = weights_per_block; i < weights_per_block_simd; i++) + { + di.weight_texel_count[i] = 0; + + for (int j = 0; j < max_texel_count_of_weight; j++) + { + di.weight_texels_tr[j][i] = last_texel; + di.weights_texel_contribs_tr[j][i] = 0.0f; + } + } + + di.texel_count = static_cast<uint8_t>(texels_per_block); + di.weight_count = static_cast<uint8_t>(weights_per_block); + di.weight_x = static_cast<uint8_t>(x_weights); + di.weight_y = static_cast<uint8_t>(y_weights); + di.weight_z = static_cast<uint8_t>(z_weights); +} + +/** + * @brief 
Assign the texels to use for kmeans clustering. + * + * The max limit is @c BLOCK_MAX_KMEANS_TEXELS; above this a random selection is used. + * The @c bsd.texel_count is an input and must be populated beforehand. + * + * @param[in,out] bsd The block size descriptor to populate. + */ +static void assign_kmeans_texels( + block_size_descriptor& bsd +) { + // Use all texels for kmeans on a small block + if (bsd.texel_count <= BLOCK_MAX_KMEANS_TEXELS) + { + for (uint8_t i = 0; i < bsd.texel_count; i++) + { + bsd.kmeans_texels[i] = i; + } + + return; + } + + // Select a random subset of BLOCK_MAX_KMEANS_TEXELS for kmeans on a large block + uint64_t rng_state[2]; + astc::rand_init(rng_state); + + // Initialize array used for tracking used indices + bool seen[BLOCK_MAX_TEXELS]; + for (uint8_t i = 0; i < bsd.texel_count; i++) + { + seen[i] = false; + } + + // Assign 64 random indices, retrying if we see repeats + unsigned int arr_elements_set = 0; + while (arr_elements_set < BLOCK_MAX_KMEANS_TEXELS) + { + uint8_t texel = static_cast<uint8_t>(astc::rand(rng_state)); + texel = texel % bsd.texel_count; + if (!seen[texel]) + { + bsd.kmeans_texels[arr_elements_set++] = texel; + seen[texel] = true; + } + } +} + +/** + * @brief Allocate a single 2D decimation table entry. + * + * @param x_texels The number of texels in the X dimension. + * @param y_texels The number of texels in the Y dimension. + * @param x_weights The number of weights in the X dimension. + * @param y_weights The number of weights in the Y dimension. + * @param bsd The block size descriptor we are populating. + * @param wb The decimation table init scratch working buffers. + * @param index The packed array index to populate. + */ +static void construct_dt_entry_2d( + unsigned int x_texels, + unsigned int y_texels, + unsigned int x_weights, + unsigned int y_weights, + block_size_descriptor& bsd, + dt_init_working_buffers& wb, + unsigned int index +) { + unsigned int weight_count = x_weights * y_weights; + assert(weight_count <= BLOCK_MAX_WEIGHTS); + + bool try_2planes = (2 * weight_count) <= BLOCK_MAX_WEIGHTS; + + decimation_info& di = bsd.decimation_tables[index]; + init_decimation_info_2d(x_texels, y_texels, x_weights, y_weights, di, wb); + + int maxprec_1plane = -1; + int maxprec_2planes = -1; + for (int i = 0; i < 12; i++) + { + unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i)); + if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS) + { + maxprec_1plane = i; + } + + if (try_2planes) + { + unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i)); + if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS) + { + maxprec_2planes = i; + } + } + } + + // At least one of the two should be valid ... + assert(maxprec_1plane >= 0 || maxprec_2planes >= 0); + bsd.decimation_modes[index].maxprec_1plane = static_cast<int8_t>(maxprec_1plane); + bsd.decimation_modes[index].maxprec_2planes = static_cast<int8_t>(maxprec_2planes); + bsd.decimation_modes[index].refprec_1_plane = 0; + bsd.decimation_modes[index].refprec_2_planes = 0; +} + +/** + * @brief Allocate block modes and decimation tables for a single 2D block size. + * + * @param x_texels The number of texels in the X dimension. + * @param y_texels The number of texels in the Y dimension. + * @param can_omit_modes Can we discard modes that astcenc won't use, even if legal? + * @param mode_cutoff Percentile cutoff in range [0,1]. 
Low values more likely to be used. + * @param[out] bsd The block size descriptor to populate. + */ +static void construct_block_size_descriptor_2d( + unsigned int x_texels, + unsigned int y_texels, + bool can_omit_modes, + float mode_cutoff, + block_size_descriptor& bsd +) { + // Store a remap table for storing packed decimation modes. + // Indexing uses [Y * 16 + X] and max size for each axis is 12. + static const unsigned int MAX_DMI = 12 * 16 + 12; + int decimation_mode_index[MAX_DMI]; + + dt_init_working_buffers* wb = new dt_init_working_buffers; + + bsd.xdim = static_cast<uint8_t>(x_texels); + bsd.ydim = static_cast<uint8_t>(y_texels); + bsd.zdim = 1; + bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels); + + for (unsigned int i = 0; i < MAX_DMI; i++) + { + decimation_mode_index[i] = -1; + } + + // Gather all the decimation grids that can be used with the current block +#if !defined(ASTCENC_DECOMPRESS_ONLY) + const float *percentiles = get_2d_percentile_table(x_texels, y_texels); + float always_cutoff = 0.0f; +#else + // Unused in decompress-only builds + (void)can_omit_modes; + (void)mode_cutoff; +#endif + + // Construct the list of block formats referencing the decimation tables + unsigned int packed_bm_idx = 0; + unsigned int packed_dm_idx = 0; + + // Trackers + unsigned int bm_counts[4] { 0 }; + unsigned int dm_counts[4] { 0 }; + + // Clear the list to a known-bad value + for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) + { + bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; + } + + // Iterate four times to build a usefully ordered list: + // - Pass 0 - keep selected single plane "always" block modes + // - Pass 1 - keep selected single plane "non-always" block modes + // - Pass 2 - keep select dual plane block modes + // - Pass 3 - keep everything else that's legal + unsigned int limit = can_omit_modes ? 
3 : 4; + for (unsigned int j = 0; j < limit; j++) + { + for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) + { + // Skip modes we've already included in a previous pass + if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE) + { + continue; + } + + // Decode parameters + unsigned int x_weights; + unsigned int y_weights; + bool is_dual_plane; + unsigned int quant_mode; + unsigned int weight_bits; + bool valid = decode_block_mode_2d(i, x_weights, y_weights, is_dual_plane, quant_mode, weight_bits); + + // Always skip invalid encodings for the current block size + if (!valid || (x_weights > x_texels) || (y_weights > y_texels)) + { + continue; + } + + // Selectively skip dual plane encodings + if (((j <= 1) && is_dual_plane) || (j == 2 && !is_dual_plane)) + { + continue; + } + + // Always skip encodings we can't physically encode based on + // generic encoding bit availability + if (is_dual_plane) + { + // This is the only check we need as we only support 1 partition + if ((109 - weight_bits) <= 0) + { + continue; + } + } + else + { + // This is conservative - fewer bits may be available for > 1 partition + if ((111 - weight_bits) <= 0) + { + continue; + } + } + + // Selectively skip encodings based on percentile + bool percentile_hit = false; + #if !defined(ASTCENC_DECOMPRESS_ONLY) + if (j == 0) + { + percentile_hit = percentiles[i] <= always_cutoff; + } + else + { + percentile_hit = percentiles[i] <= mode_cutoff; + } + #endif + + if (j != 3 && !percentile_hit) + { + continue; + } + + // Allocate and initialize the decimation table entry if we've not used it yet + int decimation_mode = decimation_mode_index[y_weights * 16 + x_weights]; + if (decimation_mode < 0) + { + construct_dt_entry_2d(x_texels, y_texels, x_weights, y_weights, bsd, *wb, packed_dm_idx); + decimation_mode_index[y_weights * 16 + x_weights] = packed_dm_idx; + decimation_mode = packed_dm_idx; + + dm_counts[j]++; + packed_dm_idx++; + } + + auto& bm = bsd.block_modes[packed_bm_idx]; + + bm.decimation_mode = static_cast<uint8_t>(decimation_mode); + bm.quant_mode = static_cast<uint8_t>(quant_mode); + bm.is_dual_plane = static_cast<uint8_t>(is_dual_plane); + bm.weight_bits = static_cast<uint8_t>(weight_bits); + bm.mode_index = static_cast<uint16_t>(i); + + auto& dm = bsd.decimation_modes[decimation_mode]; + + if (is_dual_plane) + { + dm.set_ref_2_plane(bm.get_weight_quant_mode()); + } + else + { + dm.set_ref_1_plane(bm.get_weight_quant_mode()); + } + + bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_bm_idx); + + packed_bm_idx++; + bm_counts[j]++; + } + } + + bsd.block_mode_count_1plane_always = bm_counts[0]; + bsd.block_mode_count_1plane_selected = bm_counts[0] + bm_counts[1]; + bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1] + bm_counts[2]; + bsd.block_mode_count_all = bm_counts[0] + bm_counts[1] + bm_counts[2] + bm_counts[3]; + + bsd.decimation_mode_count_always = dm_counts[0]; + bsd.decimation_mode_count_selected = dm_counts[0] + dm_counts[1] + dm_counts[2]; + bsd.decimation_mode_count_all = dm_counts[0] + dm_counts[1] + dm_counts[2] + dm_counts[3]; + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + assert(bsd.block_mode_count_1plane_always > 0); + assert(bsd.decimation_mode_count_always > 0); + + delete[] percentiles; +#endif + + // Ensure the end of the array contains valid data (should never get read) + for (unsigned int i = bsd.decimation_mode_count_all; i < WEIGHTS_MAX_DECIMATION_MODES; i++) + { + bsd.decimation_modes[i].maxprec_1plane = -1; + 
bsd.decimation_modes[i].maxprec_2planes = -1; + bsd.decimation_modes[i].refprec_1_plane = 0; + bsd.decimation_modes[i].refprec_2_planes = 0; + } + + // Determine the texels to use for kmeans clustering. + assign_kmeans_texels(bsd); + + delete wb; +} + +/** + * @brief Allocate block modes and decimation tables for a single 3D block size. + * + * TODO: This function doesn't include all of the heuristics that we use for 2D block sizes such as + * the percentile mode cutoffs. If 3D becomes more widely used we should look at this. + * + * @param x_texels The number of texels in the X dimension. + * @param y_texels The number of texels in the Y dimension. + * @param z_texels The number of texels in the Z dimension. + * @param[out] bsd The block size descriptor to populate. + */ +static void construct_block_size_descriptor_3d( + unsigned int x_texels, + unsigned int y_texels, + unsigned int z_texels, + block_size_descriptor& bsd +) { + // Store a remap table for storing packed decimation modes. + // Indexing uses [Z * 64 + Y * 8 + X] and max size for each axis is 6. + static constexpr unsigned int MAX_DMI = 6 * 64 + 6 * 8 + 6; + int decimation_mode_index[MAX_DMI]; + unsigned int decimation_mode_count = 0; + + dt_init_working_buffers* wb = new dt_init_working_buffers; + + bsd.xdim = static_cast<uint8_t>(x_texels); + bsd.ydim = static_cast<uint8_t>(y_texels); + bsd.zdim = static_cast<uint8_t>(z_texels); + bsd.texel_count = static_cast<uint8_t>(x_texels * y_texels * z_texels); + + for (unsigned int i = 0; i < MAX_DMI; i++) + { + decimation_mode_index[i] = -1; + } + + // gather all the infill-modes that can be used with the current block size + for (unsigned int x_weights = 2; x_weights <= x_texels; x_weights++) + { + for (unsigned int y_weights = 2; y_weights <= y_texels; y_weights++) + { + for (unsigned int z_weights = 2; z_weights <= z_texels; z_weights++) + { + unsigned int weight_count = x_weights * y_weights * z_weights; + if (weight_count > BLOCK_MAX_WEIGHTS) + { + continue; + } + + decimation_info& di = bsd.decimation_tables[decimation_mode_count]; + decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights] = decimation_mode_count; + init_decimation_info_3d(x_texels, y_texels, z_texels, x_weights, y_weights, z_weights, di, *wb); + + int maxprec_1plane = -1; + int maxprec_2planes = -1; + for (unsigned int i = 0; i < 12; i++) + { + unsigned int bits_1plane = get_ise_sequence_bitcount(weight_count, static_cast<quant_method>(i)); + if (bits_1plane >= BLOCK_MIN_WEIGHT_BITS && bits_1plane <= BLOCK_MAX_WEIGHT_BITS) + { + maxprec_1plane = i; + } + + unsigned int bits_2planes = get_ise_sequence_bitcount(2 * weight_count, static_cast<quant_method>(i)); + if (bits_2planes >= BLOCK_MIN_WEIGHT_BITS && bits_2planes <= BLOCK_MAX_WEIGHT_BITS) + { + maxprec_2planes = i; + } + } + + if ((2 * weight_count) > BLOCK_MAX_WEIGHTS) + { + maxprec_2planes = -1; + } + + bsd.decimation_modes[decimation_mode_count].maxprec_1plane = static_cast<int8_t>(maxprec_1plane); + bsd.decimation_modes[decimation_mode_count].maxprec_2planes = static_cast<int8_t>(maxprec_2planes); + bsd.decimation_modes[decimation_mode_count].refprec_1_plane = maxprec_1plane == -1 ? 0 : 0xFFFF; + bsd.decimation_modes[decimation_mode_count].refprec_2_planes = maxprec_2planes == -1 ? 
0 : 0xFFFF; + decimation_mode_count++; + } + } + } + + // Ensure the end of the array contains valid data (should never get read) + for (unsigned int i = decimation_mode_count; i < WEIGHTS_MAX_DECIMATION_MODES; i++) + { + bsd.decimation_modes[i].maxprec_1plane = -1; + bsd.decimation_modes[i].maxprec_2planes = -1; + bsd.decimation_modes[i].refprec_1_plane = 0; + bsd.decimation_modes[i].refprec_2_planes = 0; + } + + bsd.decimation_mode_count_always = 0; // Skipped for 3D modes + bsd.decimation_mode_count_selected = decimation_mode_count; + bsd.decimation_mode_count_all = decimation_mode_count; + + // Construct the list of block formats referencing the decimation tables + + // Clear the list to a known-bad value + for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) + { + bsd.block_mode_packed_index[i] = BLOCK_BAD_BLOCK_MODE; + } + + unsigned int packed_idx = 0; + unsigned int bm_counts[2] { 0 }; + + // Iterate two times to build a usefully ordered list: + // - Pass 0 - keep valid single plane block modes + // - Pass 1 - keep valid dual plane block modes + for (unsigned int j = 0; j < 2; j++) + { + for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) + { + // Skip modes we've already included in a previous pass + if (bsd.block_mode_packed_index[i] != BLOCK_BAD_BLOCK_MODE) + { + continue; + } + + unsigned int x_weights; + unsigned int y_weights; + unsigned int z_weights; + bool is_dual_plane; + unsigned int quant_mode; + unsigned int weight_bits; + + bool valid = decode_block_mode_3d(i, x_weights, y_weights, z_weights, is_dual_plane, quant_mode, weight_bits); + // Skip invalid encodings + if (!valid || x_weights > x_texels || y_weights > y_texels || z_weights > z_texels) + { + continue; + } + + // Skip encodings in the wrong iteration + if ((j == 0 && is_dual_plane) || (j == 1 && !is_dual_plane)) + { + continue; + } + + // Always skip encodings we can't physically encode based on bit availability + if (is_dual_plane) + { + // This is the only check we need as we only support 1 partition + if ((109 - weight_bits) <= 0) + { + continue; + } + } + else + { + // This is conservative - fewer bits may be available for > 1 partition + if ((111 - weight_bits) <= 0) + { + continue; + } + } + + int decimation_mode = decimation_mode_index[z_weights * 64 + y_weights * 8 + x_weights]; + bsd.block_modes[packed_idx].decimation_mode = static_cast<uint8_t>(decimation_mode); + bsd.block_modes[packed_idx].quant_mode = static_cast<uint8_t>(quant_mode); + bsd.block_modes[packed_idx].weight_bits = static_cast<uint8_t>(weight_bits); + bsd.block_modes[packed_idx].is_dual_plane = static_cast<uint8_t>(is_dual_plane); + bsd.block_modes[packed_idx].mode_index = static_cast<uint16_t>(i); + + bsd.block_mode_packed_index[i] = static_cast<uint16_t>(packed_idx); + bm_counts[j]++; + packed_idx++; + } + } + + bsd.block_mode_count_1plane_always = 0; // Skipped for 3D modes + bsd.block_mode_count_1plane_selected = bm_counts[0]; + bsd.block_mode_count_1plane_2plane_selected = bm_counts[0] + bm_counts[1]; + bsd.block_mode_count_all = bm_counts[0] + bm_counts[1]; + + // Determine the texels to use for kmeans clustering. + assign_kmeans_texels(bsd); + + delete wb; +} + +/* See header for documentation. 
*/ +void init_block_size_descriptor( + unsigned int x_texels, + unsigned int y_texels, + unsigned int z_texels, + bool can_omit_modes, + unsigned int partition_count_cutoff, + float mode_cutoff, + block_size_descriptor& bsd +) { + if (z_texels > 1) + { + construct_block_size_descriptor_3d(x_texels, y_texels, z_texels, bsd); + } + else + { + construct_block_size_descriptor_2d(x_texels, y_texels, can_omit_modes, mode_cutoff, bsd); + } + + init_partition_tables(bsd, can_omit_modes, partition_count_cutoff); +} diff --git a/thirdparty/astcenc/astcenc_color_quantize.cpp b/thirdparty/astcenc/astcenc_color_quantize.cpp new file mode 100644 index 0000000000..edcfe4f853 --- /dev/null +++ b/thirdparty/astcenc/astcenc_color_quantize.cpp @@ -0,0 +1,2071 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/** + * @brief Functions for color quantization. + * + * The design of the color quantization functionality requires the caller to use higher level error + * analysis to determine the base encoding that should be used. This earlier analysis will select + * the basic type of the endpoint that should be used: + * + * * Mode: LDR or HDR + * * Quantization level + * * Channel count: L, LA, RGB, or RGBA + * * Endpoint 2 type: Direct color encoding, or scaled from endpoint 1. + * + * However, this leaves a number of decisions about exactly how to pack the endpoints open. In + * particular we need to determine if blue contraction can be used, and/or if delta encoding can be + * used. If they can be applied, these will allow us to maintain higher precision in the endpoints + * without needing additional storage. + */ + +#include <stdio.h> +#include <assert.h> + +#include "astcenc_internal.h" + +/** + * @brief Determine the quantized value given a quantization level. + * + * @param quant_level The quantization level to use. + * @param value The value to convert. This may be outside of the 0-255 range and will be + * clamped before the value is looked up. + * + * @return The encoded quantized value. These are not necessarily in order; the compressor + * scrambles the values slightly to make hardware implementation easier. + */ +static inline uint8_t quant_color( + quant_method quant_level, + int value +) { + return color_unquant_to_uquant_tables[quant_level - QUANT_6][value]; +} + +/** + * @brief Quantize an LDR RGB color. + * + * Since this is a fall-back encoding, we cannot actually fail but must produce a sensible result. + * For this encoding @c color0 cannot be larger than @c color1. If @c color0 is actually larger + * than @c color1, @c color0 is reduced and @c color1 is increased until the constraint is met. + * + * @param color0 The input unquantized color0 endpoint. 
+ * @param color1 The input unquantized color1 endpoint. + * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1). + * @param quant_level The quantization level to use. + */ +static void quantize_rgb( + vfloat4 color0, + vfloat4 color1, + uint8_t output[6], + quant_method quant_level +) { + float scale = 1.0f / 257.0f; + + float r0 = astc::clamp255f(color0.lane<0>() * scale); + float g0 = astc::clamp255f(color0.lane<1>() * scale); + float b0 = astc::clamp255f(color0.lane<2>() * scale); + + float r1 = astc::clamp255f(color1.lane<0>() * scale); + float g1 = astc::clamp255f(color1.lane<1>() * scale); + float b1 = astc::clamp255f(color1.lane<2>() * scale); + + int ri0, gi0, bi0, ri1, gi1, bi1; + float rgb0_addon = 0.5f; + float rgb1_addon = 0.5f; + do + { + ri0 = quant_color(quant_level, astc::max(astc::flt2int_rd(r0 + rgb0_addon), 0)); + gi0 = quant_color(quant_level, astc::max(astc::flt2int_rd(g0 + rgb0_addon), 0)); + bi0 = quant_color(quant_level, astc::max(astc::flt2int_rd(b0 + rgb0_addon), 0)); + ri1 = quant_color(quant_level, astc::min(astc::flt2int_rd(r1 + rgb1_addon), 255)); + gi1 = quant_color(quant_level, astc::min(astc::flt2int_rd(g1 + rgb1_addon), 255)); + bi1 = quant_color(quant_level, astc::min(astc::flt2int_rd(b1 + rgb1_addon), 255)); + + rgb0_addon -= 0.2f; + rgb1_addon += 0.2f; + } while (ri0 + gi0 + bi0 > ri1 + gi1 + bi1); + + output[0] = static_cast<uint8_t>(ri0); + output[1] = static_cast<uint8_t>(ri1); + output[2] = static_cast<uint8_t>(gi0); + output[3] = static_cast<uint8_t>(gi1); + output[4] = static_cast<uint8_t>(bi0); + output[5] = static_cast<uint8_t>(bi1); +} + +/** + * @brief Quantize an LDR RGBA color. + * + * Since this is a fall-back encoding, we cannot actually fail but must produce a sensible result. + * For this encoding @c color0.rgb cannot be larger than @c color1.rgb (this indicates blue + * contraction). If @c color0.rgb is actually larger than @c color1.rgb, @c color0.rgb is reduced + * and @c color1.rgb is increased until the constraint is met. + * + * @param color0 The input unquantized color0 endpoint. + * @param color1 The input unquantized color1 endpoint. + * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1, a0, a1). + * @param quant_level The quantization level to use. + */ +static void quantize_rgba( + vfloat4 color0, + vfloat4 color1, + uint8_t output[8], + quant_method quant_level +) { + float scale = 1.0f / 257.0f; + + float a0 = astc::clamp255f(color0.lane<3>() * scale); + float a1 = astc::clamp255f(color1.lane<3>() * scale); + + output[6] = quant_color(quant_level, astc::flt2int_rtn(a0)); + output[7] = quant_color(quant_level, astc::flt2int_rtn(a1)); + + quantize_rgb(color0, color1, output, quant_level); +} + +/** + * @brief Try to quantize an LDR RGB color using blue-contraction. + * + * Blue-contraction is only usable if encoded color 1 is larger than color 0. + * + * @param color0 The input unquantized color0 endpoint. + * @param color1 The input unquantized color1 endpoint. + * @param[out] output The output endpoints, returned as (r1, r0, g1, g0, b1, b0). + * @param quant_level The quantization level to use. + * + * @return Returns @c false on failure, @c true on success. 
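+ * + * For intuition: at decode time blue-contraction reconstructs r = (r + b) / 2 and + * g = (g + b) / 2, so the encoder applies the inverse transform (r += r - b), which + * can overflow the 0-255 range and make the encoding unusable.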
*/ +static bool try_quantize_rgb_blue_contract( + vfloat4 color0, + vfloat4 color1, + uint8_t output[6], + quant_method quant_level +) { + float scale = 1.0f / 257.0f; + + float r0 = color0.lane<0>() * scale; + float g0 = color0.lane<1>() * scale; + float b0 = color0.lane<2>() * scale; + + float r1 = color1.lane<0>() * scale; + float g1 = color1.lane<1>() * scale; + float b1 = color1.lane<2>() * scale; + + // Apply inverse blue-contraction. This can produce an overflow, which means BC cannot be used. + r0 += (r0 - b0); + g0 += (g0 - b0); + r1 += (r1 - b1); + g1 += (g1 - b1); + + if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f || + r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f) + { + return false; + } + + // Quantize the inverse-blue-contracted color + int ri0 = quant_color(quant_level, astc::flt2int_rtn(r0)); + int gi0 = quant_color(quant_level, astc::flt2int_rtn(g0)); + int bi0 = quant_color(quant_level, astc::flt2int_rtn(b0)); + + int ri1 = quant_color(quant_level, astc::flt2int_rtn(r1)); + int gi1 = quant_color(quant_level, astc::flt2int_rtn(g1)); + int bi1 = quant_color(quant_level, astc::flt2int_rtn(b1)); + + // If color #1 is not larger than color #0 then blue-contraction cannot be used. Note that + // blue-contraction and quantization change this order, which is why we must test afterwards. + if (ri1 + gi1 + bi1 <= ri0 + gi0 + bi0) + { + return false; + } + + output[0] = static_cast<uint8_t>(ri1); + output[1] = static_cast<uint8_t>(ri0); + output[2] = static_cast<uint8_t>(gi1); + output[3] = static_cast<uint8_t>(gi0); + output[4] = static_cast<uint8_t>(bi1); + output[5] = static_cast<uint8_t>(bi0); + + return true; +} + +/** + * @brief Try to quantize an LDR RGBA color using blue-contraction. + * + * Blue-contraction is only usable if encoded color 1 RGB is larger than color 0 RGB. + * + * @param color0 The input unquantized color0 endpoint. + * @param color1 The input unquantized color1 endpoint. + * @param[out] output The output endpoints, returned as (r1, r0, g1, g0, b1, b0, a1, a0). + * @param quant_level The quantization level to use. + * + * @return Returns @c false on failure, @c true on success. + */ +static bool try_quantize_rgba_blue_contract( + vfloat4 color0, + vfloat4 color1, + uint8_t output[8], + quant_method quant_level +) { + float scale = 1.0f / 257.0f; + + float a0 = astc::clamp255f(color0.lane<3>() * scale); + float a1 = astc::clamp255f(color1.lane<3>() * scale); + + output[6] = quant_color(quant_level, astc::flt2int_rtn(a1)); + output[7] = quant_color(quant_level, astc::flt2int_rtn(a0)); + + return try_quantize_rgb_blue_contract(color0, color1, output, quant_level); +} + +/** + * @brief Try to quantize an LDR RGB color using delta encoding. + * + * At decode time we move one bit from the offset to the base and seize another bit as a sign bit; + * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is + * non-negative, then we encode a regular delta. + * + * @param color0 The input unquantized color0 endpoint. + * @param color1 The input unquantized color1 endpoint. + * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1). + * @param quant_level The quantization level to use. + * + * @return Returns @c false on failure, @c true on success. 
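+ * + * For example, ignoring the quantization round-trip: endpoints 120 and 140 promote to + * 240 and 280 in unorm9, giving a stored offset of 40, which fits the representable + * [-64, 63] window; endpoints much more than roughly 32 8-bit units apart therefore + * cannot be delta encoded.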
+ */
+static bool try_quantize_rgb_delta(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[6],
+ quant_method quant_level
+) {
+ float scale = 1.0f / 257.0f;
+
+ float r0 = astc::clamp255f(color0.lane<0>() * scale);
+ float g0 = astc::clamp255f(color0.lane<1>() * scale);
+ float b0 = astc::clamp255f(color0.lane<2>() * scale);
+
+ float r1 = astc::clamp255f(color1.lane<0>() * scale);
+ float g1 = astc::clamp255f(color1.lane<1>() * scale);
+ float b1 = astc::clamp255f(color1.lane<2>() * scale);
+
+ // Transform the base color to unorm9
+ int r0a = astc::flt2int_rtn(r0);
+ int g0a = astc::flt2int_rtn(g0);
+ int b0a = astc::flt2int_rtn(b0);
+
+ r0a <<= 1;
+ g0a <<= 1;
+ b0a <<= 1;
+
+ // Mask off the top bit
+ int r0b = r0a & 0xFF;
+ int g0b = g0a & 0xFF;
+ int b0b = b0a & 0xFF;
+
+ // Quantize then unquantize in order to get a value that we take differences against
+ int r0be = quant_color(quant_level, r0b);
+ int g0be = quant_color(quant_level, g0b);
+ int b0be = quant_color(quant_level, b0b);
+
+ r0b = r0be | (r0a & 0x100);
+ g0b = g0be | (g0a & 0x100);
+ b0b = b0be | (b0a & 0x100);
+
+ // Get hold of the second value
+ int r1d = astc::flt2int_rtn(r1);
+ int g1d = astc::flt2int_rtn(g1);
+ int b1d = astc::flt2int_rtn(b1);
+
+ r1d <<= 1;
+ g1d <<= 1;
+ b1d <<= 1;
+
+ // ... and take differences
+ r1d -= r0b;
+ g1d -= g0b;
+ b1d -= b0b;
+
+ // Check if the difference is too large to be encodable
+ if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64)
+ {
+ return false;
+ }
+
+ // Insert top bit of the base into the offset
+ r1d &= 0x7F;
+ g1d &= 0x7F;
+ b1d &= 0x7F;
+
+ r1d |= (r0b & 0x100) >> 1;
+ g1d |= (g0b & 0x100) >> 1;
+ b1d |= (b0b & 0x100) >> 1;
+
+ // Then quantize and unquantize; if this causes any of the top two bits to flip, then encoding fails
+ // since we have then corrupted either the top bit of the base or the sign bit of the offset
+ int r1de = quant_color(quant_level, r1d);
+ int g1de = quant_color(quant_level, g1d);
+ int b1de = quant_color(quant_level, b1d);
+
+ if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0)
+ {
+ return false;
+ }
+
+ // If the sum of offsets triggers blue-contraction then encoding fails
+ vint4 ep0(r0be, g0be, b0be, 0);
+ vint4 ep1(r1de, g1de, b1de, 0);
+ bit_transfer_signed(ep1, ep0);
+ if (hadd_rgb_s(ep1) < 0)
+ {
+ return false;
+ }
+
+ // Check that the offsets produce legitimate sums as well
+ ep0 = ep0 + ep1;
+ if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
+ {
+ return false;
+ }
+
+ output[0] = static_cast<uint8_t>(r0be);
+ output[1] = static_cast<uint8_t>(r1de);
+ output[2] = static_cast<uint8_t>(g0be);
+ output[3] = static_cast<uint8_t>(g1de);
+ output[4] = static_cast<uint8_t>(b0be);
+ output[5] = static_cast<uint8_t>(b1de);
+
+ return true;
+}
+
+/**
+ * @brief Try to quantize an LDR RGB color using delta encoding with blue-contraction.
+ *
+ * The endpoint colors are swapped on entry, so that the endpoint swap applied by the decoder's
+ * blue-contraction path restores the original endpoint order.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1).
+ * @param quant_level The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_rgb_delta_blue_contract(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[6],
+ quant_method quant_level
+) {
+ // Note: the endpoint colors are swapped at the start
+ float scale = 1.0f / 257.0f;
+
+ float r1 = color0.lane<0>() * scale;
+ float g1 = color0.lane<1>() * scale;
+ float b1 = color0.lane<2>() * scale;
+
+ float r0 = color1.lane<0>() * scale;
+ float g0 = color1.lane<1>() * scale;
+ float b0 = color1.lane<2>() * scale;
+
+ // Apply inverse blue-contraction. This can produce an overflow, which means BC cannot be used.
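+ // Editorial note (not upstream text): this path feeds blue-contracted
+ // endpoints through the same delta scheme as above, but with the endpoint
+ // order swapped and the opposite sign requirement on the offset sum. A
+ // negative offset sum is what tells the decoder to apply blue-contraction
+ // and swap the endpoints back, so the two delta paths cannot collide.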
+ r0 += (r0 - b0);
+ g0 += (g0 - b0);
+ r1 += (r1 - b1);
+ g1 += (g1 - b1);
+
+ if (r0 < 0.0f || r0 > 255.0f || g0 < 0.0f || g0 > 255.0f || b0 < 0.0f || b0 > 255.0f ||
+ r1 < 0.0f || r1 > 255.0f || g1 < 0.0f || g1 > 255.0f || b1 < 0.0f || b1 > 255.0f)
+ {
+ return false;
+ }
+
+ // Transform the base color to unorm9
+ int r0a = astc::flt2int_rtn(r0);
+ int g0a = astc::flt2int_rtn(g0);
+ int b0a = astc::flt2int_rtn(b0);
+ r0a <<= 1;
+ g0a <<= 1;
+ b0a <<= 1;
+
+ // Mask off the top bit
+ int r0b = r0a & 0xFF;
+ int g0b = g0a & 0xFF;
+ int b0b = b0a & 0xFF;
+
+ // Quantize, then unquantize in order to get a value that we take differences against.
+ int r0be = quant_color(quant_level, r0b);
+ int g0be = quant_color(quant_level, g0b);
+ int b0be = quant_color(quant_level, b0b);
+
+ r0b = r0be | (r0a & 0x100);
+ g0b = g0be | (g0a & 0x100);
+ b0b = b0be | (b0a & 0x100);
+
+ // Get hold of the second value
+ int r1d = astc::flt2int_rtn(r1);
+ int g1d = astc::flt2int_rtn(g1);
+ int b1d = astc::flt2int_rtn(b1);
+
+ r1d <<= 1;
+ g1d <<= 1;
+ b1d <<= 1;
+
+ // ... and take differences
+ r1d -= r0b;
+ g1d -= g0b;
+ b1d -= b0b;
+
+ // Check if the difference is too large to be encodable
+ if (r1d > 63 || g1d > 63 || b1d > 63 || r1d < -64 || g1d < -64 || b1d < -64)
+ {
+ return false;
+ }
+
+ // Insert top bit of the base into the offset
+ r1d &= 0x7F;
+ g1d &= 0x7F;
+ b1d &= 0x7F;
+
+ r1d |= (r0b & 0x100) >> 1;
+ g1d |= (g0b & 0x100) >> 1;
+ b1d |= (b0b & 0x100) >> 1;
+
+ // Then quantize and unquantize; if this causes any of the top two bits to flip,
+ // then encoding fails, since we have then corrupted either the top bit of the base
+ // or the sign bit of the offset.
+ int r1de = quant_color(quant_level, r1d);
+ int g1de = quant_color(quant_level, g1d);
+ int b1de = quant_color(quant_level, b1d);
+
+ if (((r1d ^ r1de) | (g1d ^ g1de) | (b1d ^ b1de)) & 0xC0)
+ {
+ return false;
+ }
+
+ // If the sum of offsets does not trigger blue-contraction then encoding fails
+ vint4 ep0(r0be, g0be, b0be, 0);
+ vint4 ep1(r1de, g1de, b1de, 0);
+ bit_transfer_signed(ep1, ep0);
+ if (hadd_rgb_s(ep1) >= 0)
+ {
+ return false;
+ }
+
+ // Check that the offsets produce legitimate sums as well
+ ep0 = ep0 + ep1;
+ if (any((ep0 < vint4(0)) | (ep0 > vint4(0xFF))))
+ {
+ return false;
+ }
+
+ output[0] = static_cast<uint8_t>(r0be);
+ output[1] = static_cast<uint8_t>(r1de);
+ output[2] = static_cast<uint8_t>(g0be);
+ output[3] = static_cast<uint8_t>(g1de);
+ output[4] = static_cast<uint8_t>(b0be);
+ output[5] = static_cast<uint8_t>(b1de);
+
+ return true;
+}
+
+/**
+ * @brief Try to quantize an LDR A color using delta encoding.
+ *
+ * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
+ * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
+ * non-negative, then we encode a regular delta.
+ *
+ * This function only compresses the alpha; the other elements in the output array are not touched.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as (x, x, x, x, x, x, a0, a1).
+ * @param quant_level The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_alpha_delta(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[8],
+ quant_method quant_level
+) {
+ float scale = 1.0f / 257.0f;
+
+ float a0 = astc::clamp255f(color0.lane<3>() * scale);
+ float a1 = astc::clamp255f(color1.lane<3>() * scale);
+
+ int a0a = astc::flt2int_rtn(a0);
+ a0a <<= 1;
+ int a0b = a0a & 0xFF;
+ int a0be = quant_color(quant_level, a0b);
+ a0b = a0be;
+ a0b |= a0a & 0x100;
+ int a1d = astc::flt2int_rtn(a1);
+ a1d <<= 1;
+ a1d -= a0b;
+
+ if (a1d > 63 || a1d < -64)
+ {
+ return false;
+ }
+
+ a1d &= 0x7F;
+ a1d |= (a0b & 0x100) >> 1;
+
+ int a1de = quant_color(quant_level, a1d);
+ int a1du = a1de;
+ if ((a1d ^ a1du) & 0xC0)
+ {
+ return false;
+ }
+
+ a1du &= 0x7F;
+ if (a1du & 0x40)
+ {
+ a1du -= 0x80;
+ }
+
+ a1du += a0b;
+ if (a1du < 0 || a1du > 0x1FF)
+ {
+ return false;
+ }
+
+ output[6] = static_cast<uint8_t>(a0be);
+ output[7] = static_cast<uint8_t>(a1de);
+
+ return true;
+}
+
+/**
+ * @brief Try to quantize an LDR LA color using delta encoding.
+ *
+ * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
+ * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
+ * non-negative, then we encode a regular delta.
+ *
+ * This function compresses both the luminance and the alpha; all four output elements are written.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as (l0, l1, a0, a1).
+ * @param quant_level The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_luminance_alpha_delta(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[4],
+ quant_method quant_level
+) {
+ float scale = 1.0f / 257.0f;
+
+ float l0 = astc::clamp255f(hadd_rgb_s(color0) * ((1.0f / 3.0f) * scale));
+ float l1 = astc::clamp255f(hadd_rgb_s(color1) * ((1.0f / 3.0f) * scale));
+
+ float a0 = astc::clamp255f(color0.lane<3>() * scale);
+ float a1 = astc::clamp255f(color1.lane<3>() * scale);
+
+ int l0a = astc::flt2int_rtn(l0);
+ int a0a = astc::flt2int_rtn(a0);
+ l0a <<= 1;
+ a0a <<= 1;
+
+ int l0b = l0a & 0xFF;
+ int a0b = a0a & 0xFF;
+ int l0be = quant_color(quant_level, l0b);
+ int a0be = quant_color(quant_level, a0b);
+ l0b = l0be;
+ a0b = a0be;
+ l0b |= l0a & 0x100;
+ a0b |= a0a & 0x100;
+
+ int l1d = astc::flt2int_rtn(l1);
+ int a1d = astc::flt2int_rtn(a1);
+ l1d <<= 1;
+ a1d <<= 1;
+ l1d -= l0b;
+ a1d -= a0b;
+
+ if (l1d > 63 || l1d < -64)
+ {
+ return false;
+ }
+
+ if (a1d > 63 || a1d < -64)
+ {
+ return false;
+ }
+
+ l1d &= 0x7F;
+ a1d &= 0x7F;
+ l1d |= (l0b & 0x100) >> 1;
+ a1d |= (a0b & 0x100) >> 1;
+
+ int l1de = quant_color(quant_level, l1d);
+ int a1de = quant_color(quant_level, a1d);
+ int l1du = l1de;
+ int a1du = a1de;
+
+ if ((l1d ^ l1du) & 0xC0)
+ {
+ return false;
+ }
+
+ if ((a1d ^ a1du) & 0xC0)
+ {
+ return false;
+ }
+
+ l1du &= 0x7F;
+ a1du &= 0x7F;
+
+ if (l1du & 0x40)
+ {
+ l1du -= 0x80;
+ }
+
+ if (a1du & 0x40)
+ {
+ a1du -= 0x80;
+ }
+
+ l1du += l0b;
+ a1du += a0b;
+
+ if (l1du < 0 || l1du > 0x1FF)
+ {
+ return false;
+ }
+
+ if (a1du < 0 || a1du > 0x1FF)
+ {
+ return false;
+ }
+
+ output[0] = static_cast<uint8_t>(l0be);
+ output[1] = static_cast<uint8_t>(l1de);
+ output[2] = static_cast<uint8_t>(a0be);
+ output[3] = static_cast<uint8_t>(a1de);
+
+ return true;
+}
+
+/**
+ * @brief Try to quantize an LDR RGBA color using delta encoding.
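+ *
+ * Editorial note (not upstream text): this is a thin wrapper that runs the
+ * RGB delta encoder on output[0..5] and the alpha delta encoder on
+ * output[6..7]; both must succeed for the combined encoding to succeed.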
+ *
+ * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
+ * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
+ * non-negative, then we encode a regular delta.
+ *
+ * Both the RGB delta and the alpha delta encodings must succeed, otherwise this function fails.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1, a0, a1).
+ * @param quant_level The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_rgba_delta(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[8],
+ quant_method quant_level
+) {
+ return try_quantize_rgb_delta(color0, color1, output, quant_level) &&
+ try_quantize_alpha_delta(color0, color1, output, quant_level);
+}
+
+/**
+ * @brief Try to quantize an LDR RGBA color using delta and blue contract encoding.
+ *
+ * At decode time we move one bit from the offset to the base and seize another bit as a sign bit;
+ * we then unquantize both values as if they contain one extra bit. If the sum of the offsets is
+ * non-negative, then we encode a regular delta.
+ *
+ * Both the blue-contracted RGB delta and the alpha delta encodings must succeed, otherwise this
+ * function fails.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as (r0, r1, g0, g1, b0, b1, a0, a1).
+ * @param quant_level The quantization level to use.
+ *
+ * @return Returns @c false on failure, @c true on success.
+ */
+static bool try_quantize_rgba_delta_blue_contract(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[8],
+ quant_method quant_level
+) {
+ // Note that we swap the color0 and color1 ordering for alpha to match RGB blue-contract
+ return try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level) &&
+ try_quantize_alpha_delta(color1, color0, output, quant_level);
+}
+
+/**
+ * @brief Quantize an LDR RGB color using scale encoding.
+ *
+ * @param color The input unquantized color endpoint and scale factor.
+ * @param[out] output The output endpoints, returned as (r0, g0, b0, s).
+ * @param quant_level The quantization level to use.
+ */
+static void quantize_rgbs(
+ vfloat4 color,
+ uint8_t output[4],
+ quant_method quant_level
+) {
+ float scale = 1.0f / 257.0f;
+
+ float r = astc::clamp255f(color.lane<0>() * scale);
+ float g = astc::clamp255f(color.lane<1>() * scale);
+ float b = astc::clamp255f(color.lane<2>() * scale);
+
+ int ri = quant_color(quant_level, astc::flt2int_rtn(r));
+ int gi = quant_color(quant_level, astc::flt2int_rtn(g));
+ int bi = quant_color(quant_level, astc::flt2int_rtn(b));
+
+ float oldcolorsum = hadd_rgb_s(color) * scale;
+ float newcolorsum = static_cast<float>(ri + gi + bi);
+
+ float scalea = astc::clamp1f(color.lane<3>() * (oldcolorsum + 1e-10f) / (newcolorsum + 1e-10f));
+ int scale_idx = astc::flt2int_rtn(scalea * 256.0f);
+ scale_idx = astc::clamp(scale_idx, 0, 255);
+
+ output[0] = static_cast<uint8_t>(ri);
+ output[1] = static_cast<uint8_t>(gi);
+ output[2] = static_cast<uint8_t>(bi);
+ output[3] = quant_color(quant_level, scale_idx);
+}
+
+/**
+ * @brief Quantize an LDR RGBA color using scale encoding.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param color The input unquantized color endpoint and scale factor.
+ * @param[out] output The output endpoints, returned as (r0, g0, b0, s, a0, a1).
+ * @param quant_level The quantization level to use.
+ */
+static void quantize_rgbs_alpha(
+ vfloat4 color0,
+ vfloat4 color1,
+ vfloat4 color,
+ uint8_t output[6],
+ quant_method quant_level
+) {
+ float scale = 1.0f / 257.0f;
+
+ float a0 = astc::clamp255f(color0.lane<3>() * scale);
+ float a1 = astc::clamp255f(color1.lane<3>() * scale);
+
+ output[4] = quant_color(quant_level, astc::flt2int_rtn(a0));
+ output[5] = quant_color(quant_level, astc::flt2int_rtn(a1));
+
+ quantize_rgbs(color, output, quant_level);
+}
+
+/**
+ * @brief Quantize an LDR L color.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as (l0, l1).
+ * @param quant_level The quantization level to use.
+ */
+static void quantize_luminance(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[2],
+ quant_method quant_level
+) {
+ float scale = 1.0f / 257.0f;
+
+ color0 = color0 * scale;
+ color1 = color1 * scale;
+
+ float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f));
+ float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f));
+
+ if (lum0 > lum1)
+ {
+ float avg = (lum0 + lum1) * 0.5f;
+ lum0 = avg;
+ lum1 = avg;
+ }
+
+ output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0));
+ output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1));
+}
+
+/**
+ * @brief Quantize an LDR LA color.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as (l0, l1, a0, a1).
+ * @param quant_level The quantization level to use.
+ */
+static void quantize_luminance_alpha(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[4],
+ quant_method quant_level
+) {
+ float scale = 1.0f / 257.0f;
+
+ color0 = color0 * scale;
+ color1 = color1 * scale;
+
+ float lum0 = astc::clamp255f(hadd_rgb_s(color0) * (1.0f / 3.0f));
+ float lum1 = astc::clamp255f(hadd_rgb_s(color1) * (1.0f / 3.0f));
+
+ float a0 = astc::clamp255f(color0.lane<3>());
+ float a1 = astc::clamp255f(color1.lane<3>());
+
+ // If endpoints are close then pull apart slightly; this gives more than 8 bits of normal map precision.
+ if (quant_level > 18)
+ {
+ if (fabsf(lum0 - lum1) < 3.0f)
+ {
+ if (lum0 < lum1)
+ {
+ lum0 -= 0.5f;
+ lum1 += 0.5f;
+ }
+ else
+ {
+ lum0 += 0.5f;
+ lum1 -= 0.5f;
+ }
+
+ lum0 = astc::clamp255f(lum0);
+ lum1 = astc::clamp255f(lum1);
+ }
+
+ if (fabsf(a0 - a1) < 3.0f)
+ {
+ if (a0 < a1)
+ {
+ a0 -= 0.5f;
+ a1 += 0.5f;
+ }
+ else
+ {
+ a0 += 0.5f;
+ a1 -= 0.5f;
+ }
+
+ a0 = astc::clamp255f(a0);
+ a1 = astc::clamp255f(a1);
+ }
+ }
+
+ output[0] = quant_color(quant_level, astc::flt2int_rtn(lum0));
+ output[1] = quant_color(quant_level, astc::flt2int_rtn(lum1));
+ output[2] = quant_color(quant_level, astc::flt2int_rtn(a0));
+ output[3] = quant_color(quant_level, astc::flt2int_rtn(a1));
+}
+
+/**
+ * @brief Quantize and unquantize a value ensuring top two bits are the same.
+ *
+ * @param quant_level The quantization level to use.
+ * @param value The input unquantized value.
+ * @param[out] quant_value The quantized value.
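+ *
+ * Editorial note (not upstream text): in the HDR packings below, the top two
+ * (or four) bits of each stored byte carry mode flags and high-order field
+ * bits, so quantization must not disturb them. This helper retries with a
+ * nudged input value until the quantized result keeps those bits intact.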
+ */ +static inline void quantize_and_unquantize_retain_top_two_bits( + quant_method quant_level, + uint8_t value, + uint8_t& quant_value +) { + int perform_loop; + uint8_t quantval; + + do + { + quantval = quant_color(quant_level, value); + + // Perform looping if the top two bits were modified by quant/unquant + perform_loop = (value & 0xC0) != (quantval & 0xC0); + + if ((quantval & 0xC0) > (value & 0xC0)) + { + // Quant/unquant rounded UP so that the top two bits changed; + // decrement the input in hopes that this will avoid rounding up. + value--; + } + else if ((quantval & 0xC0) < (value & 0xC0)) + { + // Quant/unquant rounded DOWN so that the top two bits changed; + // decrement the input in hopes that this will avoid rounding down. + value--; + } + } while (perform_loop); + + quant_value = quantval; +} + +/** + * @brief Quantize and unquantize a value ensuring top four bits are the same. + * + * @param quant_level The quantization level to use. + * @param value The input unquantized value. + * @param[out] quant_value The quantized value in 0-255 range. + */ +static inline void quantize_and_unquantize_retain_top_four_bits( + quant_method quant_level, + uint8_t value, + uint8_t& quant_value +) { + uint8_t perform_loop; + uint8_t quantval; + + do + { + quantval = quant_color(quant_level, value); + // Perform looping if the top four bits were modified by quant/unquant + perform_loop = (value & 0xF0) != (quantval & 0xF0); + + if ((quantval & 0xF0) > (value & 0xF0)) + { + // Quant/unquant rounded UP so that the top four bits changed; + // decrement the input value in hopes that this will avoid rounding up. + value--; + } + else if ((quantval & 0xF0) < (value & 0xF0)) + { + // Quant/unquant rounded DOWN so that the top four bits changed; + // decrement the input value in hopes that this will avoid rounding down. + value--; + } + } while (perform_loop); + + quant_value = quantval; +} + +/** + * @brief Quantize a HDR RGB color using RGB + offset. + * + * @param color The input unquantized color endpoint and offset. + * @param[out] output The output endpoints, returned as packed RGBS with some mode bits. + * @param quant_level The quantization level to use. + */ +static void quantize_hdr_rgbo( + vfloat4 color, + uint8_t output[4], + quant_method quant_level +) { + color.set_lane<0>(color.lane<0>() + color.lane<3>()); + color.set_lane<1>(color.lane<1>() + color.lane<3>()); + color.set_lane<2>(color.lane<2>() + color.lane<3>()); + + color = clamp(0.0f, 65535.0f, color); + + vfloat4 color_bak = color; + + int majcomp; + if (color.lane<0>() > color.lane<1>() && color.lane<0>() > color.lane<2>()) + { + majcomp = 0; // red is largest component + } + else if (color.lane<1>() > color.lane<2>()) + { + majcomp = 1; // green is largest component + } + else + { + majcomp = 2; // blue is largest component + } + + // swap around the red component and the largest component. 
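+ // Editorial note (not upstream text): the RGBO packing stores the largest
+ // component at full precision and the other two as downward offsets from it,
+ // so the largest component is rotated into the red lane here and the
+ // major-component index is carried in the mode bits for the decoder.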
+ switch (majcomp) + { + case 1: + color = color.swz<1, 0, 2, 3>(); + break; + case 2: + color = color.swz<2, 1, 0, 3>(); + break; + default: + break; + } + + static const int mode_bits[5][3] { + {11, 5, 7}, + {11, 6, 5}, + {10, 5, 8}, + {9, 6, 7}, + {8, 7, 6} + }; + + static const float mode_cutoffs[5][2] { + {1024, 4096}, + {2048, 1024}, + {2048, 16384}, + {8192, 16384}, + {32768, 16384} + }; + + static const float mode_rscales[5] { + 32.0f, + 32.0f, + 64.0f, + 128.0f, + 256.0f, + }; + + static const float mode_scales[5] { + 1.0f / 32.0f, + 1.0f / 32.0f, + 1.0f / 64.0f, + 1.0f / 128.0f, + 1.0f / 256.0f, + }; + + float r_base = color.lane<0>(); + float g_base = color.lane<0>() - color.lane<1>() ; + float b_base = color.lane<0>() - color.lane<2>() ; + float s_base = color.lane<3>() ; + + for (int mode = 0; mode < 5; mode++) + { + if (g_base > mode_cutoffs[mode][0] || b_base > mode_cutoffs[mode][0] || s_base > mode_cutoffs[mode][1]) + { + continue; + } + + // Encode the mode into a 4-bit vector + int mode_enc = mode < 4 ? (mode | (majcomp << 2)) : (majcomp | 0xC); + + float mode_scale = mode_scales[mode]; + float mode_rscale = mode_rscales[mode]; + + int gb_intcutoff = 1 << mode_bits[mode][1]; + int s_intcutoff = 1 << mode_bits[mode][2]; + + // Quantize and unquantize R + int r_intval = astc::flt2int_rtn(r_base * mode_scale); + + int r_lowbits = r_intval & 0x3f; + + r_lowbits |= (mode_enc & 3) << 6; + + uint8_t r_quantval; + quantize_and_unquantize_retain_top_two_bits( + quant_level, static_cast<uint8_t>(r_lowbits), r_quantval); + + r_intval = (r_intval & ~0x3f) | (r_quantval & 0x3f); + float r_fval = static_cast<float>(r_intval) * mode_rscale; + + // Recompute G and B, then quantize and unquantize them + float g_fval = r_fval - color.lane<1>() ; + float b_fval = r_fval - color.lane<2>() ; + + g_fval = astc::clamp(g_fval, 0.0f, 65535.0f); + b_fval = astc::clamp(b_fval, 0.0f, 65535.0f); + + int g_intval = astc::flt2int_rtn(g_fval * mode_scale); + int b_intval = astc::flt2int_rtn(b_fval * mode_scale); + + if (g_intval >= gb_intcutoff || b_intval >= gb_intcutoff) + { + continue; + } + + int g_lowbits = g_intval & 0x1f; + int b_lowbits = b_intval & 0x1f; + + int bit0 = 0; + int bit1 = 0; + int bit2 = 0; + int bit3 = 0; + + switch (mode) + { + case 0: + case 2: + bit0 = (r_intval >> 9) & 1; + break; + case 1: + case 3: + bit0 = (r_intval >> 8) & 1; + break; + case 4: + case 5: + bit0 = (g_intval >> 6) & 1; + break; + } + + switch (mode) + { + case 0: + case 1: + case 2: + case 3: + bit2 = (r_intval >> 7) & 1; + break; + case 4: + case 5: + bit2 = (b_intval >> 6) & 1; + break; + } + + switch (mode) + { + case 0: + case 2: + bit1 = (r_intval >> 8) & 1; + break; + case 1: + case 3: + case 4: + case 5: + bit1 = (g_intval >> 5) & 1; + break; + } + + switch (mode) + { + case 0: + bit3 = (r_intval >> 10) & 1; + break; + case 2: + bit3 = (r_intval >> 6) & 1; + break; + case 1: + case 3: + case 4: + case 5: + bit3 = (b_intval >> 5) & 1; + break; + } + + g_lowbits |= (mode_enc & 0x4) << 5; + b_lowbits |= (mode_enc & 0x8) << 4; + + g_lowbits |= bit0 << 6; + g_lowbits |= bit1 << 5; + b_lowbits |= bit2 << 6; + b_lowbits |= bit3 << 5; + + uint8_t g_quantval; + uint8_t b_quantval; + + quantize_and_unquantize_retain_top_four_bits( + quant_level, static_cast<uint8_t>(g_lowbits), g_quantval); + quantize_and_unquantize_retain_top_four_bits( + quant_level, static_cast<uint8_t>(b_lowbits), b_quantval); + + g_intval = (g_intval & ~0x1f) | (g_quantval & 0x1f); + b_intval = (b_intval & ~0x1f) | (b_quantval & 0x1f); + + 
g_fval = static_cast<float>(g_intval) * mode_rscale; + b_fval = static_cast<float>(b_intval) * mode_rscale; + + // Recompute the scale value, based on the errors introduced to red, green and blue + + // If the error is positive, then the R,G,B errors combined have raised the color + // value overall; as such, the scale value needs to be increased. + float rgb_errorsum = (r_fval - color.lane<0>() ) + (r_fval - g_fval - color.lane<1>() ) + (r_fval - b_fval - color.lane<2>() ); + + float s_fval = s_base + rgb_errorsum * (1.0f / 3.0f); + s_fval = astc::clamp(s_fval, 0.0f, 1e9f); + + int s_intval = astc::flt2int_rtn(s_fval * mode_scale); + + if (s_intval >= s_intcutoff) + { + continue; + } + + int s_lowbits = s_intval & 0x1f; + + int bit4; + int bit5; + int bit6; + switch (mode) + { + case 1: + bit6 = (r_intval >> 9) & 1; + break; + default: + bit6 = (s_intval >> 5) & 1; + break; + } + + switch (mode) + { + case 4: + bit5 = (r_intval >> 7) & 1; + break; + case 1: + bit5 = (r_intval >> 10) & 1; + break; + default: + bit5 = (s_intval >> 6) & 1; + break; + } + + switch (mode) + { + case 2: + bit4 = (s_intval >> 7) & 1; + break; + default: + bit4 = (r_intval >> 6) & 1; + break; + } + + s_lowbits |= bit6 << 5; + s_lowbits |= bit5 << 6; + s_lowbits |= bit4 << 7; + + uint8_t s_quantval; + + quantize_and_unquantize_retain_top_four_bits( + quant_level, static_cast<uint8_t>(s_lowbits), s_quantval); + + output[0] = r_quantval; + output[1] = g_quantval; + output[2] = b_quantval; + output[3] = s_quantval; + return; + } + + // Failed to encode any of the modes above? In that case encode using mode #5 + float vals[4]; + vals[0] = color_bak.lane<0>(); + vals[1] = color_bak.lane<1>(); + vals[2] = color_bak.lane<2>(); + vals[3] = color_bak.lane<3>(); + + int ivals[4]; + float cvals[3]; + + for (int i = 0; i < 3; i++) + { + vals[i] = astc::clamp(vals[i], 0.0f, 65020.0f); + ivals[i] = astc::flt2int_rtn(vals[i] * (1.0f / 512.0f)); + cvals[i] = static_cast<float>(ivals[i]) * 512.0f; + } + + float rgb_errorsum = (cvals[0] - vals[0]) + (cvals[1] - vals[1]) + (cvals[2] - vals[2]); + vals[3] += rgb_errorsum * (1.0f / 3.0f); + + vals[3] = astc::clamp(vals[3], 0.0f, 65020.0f); + ivals[3] = astc::flt2int_rtn(vals[3] * (1.0f / 512.0f)); + + int encvals[4]; + encvals[0] = (ivals[0] & 0x3f) | 0xC0; + encvals[1] = (ivals[1] & 0x7f) | 0x80; + encvals[2] = (ivals[2] & 0x7f) | 0x80; + encvals[3] = (ivals[3] & 0x7f) | ((ivals[0] & 0x40) << 1); + + for (uint8_t i = 0; i < 4; i++) + { + quantize_and_unquantize_retain_top_four_bits( + quant_level, static_cast<uint8_t>(encvals[i]), output[i]); + } + + return; +} + +/** + * @brief Quantize a HDR RGB color using direct RGB encoding. + * + * @param color0 The input unquantized color0 endpoint. + * @param color1 The input unquantized color1 endpoint. + * @param[out] output The output endpoints, returned as packed RGB+RGB pairs with mode bits. + * @param quant_level The quantization level to use. 
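+ *
+ * Editorial note (not upstream text): after the major-component swizzle the
+ * encoder works on derived values a = color1.r, b0 = a - color1.g,
+ * b1 = a - color1.b, c = a - color0.r, d0 = a - b0 - c - color0.g and
+ * d1 = a - b1 - c - color0.b; the eight modes trade bits between these
+ * fields depending on how large each field turns out to be.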
+ */ +static void quantize_hdr_rgb( + vfloat4 color0, + vfloat4 color1, + uint8_t output[6], + quant_method quant_level +) { + // Note: color*.lane<3> is not used so we can ignore it + color0 = clamp(0.0f, 65535.0f, color0); + color1 = clamp(0.0f, 65535.0f, color1); + + vfloat4 color0_bak = color0; + vfloat4 color1_bak = color1; + + int majcomp; + if (color1.lane<0>() > color1.lane<1>() && color1.lane<0>() > color1.lane<2>()) + { + majcomp = 0; + } + else if (color1.lane<1>() > color1.lane<2>()) + { + majcomp = 1; + } + else + { + majcomp = 2; + } + + // Swizzle the components + switch (majcomp) + { + case 1: // red-green swap + color0 = color0.swz<1, 0, 2, 3>(); + color1 = color1.swz<1, 0, 2, 3>(); + break; + case 2: // red-blue swap + color0 = color0.swz<2, 1, 0, 3>(); + color1 = color1.swz<2, 1, 0, 3>(); + break; + default: + break; + } + + float a_base = color1.lane<0>(); + a_base = astc::clamp(a_base, 0.0f, 65535.0f); + + float b0_base = a_base - color1.lane<1>(); + float b1_base = a_base - color1.lane<2>(); + float c_base = a_base - color0.lane<0>(); + float d0_base = a_base - b0_base - c_base - color0.lane<1>(); + float d1_base = a_base - b1_base - c_base - color0.lane<2>(); + + // Number of bits in the various fields in the various modes + static const int mode_bits[8][4] { + {9, 7, 6, 7}, + {9, 8, 6, 6}, + {10, 6, 7, 7}, + {10, 7, 7, 6}, + {11, 8, 6, 5}, + {11, 6, 8, 6}, + {12, 7, 7, 5}, + {12, 6, 7, 6} + }; + + // Cutoffs to use for the computed values of a,b,c,d, assuming the + // range 0..65535 are LNS values corresponding to fp16. + static const float mode_cutoffs[8][4] { + {16384, 8192, 8192, 8}, // mode 0: 9,7,6,7 + {32768, 8192, 4096, 8}, // mode 1: 9,8,6,6 + {4096, 8192, 4096, 4}, // mode 2: 10,6,7,7 + {8192, 8192, 2048, 4}, // mode 3: 10,7,7,6 + {8192, 2048, 512, 2}, // mode 4: 11,8,6,5 + {2048, 8192, 1024, 2}, // mode 5: 11,6,8,6 + {2048, 2048, 256, 1}, // mode 6: 12,7,7,5 + {1024, 2048, 512, 1}, // mode 7: 12,6,7,6 + }; + + static const float mode_scales[8] { + 1.0f / 128.0f, + 1.0f / 128.0f, + 1.0f / 64.0f, + 1.0f / 64.0f, + 1.0f / 32.0f, + 1.0f / 32.0f, + 1.0f / 16.0f, + 1.0f / 16.0f, + }; + + // Scaling factors when going from what was encoded in the mode to 16 bits. + static const float mode_rscales[8] { + 128.0f, + 128.0f, + 64.0f, + 64.0f, + 32.0f, + 32.0f, + 16.0f, + 16.0f + }; + + // Try modes one by one, with the highest-precision mode first. + for (int mode = 7; mode >= 0; mode--) + { + // For each mode, test if we can in fact accommodate the computed b, c, and d values. + // If we clearly can't, then we skip to the next mode. + + float b_cutoff = mode_cutoffs[mode][0]; + float c_cutoff = mode_cutoffs[mode][1]; + float d_cutoff = mode_cutoffs[mode][2]; + + if (b0_base > b_cutoff || b1_base > b_cutoff || c_base > c_cutoff || fabsf(d0_base) > d_cutoff || fabsf(d1_base) > d_cutoff) + { + continue; + } + + float mode_scale = mode_scales[mode]; + float mode_rscale = mode_rscales[mode]; + + int b_intcutoff = 1 << mode_bits[mode][1]; + int c_intcutoff = 1 << mode_bits[mode][2]; + int d_intcutoff = 1 << (mode_bits[mode][3] - 1); + + // Quantize and unquantize A, with the assumption that its high bits can be handled safely. 
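+ // Editorial note (not upstream text): the recurring pattern below quantizes
+ // only the low byte of each field, then splices the quantized low bits back
+ // under the preserved high bits, e.g. (a_intval & ~0xFF) | a_uquantval.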
+ int a_intval = astc::flt2int_rtn(a_base * mode_scale); + int a_lowbits = a_intval & 0xFF; + + int a_quantval = quant_color(quant_level, a_lowbits); + int a_uquantval = a_quantval; + a_intval = (a_intval & ~0xFF) | a_uquantval; + float a_fval = static_cast<float>(a_intval) * mode_rscale; + + // Recompute C, then quantize and unquantize it + float c_fval = a_fval - color0.lane<0>(); + c_fval = astc::clamp(c_fval, 0.0f, 65535.0f); + + int c_intval = astc::flt2int_rtn(c_fval * mode_scale); + + if (c_intval >= c_intcutoff) + { + continue; + } + + int c_lowbits = c_intval & 0x3f; + + c_lowbits |= (mode & 1) << 7; + c_lowbits |= (a_intval & 0x100) >> 2; + + uint8_t c_quantval; + + quantize_and_unquantize_retain_top_two_bits( + quant_level, static_cast<uint8_t>(c_lowbits), c_quantval); + + c_intval = (c_intval & ~0x3F) | (c_quantval & 0x3F); + c_fval = static_cast<float>(c_intval) * mode_rscale; + + // Recompute B0 and B1, then quantize and unquantize them + float b0_fval = a_fval - color1.lane<1>(); + float b1_fval = a_fval - color1.lane<2>(); + + b0_fval = astc::clamp(b0_fval, 0.0f, 65535.0f); + b1_fval = astc::clamp(b1_fval, 0.0f, 65535.0f); + int b0_intval = astc::flt2int_rtn(b0_fval * mode_scale); + int b1_intval = astc::flt2int_rtn(b1_fval * mode_scale); + + if (b0_intval >= b_intcutoff || b1_intval >= b_intcutoff) + { + continue; + } + + int b0_lowbits = b0_intval & 0x3f; + int b1_lowbits = b1_intval & 0x3f; + + int bit0 = 0; + int bit1 = 0; + switch (mode) + { + case 0: + case 1: + case 3: + case 4: + case 6: + bit0 = (b0_intval >> 6) & 1; + break; + case 2: + case 5: + case 7: + bit0 = (a_intval >> 9) & 1; + break; + } + + switch (mode) + { + case 0: + case 1: + case 3: + case 4: + case 6: + bit1 = (b1_intval >> 6) & 1; + break; + case 2: + bit1 = (c_intval >> 6) & 1; + break; + case 5: + case 7: + bit1 = (a_intval >> 10) & 1; + break; + } + + b0_lowbits |= bit0 << 6; + b1_lowbits |= bit1 << 6; + + b0_lowbits |= ((mode >> 1) & 1) << 7; + b1_lowbits |= ((mode >> 2) & 1) << 7; + + uint8_t b0_quantval; + uint8_t b1_quantval; + + quantize_and_unquantize_retain_top_two_bits( + quant_level, static_cast<uint8_t>(b0_lowbits), b0_quantval); + quantize_and_unquantize_retain_top_two_bits( + quant_level, static_cast<uint8_t>(b1_lowbits), b1_quantval); + + b0_intval = (b0_intval & ~0x3f) | (b0_quantval & 0x3f); + b1_intval = (b1_intval & ~0x3f) | (b1_quantval & 0x3f); + b0_fval = static_cast<float>(b0_intval) * mode_rscale; + b1_fval = static_cast<float>(b1_intval) * mode_rscale; + + // Recompute D0 and D1, then quantize and unquantize them + float d0_fval = a_fval - b0_fval - c_fval - color0.lane<1>(); + float d1_fval = a_fval - b1_fval - c_fval - color0.lane<2>(); + + d0_fval = astc::clamp(d0_fval, -65535.0f, 65535.0f); + d1_fval = astc::clamp(d1_fval, -65535.0f, 65535.0f); + + int d0_intval = astc::flt2int_rtn(d0_fval * mode_scale); + int d1_intval = astc::flt2int_rtn(d1_fval * mode_scale); + + if (abs(d0_intval) >= d_intcutoff || abs(d1_intval) >= d_intcutoff) + { + continue; + } + + int d0_lowbits = d0_intval & 0x1f; + int d1_lowbits = d1_intval & 0x1f; + + int bit2 = 0; + int bit3 = 0; + int bit4; + int bit5; + switch (mode) + { + case 0: + case 2: + bit2 = (d0_intval >> 6) & 1; + break; + case 1: + case 4: + bit2 = (b0_intval >> 7) & 1; + break; + case 3: + bit2 = (a_intval >> 9) & 1; + break; + case 5: + bit2 = (c_intval >> 7) & 1; + break; + case 6: + case 7: + bit2 = (a_intval >> 11) & 1; + break; + } + switch (mode) + { + case 0: + case 2: + bit3 = (d1_intval >> 6) & 1; + break; + case 1: 
+ case 4:
+ bit3 = (b1_intval >> 7) & 1;
+ break;
+ case 3:
+ case 5:
+ case 6:
+ case 7:
+ bit3 = (c_intval >> 6) & 1;
+ break;
+ }
+
+ switch (mode)
+ {
+ case 4:
+ case 6:
+ bit4 = (a_intval >> 9) & 1;
+ bit5 = (a_intval >> 10) & 1;
+ break;
+ default:
+ bit4 = (d0_intval >> 5) & 1;
+ bit5 = (d1_intval >> 5) & 1;
+ break;
+ }
+
+ d0_lowbits |= bit2 << 6;
+ d1_lowbits |= bit3 << 6;
+ d0_lowbits |= bit4 << 5;
+ d1_lowbits |= bit5 << 5;
+
+ d0_lowbits |= (majcomp & 1) << 7;
+ d1_lowbits |= ((majcomp >> 1) & 1) << 7;
+
+ uint8_t d0_quantval;
+ uint8_t d1_quantval;
+
+ quantize_and_unquantize_retain_top_four_bits(
+ quant_level, static_cast<uint8_t>(d0_lowbits), d0_quantval);
+ quantize_and_unquantize_retain_top_four_bits(
+ quant_level, static_cast<uint8_t>(d1_lowbits), d1_quantval);
+
+ output[0] = static_cast<uint8_t>(a_quantval);
+ output[1] = c_quantval;
+ output[2] = b0_quantval;
+ output[3] = b1_quantval;
+ output[4] = d0_quantval;
+ output[5] = d1_quantval;
+ return;
+ }
+
+ // If none of the modes fits we will use a flat representation for storing data, using 8 bits
+ // for red and green, and 7 bits for blue. This gives color accuracy roughly similar to LDR
+ // 4:4:3, which is not great but usable. This representation is used if the light color is
+ // more than 4x the color value of the dark color.
+ float vals[6];
+ vals[0] = color0_bak.lane<0>();
+ vals[1] = color1_bak.lane<0>();
+ vals[2] = color0_bak.lane<1>();
+ vals[3] = color1_bak.lane<1>();
+ vals[4] = color0_bak.lane<2>();
+ vals[5] = color1_bak.lane<2>();
+
+ for (int i = 0; i < 6; i++)
+ {
+ vals[i] = astc::clamp(vals[i], 0.0f, 65020.0f);
+ }
+
+ for (int i = 0; i < 4; i++)
+ {
+ int idx = astc::flt2int_rtn(vals[i] * 1.0f / 256.0f);
+ output[i] = quant_color(quant_level, idx);
+ }
+
+ for (int i = 4; i < 6; i++)
+ {
+ int idx = astc::flt2int_rtn(vals[i] * 1.0f / 512.0f) + 128;
+ quantize_and_unquantize_retain_top_two_bits(
+ quant_level, static_cast<uint8_t>(idx), output[i]);
+ }
+
+ return;
+}
+
+/**
+ * @brief Quantize a HDR RGB + LDR A color using direct RGBA encoding.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as packed RGBA+RGBA pairs with mode bits.
+ * @param quant_level The quantization level to use.
+ */
+static void quantize_hdr_rgb_ldr_alpha(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[8],
+ quant_method quant_level
+) {
+ float scale = 1.0f / 257.0f;
+
+ float a0 = astc::clamp255f(color0.lane<3>() * scale);
+ float a1 = astc::clamp255f(color1.lane<3>() * scale);
+
+ output[6] = quant_color(quant_level, astc::flt2int_rtn(a0));
+ output[7] = quant_color(quant_level, astc::flt2int_rtn(a1));
+
+ quantize_hdr_rgb(color0, color1, output, quant_level);
+}
+
+/**
+ * @brief Quantize a HDR L color using the large range encoding.
+ *
+ * @param color0 The input unquantized color0 endpoint.
+ * @param color1 The input unquantized color1 endpoint.
+ * @param[out] output The output endpoints, returned as packed (l0, l1).
+ * @param quant_level The quantization level to use.
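+ *
+ * Editorial note (not upstream text): two candidate mappings are evaluated,
+ * one covering the upper half of the code-point space (decoded as v << 8)
+ * and an offset variant covering the lower half; whichever reconstruction
+ * has the smaller squared error is kept. The decoder tells the two apart by
+ * the stored order, since only the lower-half variant stores v0 > v1.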
+ */ +static void quantize_hdr_luminance_large_range( + vfloat4 color0, + vfloat4 color1, + uint8_t output[2], + quant_method quant_level +) { + float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f); + float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f); + + if (lum1 < lum0) + { + float avg = (lum0 + lum1) * 0.5f; + lum0 = avg; + lum1 = avg; + } + + int ilum1 = astc::flt2int_rtn(lum1); + int ilum0 = astc::flt2int_rtn(lum0); + + // Find the closest encodable point in the upper half of the code-point space + int upper_v0 = (ilum0 + 128) >> 8; + int upper_v1 = (ilum1 + 128) >> 8; + + upper_v0 = astc::clamp(upper_v0, 0, 255); + upper_v1 = astc::clamp(upper_v1, 0, 255); + + // Find the closest encodable point in the lower half of the code-point space + int lower_v0 = (ilum1 + 256) >> 8; + int lower_v1 = ilum0 >> 8; + + lower_v0 = astc::clamp(lower_v0, 0, 255); + lower_v1 = astc::clamp(lower_v1, 0, 255); + + // Determine the distance between the point in code-point space and the input value + int upper0_dec = upper_v0 << 8; + int upper1_dec = upper_v1 << 8; + int lower0_dec = (lower_v1 << 8) + 128; + int lower1_dec = (lower_v0 << 8) - 128; + + int upper0_diff = upper0_dec - ilum0; + int upper1_diff = upper1_dec - ilum1; + int lower0_diff = lower0_dec - ilum0; + int lower1_diff = lower1_dec - ilum1; + + int upper_error = (upper0_diff * upper0_diff) + (upper1_diff * upper1_diff); + int lower_error = (lower0_diff * lower0_diff) + (lower1_diff * lower1_diff); + + int v0, v1; + if (upper_error < lower_error) + { + v0 = upper_v0; + v1 = upper_v1; + } + else + { + v0 = lower_v0; + v1 = lower_v1; + } + + // OK; encode + output[0] = quant_color(quant_level, v0); + output[1] = quant_color(quant_level, v1); +} + +/** + * @brief Quantize a HDR L color using the small range encoding. + * + * @param color0 The input unquantized color0 endpoint. + * @param color1 The input unquantized color1 endpoint. + * @param[out] output The output endpoints, returned as packed (l0, l1) with mode bits. + * @param quant_level The quantization level to use. + * + * @return Returns @c false on failure, @c true on success. 
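+ *
+ * Editorial sketch (not upstream text): bit 7 of the first output byte
+ * selects the submode. The high-precision submode (bit 7 == 0) stores the
+ * base luminance at a 1/32 scale with a 4-bit non-negative delta, while the
+ * low-precision submode (bit 7 == 1) stores it at a 1/64 scale with a 5-bit
+ * delta, trading base precision for delta range.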
+ */
+static bool try_quantize_hdr_luminance_small_range(
+ vfloat4 color0,
+ vfloat4 color1,
+ uint8_t output[2],
+ quant_method quant_level
+) {
+ float lum0 = hadd_rgb_s(color0) * (1.0f / 3.0f);
+ float lum1 = hadd_rgb_s(color1) * (1.0f / 3.0f);
+
+ if (lum1 < lum0)
+ {
+ float avg = (lum0 + lum1) * 0.5f;
+ lum0 = avg;
+ lum1 = avg;
+ }
+
+ int ilum1 = astc::flt2int_rtn(lum1);
+ int ilum0 = astc::flt2int_rtn(lum0);
+
+ // Difference of more than a factor-of-2 results in immediate failure
+ if (ilum1 - ilum0 > 2048)
+ {
+ return false;
+ }
+
+ int lowval, highval, diffval;
+ int v0, v1;
+ int v0e, v1e;
+ int v0d, v1d;
+
+ // Try to encode the high-precision submode
+ lowval = (ilum0 + 16) >> 5;
+ highval = (ilum1 + 16) >> 5;
+
+ lowval = astc::clamp(lowval, 0, 2047);
+ highval = astc::clamp(highval, 0, 2047);
+
+ v0 = lowval & 0x7F;
+ v0e = quant_color(quant_level, v0);
+ v0d = v0e;
+
+ if (v0d < 0x80)
+ {
+ lowval = (lowval & ~0x7F) | v0d;
+ diffval = highval - lowval;
+ if (diffval >= 0 && diffval <= 15)
+ {
+ v1 = ((lowval >> 3) & 0xF0) | diffval;
+ v1e = quant_color(quant_level, v1);
+ v1d = v1e;
+ if ((v1d & 0xF0) == (v1 & 0xF0))
+ {
+ output[0] = static_cast<uint8_t>(v0e);
+ output[1] = static_cast<uint8_t>(v1e);
+ return true;
+ }
+ }
+ }
+
+ // Try to encode the low-precision submode
+ lowval = (ilum0 + 32) >> 6;
+ highval = (ilum1 + 32) >> 6;
+
+ lowval = astc::clamp(lowval, 0, 1023);
+ highval = astc::clamp(highval, 0, 1023);
+
+ v0 = (lowval & 0x7F) | 0x80;
+ v0e = quant_color(quant_level, v0);
+ v0d = v0e;
+ if ((v0d & 0x80) == 0)
+ {
+ return false;
+ }
+
+ lowval = (lowval & ~0x7F) | (v0d & 0x7F);
+ diffval = highval - lowval;
+ if (diffval < 0 || diffval > 31)
+ {
+ return false;
+ }
+
+ v1 = ((lowval >> 2) & 0xE0) | diffval;
+ v1e = quant_color(quant_level, v1);
+ v1d = v1e;
+ if ((v1d & 0xE0) != (v1 & 0xE0))
+ {
+ return false;
+ }
+
+ output[0] = static_cast<uint8_t>(v0e);
+ output[1] = static_cast<uint8_t>(v1e);
+ return true;
+}
+
+/**
+ * @brief Quantize a HDR A color using either delta or direct encoding.
+ *
+ * @param alpha0 The input unquantized alpha0 endpoint.
+ * @param alpha1 The input unquantized alpha1 endpoint.
+ * @param[out] output The output endpoints, returned as packed (a0, a1) with mode bits.
+ * @param quant_level The quantization level to use.
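+ *
+ * Editorial note (not upstream text): three delta submodes are tried, from
+ * the finest (base scale 1/64, signed delta in [-8, 8)) to the coarsest
+ * (base scale 1/256, delta in [-32, 32)); the submode index is split across
+ * bit 7 of the two output bytes. If no delta submode fits, a flat direct
+ * encoding is used instead.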
+ */ +static void quantize_hdr_alpha( + float alpha0, + float alpha1, + uint8_t output[2], + quant_method quant_level +) { + alpha0 = astc::clamp(alpha0, 0.0f, 65280.0f); + alpha1 = astc::clamp(alpha1, 0.0f, 65280.0f); + + int ialpha0 = astc::flt2int_rtn(alpha0); + int ialpha1 = astc::flt2int_rtn(alpha1); + + int val0, val1, diffval; + int v6, v7; + int v6e, v7e; + int v6d, v7d; + + // Try to encode one of the delta submodes, in decreasing-precision order + for (int i = 2; i >= 0; i--) + { + val0 = (ialpha0 + (128 >> i)) >> (8 - i); + val1 = (ialpha1 + (128 >> i)) >> (8 - i); + + v6 = (val0 & 0x7F) | ((i & 1) << 7); + v6e = quant_color(quant_level, v6); + v6d = v6e; + + if ((v6 ^ v6d) & 0x80) + { + continue; + } + + val0 = (val0 & ~0x7f) | (v6d & 0x7f); + diffval = val1 - val0; + int cutoff = 32 >> i; + int mask = 2 * cutoff - 1; + + if (diffval < -cutoff || diffval >= cutoff) + { + continue; + } + + v7 = ((i & 2) << 6) | ((val0 >> 7) << (6 - i)) | (diffval & mask); + v7e = quant_color(quant_level, v7); + v7d = v7e; + + static const int testbits[3] { 0xE0, 0xF0, 0xF8 }; + + if ((v7 ^ v7d) & testbits[i]) + { + continue; + } + + output[0] = static_cast<uint8_t>(v6e); + output[1] = static_cast<uint8_t>(v7e); + return; + } + + // Could not encode any of the delta modes; instead encode a flat value + val0 = (ialpha0 + 256) >> 9; + val1 = (ialpha1 + 256) >> 9; + v6 = val0 | 0x80; + v7 = val1 | 0x80; + + output[0] = quant_color(quant_level, v6); + output[1] = quant_color(quant_level, v7); + + return; +} + +/** + * @brief Quantize a HDR RGBA color using either delta or direct RGBA encoding. + * + * @param color0 The input unquantized color0 endpoint. + * @param color1 The input unquantized color1 endpoint. + * @param[out] output The output endpoints, returned as packed RGBA+RGBA pairs with mode bits. + * @param quant_level The quantization level to use. + */ +static void quantize_hdr_rgb_alpha( + vfloat4 color0, + vfloat4 color1, + uint8_t output[8], + quant_method quant_level +) { + quantize_hdr_rgb(color0, color1, output, quant_level); + quantize_hdr_alpha(color0.lane<3>(), color1.lane<3>(), output + 6, quant_level); +} + +/* See header for documentation. 
*/ +uint8_t pack_color_endpoints( + vfloat4 color0, + vfloat4 color1, + vfloat4 rgbs_color, + vfloat4 rgbo_color, + int format, + uint8_t* output, + quant_method quant_level +) { + assert(QUANT_6 <= quant_level && quant_level <= QUANT_256); + + // We do not support negative colors + color0 = max(color0, 0.0f); + color1 = max(color1, 0.0f); + + uint8_t retval = 0; + + switch (format) + { + case FMT_RGB: + if (quant_level <= QUANT_160) + { + if (try_quantize_rgb_delta_blue_contract(color0, color1, output, quant_level)) + { + retval = FMT_RGB_DELTA; + break; + } + if (try_quantize_rgb_delta(color0, color1, output, quant_level)) + { + retval = FMT_RGB_DELTA; + break; + } + } + if (quant_level < QUANT_256 && try_quantize_rgb_blue_contract(color0, color1, output, quant_level)) + { + retval = FMT_RGB; + break; + } + quantize_rgb(color0, color1, output, quant_level); + retval = FMT_RGB; + break; + + case FMT_RGBA: + if (quant_level <= QUANT_160) + { + if (try_quantize_rgba_delta_blue_contract(color0, color1, output, quant_level)) + { + retval = FMT_RGBA_DELTA; + break; + } + if (try_quantize_rgba_delta(color0, color1, output, quant_level)) + { + retval = FMT_RGBA_DELTA; + break; + } + } + if (quant_level < QUANT_256 && try_quantize_rgba_blue_contract(color0, color1, output, quant_level)) + { + retval = FMT_RGBA; + break; + } + quantize_rgba(color0, color1, output, quant_level); + retval = FMT_RGBA; + break; + + case FMT_RGB_SCALE: + quantize_rgbs(rgbs_color, output, quant_level); + retval = FMT_RGB_SCALE; + break; + + case FMT_HDR_RGB_SCALE: + quantize_hdr_rgbo(rgbo_color, output, quant_level); + retval = FMT_HDR_RGB_SCALE; + break; + + case FMT_HDR_RGB: + quantize_hdr_rgb(color0, color1, output, quant_level); + retval = FMT_HDR_RGB; + break; + + case FMT_RGB_SCALE_ALPHA: + quantize_rgbs_alpha(color0, color1, rgbs_color, output, quant_level); + retval = FMT_RGB_SCALE_ALPHA; + break; + + case FMT_HDR_LUMINANCE_SMALL_RANGE: + case FMT_HDR_LUMINANCE_LARGE_RANGE: + if (try_quantize_hdr_luminance_small_range(color0, color1, output, quant_level)) + { + retval = FMT_HDR_LUMINANCE_SMALL_RANGE; + break; + } + quantize_hdr_luminance_large_range(color0, color1, output, quant_level); + retval = FMT_HDR_LUMINANCE_LARGE_RANGE; + break; + + case FMT_LUMINANCE: + quantize_luminance(color0, color1, output, quant_level); + retval = FMT_LUMINANCE; + break; + + case FMT_LUMINANCE_ALPHA: + if (quant_level <= 18) + { + if (try_quantize_luminance_alpha_delta(color0, color1, output, quant_level)) + { + retval = FMT_LUMINANCE_ALPHA_DELTA; + break; + } + } + quantize_luminance_alpha(color0, color1, output, quant_level); + retval = FMT_LUMINANCE_ALPHA; + break; + + case FMT_HDR_RGB_LDR_ALPHA: + quantize_hdr_rgb_ldr_alpha(color0, color1, output, quant_level); + retval = FMT_HDR_RGB_LDR_ALPHA; + break; + + case FMT_HDR_RGBA: + quantize_hdr_rgb_alpha(color0, color1, output, quant_level); + retval = FMT_HDR_RGBA; + break; + } + + return retval; +} + +#endif diff --git a/thirdparty/astcenc/astcenc_color_unquantize.cpp b/thirdparty/astcenc/astcenc_color_unquantize.cpp new file mode 100644 index 0000000000..d31895a627 --- /dev/null +++ b/thirdparty/astcenc/astcenc_color_unquantize.cpp @@ -0,0 +1,941 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. 
You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#include <utility> + +/** + * @brief Functions for color unquantization. + */ + +#include "astcenc_internal.h" + +/** + * @brief Un-blue-contract a color. + * + * This function reverses any applied blue contraction. + * + * @param input The input color that has been blue-contracted. + * + * @return The uncontracted color. + */ +static ASTCENC_SIMD_INLINE vint4 uncontract_color( + vint4 input +) { + vmask4 mask(true, true, false, false); + vint4 bc0 = asr<1>(input + input.lane<2>()); + return select(input, bc0, mask); +} + +/** + * @brief Unpack an LDR RGBA color that uses delta encoding. + * + * @param input0 The packed endpoint 0 color. + * @param input1 The packed endpoint 1 color deltas. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void rgba_delta_unpack( + vint4 input0, + vint4 input1, + vint4& output0, + vint4& output1 +) { + // Apply bit transfer + bit_transfer_signed(input1, input0); + + // Apply blue-uncontraction if needed + int rgb_sum = hadd_rgb_s(input1); + input1 = input1 + input0; + if (rgb_sum < 0) + { + input0 = uncontract_color(input0); + input1 = uncontract_color(input1); + std::swap(input0, input1); + } + + output0 = clamp(0, 255, input0); + output1 = clamp(0, 255, input1); +} + +/** + * @brief Unpack an LDR RGB color that uses delta encoding. + * + * Output alpha set to 255. + * + * @param input0 The packed endpoint 0 color. + * @param input1 The packed endpoint 1 color deltas. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void rgb_delta_unpack( + vint4 input0, + vint4 input1, + vint4& output0, + vint4& output1 +) { + rgba_delta_unpack(input0, input1, output0, output1); + output0.set_lane<3>(255); + output1.set_lane<3>(255); +} + +/** + * @brief Unpack an LDR RGBA color that uses direct encoding. + * + * @param input0 The packed endpoint 0 color. + * @param input1 The packed endpoint 1 color. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void rgba_unpack( + vint4 input0, + vint4 input1, + vint4& output0, + vint4& output1 +) { + // Apply blue-uncontraction if needed + if (hadd_rgb_s(input0) > hadd_rgb_s(input1)) + { + input0 = uncontract_color(input0); + input1 = uncontract_color(input1); + std::swap(input0, input1); + } + + output0 = input0; + output1 = input1; +} + +/** + * @brief Unpack an LDR RGB color that uses direct encoding. + * + * Output alpha set to 255. + * + * @param input0 The packed endpoint 0 color. + * @param input1 The packed endpoint 1 color. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void rgb_unpack( + vint4 input0, + vint4 input1, + vint4& output0, + vint4& output1 +) { + rgba_unpack(input0, input1, output0, output1); + output0.set_lane<3>(255); + output1.set_lane<3>(255); +} + +/** + * @brief Unpack an LDR RGBA color that uses scaled encoding. 
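+ *
+ * Editorial note (not upstream text): endpoint 1 is the stored RGB value and
+ * endpoint 0 is derived from it as (rgb * scale) >> 8, so a stored scale of
+ * 128 places endpoint 0 at roughly half of endpoint 1.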
+ * + * Note only the RGB channels use the scaled encoding, alpha uses direct. + * + * @param input0 The packed endpoint 0 color. + * @param alpha1 The packed endpoint 1 alpha value. + * @param scale The packed quantized scale. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void rgb_scale_alpha_unpack( + vint4 input0, + uint8_t alpha1, + uint8_t scale, + vint4& output0, + vint4& output1 +) { + output1 = input0; + output1.set_lane<3>(alpha1); + + output0 = asr<8>(input0 * scale); + output0.set_lane<3>(input0.lane<3>()); +} + +/** + * @brief Unpack an LDR RGB color that uses scaled encoding. + * + * Output alpha is 255. + * + * @param input0 The packed endpoint 0 color. + * @param scale The packed scale. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void rgb_scale_unpack( + vint4 input0, + int scale, + vint4& output0, + vint4& output1 +) { + output1 = input0; + output1.set_lane<3>(255); + + output0 = asr<8>(input0 * scale); + output0.set_lane<3>(255); +} + +/** + * @brief Unpack an LDR L color that uses direct encoding. + * + * Output alpha is 255. + * + * @param input The packed endpoints. + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void luminance_unpack( + const uint8_t input[2], + vint4& output0, + vint4& output1 +) { + int lum0 = input[0]; + int lum1 = input[1]; + output0 = vint4(lum0, lum0, lum0, 255); + output1 = vint4(lum1, lum1, lum1, 255); +} + +/** + * @brief Unpack an LDR L color that uses delta encoding. + * + * Output alpha is 255. + * + * @param input The packed endpoints (L0, L1). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void luminance_delta_unpack( + const uint8_t input[2], + vint4& output0, + vint4& output1 +) { + int v0 = input[0]; + int v1 = input[1]; + int l0 = (v0 >> 2) | (v1 & 0xC0); + int l1 = l0 + (v1 & 0x3F); + + l1 = astc::min(l1, 255); + + output0 = vint4(l0, l0, l0, 255); + output1 = vint4(l1, l1, l1, 255); +} + +/** + * @brief Unpack an LDR LA color that uses direct encoding. + * + * @param input The packed endpoints (L0, L1, A0, A1). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void luminance_alpha_unpack( + const uint8_t input[4], + vint4& output0, + vint4& output1 +) { + int lum0 = input[0]; + int lum1 = input[1]; + int alpha0 = input[2]; + int alpha1 = input[3]; + output0 = vint4(lum0, lum0, lum0, alpha0); + output1 = vint4(lum1, lum1, lum1, alpha1); +} + +/** + * @brief Unpack an LDR LA color that uses delta encoding. + * + * @param input The packed endpoints (L0, L1, A0, A1). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. 
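+ *
+ * Editorial sketch (not upstream text): for stored luminance bytes
+ * (0x68, 0xA8), bit 7 of the second byte restores the ninth bit of the base
+ * (0x168, i.e. 360 in unorm9) and the remaining seven bits hold a signed
+ * offset of +40, so the decoded endpoints are 360 >> 1 = 180 and
+ * 180 + (40 >> 1) = 200, mirroring the encode-side sketch earlier in this file.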
+ */ +static void luminance_alpha_delta_unpack( + const uint8_t input[4], + vint4& output0, + vint4& output1 +) { + int lum0 = input[0]; + int lum1 = input[1]; + int alpha0 = input[2]; + int alpha1 = input[3]; + + lum0 |= (lum1 & 0x80) << 1; + alpha0 |= (alpha1 & 0x80) << 1; + lum1 &= 0x7F; + alpha1 &= 0x7F; + + if (lum1 & 0x40) + { + lum1 -= 0x80; + } + + if (alpha1 & 0x40) + { + alpha1 -= 0x80; + } + + lum0 >>= 1; + lum1 >>= 1; + alpha0 >>= 1; + alpha1 >>= 1; + lum1 += lum0; + alpha1 += alpha0; + + lum1 = astc::clamp(lum1, 0, 255); + alpha1 = astc::clamp(alpha1, 0, 255); + + output0 = vint4(lum0, lum0, lum0, alpha0); + output1 = vint4(lum1, lum1, lum1, alpha1); +} + +/** + * @brief Unpack an HDR RGB + offset encoding. + * + * @param input The packed endpoints (packed and modal). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void hdr_rgbo_unpack( + const uint8_t input[4], + vint4& output0, + vint4& output1 +) { + int v0 = input[0]; + int v1 = input[1]; + int v2 = input[2]; + int v3 = input[3]; + + int modeval = ((v0 & 0xC0) >> 6) | (((v1 & 0x80) >> 7) << 2) | (((v2 & 0x80) >> 7) << 3); + + int majcomp; + int mode; + if ((modeval & 0xC) != 0xC) + { + majcomp = modeval >> 2; + mode = modeval & 3; + } + else if (modeval != 0xF) + { + majcomp = modeval & 3; + mode = 4; + } + else + { + majcomp = 0; + mode = 5; + } + + int red = v0 & 0x3F; + int green = v1 & 0x1F; + int blue = v2 & 0x1F; + int scale = v3 & 0x1F; + + int bit0 = (v1 >> 6) & 1; + int bit1 = (v1 >> 5) & 1; + int bit2 = (v2 >> 6) & 1; + int bit3 = (v2 >> 5) & 1; + int bit4 = (v3 >> 7) & 1; + int bit5 = (v3 >> 6) & 1; + int bit6 = (v3 >> 5) & 1; + + int ohcomp = 1 << mode; + + if (ohcomp & 0x30) + green |= bit0 << 6; + if (ohcomp & 0x3A) + green |= bit1 << 5; + if (ohcomp & 0x30) + blue |= bit2 << 6; + if (ohcomp & 0x3A) + blue |= bit3 << 5; + + if (ohcomp & 0x3D) + scale |= bit6 << 5; + if (ohcomp & 0x2D) + scale |= bit5 << 6; + if (ohcomp & 0x04) + scale |= bit4 << 7; + + if (ohcomp & 0x3B) + red |= bit4 << 6; + if (ohcomp & 0x04) + red |= bit3 << 6; + + if (ohcomp & 0x10) + red |= bit5 << 7; + if (ohcomp & 0x0F) + red |= bit2 << 7; + + if (ohcomp & 0x05) + red |= bit1 << 8; + if (ohcomp & 0x0A) + red |= bit0 << 8; + + if (ohcomp & 0x05) + red |= bit0 << 9; + if (ohcomp & 0x02) + red |= bit6 << 9; + + if (ohcomp & 0x01) + red |= bit3 << 10; + if (ohcomp & 0x02) + red |= bit5 << 10; + + // expand to 12 bits. + static const int shamts[6] { 1, 1, 2, 3, 4, 5 }; + int shamt = shamts[mode]; + red <<= shamt; + green <<= shamt; + blue <<= shamt; + scale <<= shamt; + + // on modes 0 to 4, the values stored for "green" and "blue" are differentials, + // not absolute values. + if (mode != 5) + { + green = red - green; + blue = red - blue; + } + + // switch around components. + int temp; + switch (majcomp) + { + case 1: + temp = red; + red = green; + green = temp; + break; + case 2: + temp = red; + red = blue; + blue = temp; + break; + default: + break; + } + + int red0 = red - scale; + int green0 = green - scale; + int blue0 = blue - scale; + + // clamp to [0,0xFFF]. + if (red < 0) + red = 0; + if (green < 0) + green = 0; + if (blue < 0) + blue = 0; + + if (red0 < 0) + red0 = 0; + if (green0 < 0) + green0 = 0; + if (blue0 < 0) + blue0 = 0; + + output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800); + output1 = vint4(red << 4, green << 4, blue << 4, 0x7800); +} + +/** + * @brief Unpack an HDR RGB direct encoding. 
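+ *
+ * Editorial note (not upstream text): a major-component code of 3 selects
+ * the flat fallback (direct 8/8/7-bit channels); otherwise the endpoints are
+ * rebuilt from the a/b/c/d decomposition, inverting the relations used on
+ * the encode side in quantize_hdr_rgb().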
+ * + * @param input The packed endpoints (packed and modal). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void hdr_rgb_unpack( + const uint8_t input[6], + vint4& output0, + vint4& output1 +) { + + int v0 = input[0]; + int v1 = input[1]; + int v2 = input[2]; + int v3 = input[3]; + int v4 = input[4]; + int v5 = input[5]; + + // extract all the fixed-placement bitfields + int modeval = ((v1 & 0x80) >> 7) | (((v2 & 0x80) >> 7) << 1) | (((v3 & 0x80) >> 7) << 2); + + int majcomp = ((v4 & 0x80) >> 7) | (((v5 & 0x80) >> 7) << 1); + + if (majcomp == 3) + { + output0 = vint4(v0 << 8, v2 << 8, (v4 & 0x7F) << 9, 0x7800); + output1 = vint4(v1 << 8, v3 << 8, (v5 & 0x7F) << 9, 0x7800); + return; + } + + int a = v0 | ((v1 & 0x40) << 2); + int b0 = v2 & 0x3f; + int b1 = v3 & 0x3f; + int c = v1 & 0x3f; + int d0 = v4 & 0x7f; + int d1 = v5 & 0x7f; + + // get hold of the number of bits in 'd0' and 'd1' + static const int dbits_tab[8] { 7, 6, 7, 6, 5, 6, 5, 6 }; + int dbits = dbits_tab[modeval]; + + // extract six variable-placement bits + int bit0 = (v2 >> 6) & 1; + int bit1 = (v3 >> 6) & 1; + int bit2 = (v4 >> 6) & 1; + int bit3 = (v5 >> 6) & 1; + int bit4 = (v4 >> 5) & 1; + int bit5 = (v5 >> 5) & 1; + + // and prepend the variable-placement bits depending on mode. + int ohmod = 1 << modeval; // one-hot-mode + if (ohmod & 0xA4) + a |= bit0 << 9; + if (ohmod & 0x8) + a |= bit2 << 9; + if (ohmod & 0x50) + a |= bit4 << 9; + + if (ohmod & 0x50) + a |= bit5 << 10; + if (ohmod & 0xA0) + a |= bit1 << 10; + + if (ohmod & 0xC0) + a |= bit2 << 11; + + if (ohmod & 0x4) + c |= bit1 << 6; + if (ohmod & 0xE8) + c |= bit3 << 6; + + if (ohmod & 0x20) + c |= bit2 << 7; + + if (ohmod & 0x5B) + { + b0 |= bit0 << 6; + b1 |= bit1 << 6; + } + + if (ohmod & 0x12) + { + b0 |= bit2 << 7; + b1 |= bit3 << 7; + } + + if (ohmod & 0xAF) + { + d0 |= bit4 << 5; + d1 |= bit5 << 5; + } + + if (ohmod & 0x5) + { + d0 |= bit2 << 6; + d1 |= bit3 << 6; + } + + // sign-extend 'd0' and 'd1' + // note: this code assumes that signed right-shift actually sign-fills, not zero-fills. + int32_t d0x = d0; + int32_t d1x = d1; + int sx_shamt = 32 - dbits; + d0x <<= sx_shamt; + d0x >>= sx_shamt; + d1x <<= sx_shamt; + d1x >>= sx_shamt; + d0 = d0x; + d1 = d1x; + + // expand all values to 12 bits, with left-shift as needed. + int val_shamt = (modeval >> 1) ^ 3; + a <<= val_shamt; + b0 <<= val_shamt; + b1 <<= val_shamt; + c <<= val_shamt; + d0 <<= val_shamt; + d1 <<= val_shamt; + + // then compute the actual color values. 
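+ // Editorial note (not upstream text): these expressions invert the
+ // encode-side definitions; e.g. green0 = a - b0 - c - d0 undoes
+ // d0 = a - b0 - c - color0.g from quantize_hdr_rgb().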
+ int red1 = a; + int green1 = a - b0; + int blue1 = a - b1; + int red0 = a - c; + int green0 = a - b0 - c - d0; + int blue0 = a - b1 - c - d1; + + // clamp the color components to [0,2^12 - 1] + red0 = astc::clamp(red0, 0, 4095); + green0 = astc::clamp(green0, 0, 4095); + blue0 = astc::clamp(blue0, 0, 4095); + + red1 = astc::clamp(red1, 0, 4095); + green1 = astc::clamp(green1, 0, 4095); + blue1 = astc::clamp(blue1, 0, 4095); + + // switch around the color components + int temp0, temp1; + switch (majcomp) + { + case 1: // switch around red and green + temp0 = red0; + temp1 = red1; + red0 = green0; + red1 = green1; + green0 = temp0; + green1 = temp1; + break; + case 2: // switch around red and blue + temp0 = red0; + temp1 = red1; + red0 = blue0; + red1 = blue1; + blue0 = temp0; + blue1 = temp1; + break; + case 0: // no switch + break; + } + + output0 = vint4(red0 << 4, green0 << 4, blue0 << 4, 0x7800); + output1 = vint4(red1 << 4, green1 << 4, blue1 << 4, 0x7800); +} + +/** + * @brief Unpack an HDR RGB + LDR A direct encoding. + * + * @param input The packed endpoints (packed and modal). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void hdr_rgb_ldr_alpha_unpack( + const uint8_t input[8], + vint4& output0, + vint4& output1 +) { + hdr_rgb_unpack(input, output0, output1); + + int v6 = input[6]; + int v7 = input[7]; + output0.set_lane<3>(v6); + output1.set_lane<3>(v7); +} + +/** + * @brief Unpack an HDR L (small range) direct encoding. + * + * @param input The packed endpoints (packed and modal). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void hdr_luminance_small_range_unpack( + const uint8_t input[2], + vint4& output0, + vint4& output1 +) { + int v0 = input[0]; + int v1 = input[1]; + + int y0, y1; + if (v0 & 0x80) + { + y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2); + y1 = (v1 & 0x1F) << 2; + } + else + { + y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1); + y1 = (v1 & 0xF) << 1; + } + + y1 += y0; + if (y1 > 0xFFF) + { + y1 = 0xFFF; + } + + output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800); + output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800); +} + +/** + * @brief Unpack an HDR L (large range) direct encoding. + * + * @param input The packed endpoints (packed and modal). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void hdr_luminance_large_range_unpack( + const uint8_t input[2], + vint4& output0, + vint4& output1 +) { + int v0 = input[0]; + int v1 = input[1]; + + int y0, y1; + if (v1 >= v0) + { + y0 = v0 << 4; + y1 = v1 << 4; + } + else + { + y0 = (v1 << 4) + 8; + y1 = (v0 << 4) - 8; + } + + output0 = vint4(y0 << 4, y0 << 4, y0 << 4, 0x7800); + output1 = vint4(y1 << 4, y1 << 4, y1 << 4, 0x7800); +} + +/** + * @brief Unpack an HDR A direct encoding. + * + * @param input The packed endpoints (packed and modal). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. 
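+ *
+ * Note that both outputs are returned as 16-bit values: the unpacked 12-bit
+ * alpha is scaled by 16 via the final left-shift by 4.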
+ */ +static void hdr_alpha_unpack( + const uint8_t input[2], + int& output0, + int& output1 +) { + + int v6 = input[0]; + int v7 = input[1]; + + int selector = ((v6 >> 7) & 1) | ((v7 >> 6) & 2); + v6 &= 0x7F; + v7 &= 0x7F; + if (selector == 3) + { + output0 = v6 << 5; + output1 = v7 << 5; + } + else + { + v6 |= (v7 << (selector + 1)) & 0x780; + v7 &= (0x3f >> selector); + v7 ^= 32 >> selector; + v7 -= 32 >> selector; + v6 <<= (4 - selector); + v7 <<= (4 - selector); + v7 += v6; + + if (v7 < 0) + { + v7 = 0; + } + else if (v7 > 0xFFF) + { + v7 = 0xFFF; + } + + output0 = v6; + output1 = v7; + } + + output0 <<= 4; + output1 <<= 4; +} + +/** + * @brief Unpack an HDR RGBA direct encoding. + * + * @param input The packed endpoints (packed and modal). + * @param[out] output0 The unpacked endpoint 0 color. + * @param[out] output1 The unpacked endpoint 1 color. + */ +static void hdr_rgb_hdr_alpha_unpack( + const uint8_t input[8], + vint4& output0, + vint4& output1 +) { + hdr_rgb_unpack(input, output0, output1); + + int alpha0, alpha1; + hdr_alpha_unpack(input + 6, alpha0, alpha1); + + output0.set_lane<3>(alpha0); + output1.set_lane<3>(alpha1); +} + +/* See header for documentation. */ +void unpack_color_endpoints( + astcenc_profile decode_mode, + int format, + const uint8_t* input, + bool& rgb_hdr, + bool& alpha_hdr, + vint4& output0, + vint4& output1 +) { + // Assume no NaNs and LDR endpoints unless set later + rgb_hdr = false; + alpha_hdr = false; + + bool alpha_hdr_default = false; + + switch (format) + { + case FMT_LUMINANCE: + luminance_unpack(input, output0, output1); + break; + + case FMT_LUMINANCE_DELTA: + luminance_delta_unpack(input, output0, output1); + break; + + case FMT_HDR_LUMINANCE_SMALL_RANGE: + rgb_hdr = true; + alpha_hdr_default = true; + hdr_luminance_small_range_unpack(input, output0, output1); + break; + + case FMT_HDR_LUMINANCE_LARGE_RANGE: + rgb_hdr = true; + alpha_hdr_default = true; + hdr_luminance_large_range_unpack(input, output0, output1); + break; + + case FMT_LUMINANCE_ALPHA: + luminance_alpha_unpack(input, output0, output1); + break; + + case FMT_LUMINANCE_ALPHA_DELTA: + luminance_alpha_delta_unpack(input, output0, output1); + break; + + case FMT_RGB_SCALE: + { + vint4 input0q(input[0], input[1], input[2], 0); + uint8_t scale = input[3]; + rgb_scale_unpack(input0q, scale, output0, output1); + } + break; + + case FMT_RGB_SCALE_ALPHA: + { + vint4 input0q(input[0], input[1], input[2], input[4]); + uint8_t alpha1q = input[5]; + uint8_t scaleq = input[3]; + rgb_scale_alpha_unpack(input0q, alpha1q, scaleq, output0, output1); + } + break; + + case FMT_HDR_RGB_SCALE: + rgb_hdr = true; + alpha_hdr_default = true; + hdr_rgbo_unpack(input, output0, output1); + break; + + case FMT_RGB: + { + vint4 input0q(input[0], input[2], input[4], 0); + vint4 input1q(input[1], input[3], input[5], 0); + rgb_unpack(input0q, input1q, output0, output1); + } + break; + + case FMT_RGB_DELTA: + { + vint4 input0q(input[0], input[2], input[4], 0); + vint4 input1q(input[1], input[3], input[5], 0); + rgb_delta_unpack(input0q, input1q, output0, output1); + } + break; + + case FMT_HDR_RGB: + rgb_hdr = true; + alpha_hdr_default = true; + hdr_rgb_unpack(input, output0, output1); + break; + + case FMT_RGBA: + { + vint4 input0q(input[0], input[2], input[4], input[6]); + vint4 input1q(input[1], input[3], input[5], input[7]); + rgba_unpack(input0q, input1q, output0, output1); + } + break; + + case FMT_RGBA_DELTA: + { + vint4 input0q(input[0], input[2], input[4], input[6]); + vint4 input1q(input[1], input[3], 
input[5], input[7]); + rgba_delta_unpack(input0q, input1q, output0, output1); + } + break; + + case FMT_HDR_RGB_LDR_ALPHA: + rgb_hdr = true; + hdr_rgb_ldr_alpha_unpack(input, output0, output1); + break; + + case FMT_HDR_RGBA: + rgb_hdr = true; + alpha_hdr = true; + hdr_rgb_hdr_alpha_unpack(input, output0, output1); + break; + } + + // Assign a correct default alpha + if (alpha_hdr_default) + { + if (decode_mode == ASTCENC_PRF_HDR) + { + output0.set_lane<3>(0x7800); + output1.set_lane<3>(0x7800); + alpha_hdr = true; + } + else + { + output0.set_lane<3>(0x00FF); + output1.set_lane<3>(0x00FF); + alpha_hdr = false; + } + } + + vint4 ldr_scale(257); + vint4 hdr_scale(1); + vint4 output_scale = ldr_scale; + + // An LDR profile image + if ((decode_mode == ASTCENC_PRF_LDR) || + (decode_mode == ASTCENC_PRF_LDR_SRGB)) + { + // Also matches HDR alpha, as cannot have HDR alpha without HDR RGB + if (rgb_hdr == true) + { + output0 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); + output1 = vint4(0xFF00, 0x0000, 0xFF00, 0xFF00); + output_scale = hdr_scale; + + rgb_hdr = false; + alpha_hdr = false; + } + } + // An HDR profile image + else + { + vmask4 hdr_lanes(rgb_hdr, rgb_hdr, rgb_hdr, alpha_hdr); + output_scale = select(ldr_scale, hdr_scale, hdr_lanes); + } + + output0 = output0 * output_scale; + output1 = output1 * output_scale; +} diff --git a/thirdparty/astcenc/astcenc_compress_symbolic.cpp b/thirdparty/astcenc/astcenc_compress_symbolic.cpp new file mode 100644 index 0000000000..afb76246e7 --- /dev/null +++ b/thirdparty/astcenc/astcenc_compress_symbolic.cpp @@ -0,0 +1,1455 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/** + * @brief Functions to compress a symbolic block. + */ + +#include "astcenc_internal.h" +#include "astcenc_diagnostic_trace.h" + +#include <cassert> + +/** + * @brief Merge two planes of endpoints into a single vector. + * + * @param ep_plane1 The endpoints for plane 1. + * @param ep_plane2 The endpoints for plane 2. + * @param component_plane2 The color component for plane 2. + * @param[out] result The merged output. + */ +static void merge_endpoints( + const endpoints& ep_plane1, + const endpoints& ep_plane2, + unsigned int component_plane2, + endpoints& result +) { + unsigned int partition_count = ep_plane1.partition_count; + assert(partition_count == 1); + + vmask4 sep_mask = vint4::lane_id() == vint4(component_plane2); + + result.partition_count = partition_count; + result.endpt0[0] = select(ep_plane1.endpt0[0], ep_plane2.endpt0[0], sep_mask); + result.endpt1[0] = select(ep_plane1.endpt1[0], ep_plane2.endpt1[0], sep_mask); +} + +/** + * @brief Attempt to improve weights given a chosen configuration. 
+ * + * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per + * partition and per plane) and attempt to improve image quality by moving each weight up by one or + * down by one quantization step. + * + * This is a specialized function which only supports operating on undecimated weight grids, + * therefore primarily improving the performance of 4x4 and 5x5 blocks where grid decimation + * is needed less often. + * + * @param decode_mode The decode mode (LDR, HDR). + * @param bsd The block size information. + * @param blk The image block color data to compress. + * @param[out] scb The symbolic compressed block output. + */ +static bool realign_weights_undecimated( + astcenc_profile decode_mode, + const block_size_descriptor& bsd, + const image_block& blk, + symbolic_compressed_block& scb +) { + // Get the partition descriptor + unsigned int partition_count = scb.partition_count; + const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); + + // Get the quantization table + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + unsigned int weight_quant_level = bm.quant_mode; + const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; + + unsigned int max_plane = bm.is_dual_plane; + int plane2_component = scb.plane2_component; + vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); + + // Decode the color endpoints + bool rgb_hdr; + bool alpha_hdr; + vint4 endpnt0[BLOCK_MAX_PARTITIONS]; + vint4 endpnt1[BLOCK_MAX_PARTITIONS]; + vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; + vfloat4 offset[BLOCK_MAX_PARTITIONS]; + + promise(partition_count > 0); + + for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) + { + unpack_color_endpoints(decode_mode, + scb.color_formats[pa_idx], + scb.color_values[pa_idx], + rgb_hdr, alpha_hdr, + endpnt0[pa_idx], + endpnt1[pa_idx]); + } + + uint8_t* dec_weights_uquant = scb.weights; + bool adjustments = false; + + // For each plane and partition ... 
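+	// As a sketch of what the loop below computes: for each weight w with
+	// quantization neighbours w- and w+, using e = interpolated - original
+	// and o = (endpoint delta) / 64, the three candidate errors are
+	//     E(w)  = dot(e * e, channel_weight)
+	//     E(w-) = dot((e + o * (w- - w))^2, channel_weight)
+	//     E(w+) = dot((e + o * (w+ - w))^2, channel_weight)
+	// and a one-step move is kept only if it strictly lowers the error.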
+ for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) + { + for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) + { + // Compute the endpoint delta for all components in current plane + vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; + epd = select(epd, vint4::zero(), plane_mask); + + endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); + offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); + } + + // For each weight compute previous, current, and next errors + promise(bsd.texel_count > 0); + for (unsigned int texel = 0; texel < bsd.texel_count; texel++) + { + int uqw = dec_weights_uquant[texel]; + + uint32_t prev_and_next = qat.prev_next_values[uqw]; + int uqw_down = prev_and_next & 0xFF; + int uqw_up = (prev_and_next >> 8) & 0xFF; + + // Interpolate the colors to create the diffs + float weight_base = static_cast<float>(uqw); + float weight_down = static_cast<float>(uqw_down - uqw); + float weight_up = static_cast<float>(uqw_up - uqw); + + unsigned int partition = pi.partition_of_texel[texel]; + vfloat4 color_offset = offset[partition]; + vfloat4 color_base = endpnt0f[partition]; + + vfloat4 color = color_base + color_offset * weight_base; + vfloat4 orig_color = blk.texel(texel); + vfloat4 error_weight = blk.channel_weight; + + vfloat4 color_diff = color - orig_color; + vfloat4 color_diff_down = color_diff + color_offset * weight_down; + vfloat4 color_diff_up = color_diff + color_offset * weight_up; + + float error_base = dot_s(color_diff * color_diff, error_weight); + float error_down = dot_s(color_diff_down * color_diff_down, error_weight); + float error_up = dot_s(color_diff_up * color_diff_up, error_weight); + + // Check if the prev or next error is better, and if so use it + if ((error_up < error_base) && (error_up < error_down) && (uqw < 64)) + { + dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_up); + adjustments = true; + } + else if ((error_down < error_base) && (uqw > 0)) + { + dec_weights_uquant[texel] = static_cast<uint8_t>(uqw_down); + adjustments = true; + } + } + + // Prepare iteration for plane 2 + dec_weights_uquant += WEIGHTS_PLANE2_OFFSET; + plane_mask = ~plane_mask; + } + + return adjustments; +} + +/** + * @brief Attempt to improve weights given a chosen configuration. + * + * Given a fixed weight grid decimation and weight value quantization, iterate over all weights (per + * partition and per plane) and attempt to improve image quality by moving each weight up by one or + * down by one quantization step. + * + * @param decode_mode The decode mode (LDR, HDR). + * @param bsd The block size information. + * @param blk The image block color data to compress. + * @param[out] scb The symbolic compressed block output. 
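+ *
+ * @return True if any weight was adjusted, false otherwise.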
+ */ +static bool realign_weights_decimated( + astcenc_profile decode_mode, + const block_size_descriptor& bsd, + const image_block& blk, + symbolic_compressed_block& scb +) { + // Get the partition descriptor + unsigned int partition_count = scb.partition_count; + const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); + + // Get the quantization table + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + unsigned int weight_quant_level = bm.quant_mode; + const quant_and_transfer_table& qat = quant_and_xfer_tables[weight_quant_level]; + + // Get the decimation table + const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); + unsigned int weight_count = di.weight_count; + assert(weight_count != bsd.texel_count); + + unsigned int max_plane = bm.is_dual_plane; + int plane2_component = scb.plane2_component; + vmask4 plane_mask = vint4::lane_id() == vint4(plane2_component); + + // Decode the color endpoints + bool rgb_hdr; + bool alpha_hdr; + vint4 endpnt0[BLOCK_MAX_PARTITIONS]; + vint4 endpnt1[BLOCK_MAX_PARTITIONS]; + vfloat4 endpnt0f[BLOCK_MAX_PARTITIONS]; + vfloat4 offset[BLOCK_MAX_PARTITIONS]; + + promise(partition_count > 0); + promise(weight_count > 0); + + for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) + { + unpack_color_endpoints(decode_mode, + scb.color_formats[pa_idx], + scb.color_values[pa_idx], + rgb_hdr, alpha_hdr, + endpnt0[pa_idx], + endpnt1[pa_idx]); + } + + uint8_t* dec_weights_uquant = scb.weights; + bool adjustments = false; + + // For each plane and partition ... + for (unsigned int pl_idx = 0; pl_idx <= max_plane; pl_idx++) + { + for (unsigned int pa_idx = 0; pa_idx < partition_count; pa_idx++) + { + // Compute the endpoint delta for all components in current plane + vint4 epd = endpnt1[pa_idx] - endpnt0[pa_idx]; + epd = select(epd, vint4::zero(), plane_mask); + + endpnt0f[pa_idx] = int_to_float(endpnt0[pa_idx]); + offset[pa_idx] = int_to_float(epd) * (1.0f / 64.0f); + } + + // Create an unquantized weight grid for this decimation level + alignas(ASTCENC_VECALIGN) float uq_weightsf[BLOCK_MAX_WEIGHTS]; + for (unsigned int we_idx = 0; we_idx < weight_count; we_idx += ASTCENC_SIMD_WIDTH) + { + vint unquant_value(dec_weights_uquant + we_idx); + vfloat unquant_valuef = int_to_float(unquant_value); + storea(unquant_valuef, uq_weightsf + we_idx); + } + + // For each weight compute previous, current, and next errors + for (unsigned int we_idx = 0; we_idx < weight_count; we_idx++) + { + int uqw = dec_weights_uquant[we_idx]; + uint32_t prev_and_next = qat.prev_next_values[uqw]; + + float uqw_base = uq_weightsf[we_idx]; + float uqw_down = static_cast<float>(prev_and_next & 0xFF); + float uqw_up = static_cast<float>((prev_and_next >> 8) & 0xFF); + + float uqw_diff_down = uqw_down - uqw_base; + float uqw_diff_up = uqw_up - uqw_base; + + vfloat4 error_basev = vfloat4::zero(); + vfloat4 error_downv = vfloat4::zero(); + vfloat4 error_upv = vfloat4::zero(); + + // Interpolate the colors to create the diffs + unsigned int texels_to_evaluate = di.weight_texel_count[we_idx]; + promise(texels_to_evaluate > 0); + for (unsigned int te_idx = 0; te_idx < texels_to_evaluate; te_idx++) + { + unsigned int texel = di.weight_texels_tr[te_idx][we_idx]; + + float tw_base = di.texel_contrib_for_weight[te_idx][we_idx]; + + float weight_base = (uq_weightsf[di.texel_weights_tr[0][texel]] * di.texel_weight_contribs_float_tr[0][texel] + + uq_weightsf[di.texel_weights_tr[1][texel]] * di.texel_weight_contribs_float_tr[1][texel]) + + 
(uq_weightsf[di.texel_weights_tr[2][texel]] * di.texel_weight_contribs_float_tr[2][texel]
+				   + uq_weightsf[di.texel_weights_tr[3][texel]] * di.texel_weight_contribs_float_tr[3][texel]);
+
+				// Ideally this would be integer rounded, but the IQ gain isn't worth the overhead
+				// float weight = astc::flt_rd(weight_base + 0.5f);
+				// float weight_down = astc::flt_rd(weight_base + 0.5f + uqw_diff_down * tw_base) - weight;
+				// float weight_up = astc::flt_rd(weight_base + 0.5f + uqw_diff_up * tw_base) - weight;
+				float weight_down = weight_base + uqw_diff_down * tw_base - weight_base;
+				float weight_up = weight_base + uqw_diff_up * tw_base - weight_base;
+
+				unsigned int partition = pi.partition_of_texel[texel];
+				vfloat4 color_offset = offset[partition];
+				vfloat4 color_base = endpnt0f[partition];
+
+				vfloat4 color = color_base + color_offset * weight_base;
+				vfloat4 orig_color = blk.texel(texel);
+
+				vfloat4 color_diff = color - orig_color;
+				vfloat4 color_down_diff = color_diff + color_offset * weight_down;
+				vfloat4 color_up_diff = color_diff + color_offset * weight_up;
+
+				error_basev += color_diff * color_diff;
+				error_downv += color_down_diff * color_down_diff;
+				error_upv += color_up_diff * color_up_diff;
+			}
+
+			vfloat4 error_weight = blk.channel_weight;
+			float error_base = hadd_s(error_basev * error_weight);
+			float error_down = hadd_s(error_downv * error_weight);
+			float error_up = hadd_s(error_upv * error_weight);
+
+			// Check if the prev or next error is better, and if so use it
+			if ((error_up < error_base) && (error_up < error_down) && (uqw < 64))
+			{
+				uq_weightsf[we_idx] = uqw_up;
+				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_up);
+				adjustments = true;
+			}
+			else if ((error_down < error_base) && (uqw > 0))
+			{
+				uq_weightsf[we_idx] = uqw_down;
+				dec_weights_uquant[we_idx] = static_cast<uint8_t>(uqw_down);
+				adjustments = true;
+			}
+		}
+
+		// Prepare iteration for plane 2
+		dec_weights_uquant += WEIGHTS_PLANE2_OFFSET;
+		plane_mask = ~plane_mask;
+	}
+
+	return adjustments;
+}
+
+/**
+ * @brief Compress a block using a chosen partitioning and 1 plane of weights.
+ *
+ * @param config The compressor configuration.
+ * @param bsd The block size information.
+ * @param blk The image block color data to compress.
+ * @param only_always True if we only use "always" percentile block modes.
+ * @param tune_errorval_threshold The error value threshold.
+ * @param partition_count The partition count.
+ * @param partition_index The partition index if @c partition_count is 2-4.
+ * @param[out] scb The symbolic compressed block output.
+ * @param[out] tmpbuf The preallocated scratch working buffers.
+ * @param quant_limit The maximum weight quantization level to consider.
+ *
+ * @return The best error value found across the trialed candidates.
+ */ +static float compress_symbolic_block_for_partition_1plane( + const astcenc_config& config, + const block_size_descriptor& bsd, + const image_block& blk, + bool only_always, + float tune_errorval_threshold, + unsigned int partition_count, + unsigned int partition_index, + symbolic_compressed_block& scb, + compression_working_buffers& tmpbuf, + int quant_limit +) { + promise(partition_count > 0); + promise(config.tune_candidate_limit > 0); + promise(config.tune_refinement_limit > 0); + + int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); + + auto compute_difference = &compute_symbolic_block_difference_1plane; + if ((partition_count == 1) && !(config.flags & ASTCENC_FLG_MAP_RGBM)) + { + compute_difference = &compute_symbolic_block_difference_1plane_1partition; + } + + const auto& pi = bsd.get_partition_info(partition_count, partition_index); + + // Compute ideal weights and endpoint colors, with no quantization or decimation + endpoints_and_weights& ei = tmpbuf.ei1; + compute_ideal_colors_and_weights_1plane(blk, pi, ei); + + // Compute ideal weights and endpoint colors for every decimation + float* dec_weights_ideal = tmpbuf.dec_weights_ideal; + uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant; + + // For each decimation mode, compute an ideal set of weights with no quantization + unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always + : bsd.decimation_mode_count_selected; + promise(max_decimation_modes > 0); + for (unsigned int i = 0; i < max_decimation_modes; i++) + { + const auto& dm = bsd.get_decimation_mode(i); + if (!dm.is_ref_1_plane(static_cast<quant_method>(max_weight_quant))) + { + continue; + } + + const auto& di = bsd.get_decimation_info(i); + + compute_ideal_weights_for_decimation( + ei, + di, + dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); + } + + // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal + // weight pair, compute the smallest weight that will result in a color value greater than 1 + vfloat4 min_ep(10.0f); + for (unsigned int i = 0; i < partition_count; i++) + { + vfloat4 ep = (vfloat4(1.0f) - ei.ep.endpt0[i]) / (ei.ep.endpt1[i] - ei.ep.endpt0[i]); + + vmask4 use_ep = (ep > vfloat4(0.5f)) & (ep < min_ep); + min_ep = select(min_ep, ep, use_ep); + } + + float min_wt_cutoff = hmin_s(min_ep); + + // For each mode, use the angular method to compute a shift + compute_angular_endpoints_1plane( + only_always, bsd, dec_weights_ideal, max_weight_quant, tmpbuf); + + float* weight_low_value = tmpbuf.weight_low_value1; + float* weight_high_value = tmpbuf.weight_high_value1; + int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts; + float* qwt_errors = tmpbuf.qwt_errors; + + // For each mode (which specifies a decimation and a quantization): + // * Compute number of bits needed for the quantized weights + // * Generate an optimized set of quantized weights + // * Compute quantization errors for the mode + + + static const int8_t free_bits_for_partition_count[4] { + 115 - 4, 111 - 4 - PARTITION_INDEX_BITS, 108 - 4 - PARTITION_INDEX_BITS, 105 - 4 - PARTITION_INDEX_BITS + }; + + unsigned int max_block_modes = only_always ? 
bsd.block_mode_count_1plane_always + : bsd.block_mode_count_1plane_selected; + promise(max_block_modes > 0); + for (unsigned int i = 0; i < max_block_modes; i++) + { + const block_mode& bm = bsd.block_modes[i]; + + if (bm.quant_mode > max_weight_quant) + { + qwt_errors[i] = 1e38f; + continue; + } + + assert(!bm.is_dual_plane); + int bitcount = free_bits_for_partition_count[partition_count - 1] - bm.weight_bits; + if (bitcount <= 0) + { + qwt_errors[i] = 1e38f; + continue; + } + + if (weight_high_value[i] > 1.02f * min_wt_cutoff) + { + weight_high_value[i] = 1.0f; + } + + int decimation_mode = bm.decimation_mode; + const auto& di = bsd.get_decimation_info(decimation_mode); + + qwt_bitcounts[i] = static_cast<int8_t>(bitcount); + + alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; + + // Generate the optimized set of weights for the weight mode + compute_quantized_weights_for_decimation( + di, + weight_low_value[i], weight_high_value[i], + dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode, + dec_weights_uquantf, + dec_weights_uquant + BLOCK_MAX_WEIGHTS * i, + bm.get_weight_quant_mode()); + + // Compute weight quantization errors for the block mode + qwt_errors[i] = compute_error_of_weight_set_1plane( + ei, + di, + dec_weights_uquantf); + } + + // Decide the optimal combination of color endpoint encodings and weight encodings + uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS]; + int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES]; + + quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES]; + quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; + + unsigned int candidate_count = compute_ideal_endpoint_formats( + pi, blk, ei.ep, qwt_bitcounts, qwt_errors, + config.tune_candidate_limit, 0, max_block_modes, + partition_format_specifiers, block_mode_index, + color_quant_level, color_quant_level_mod, tmpbuf); + + // Iterate over the N believed-to-be-best modes to find out which one is actually best + float best_errorval_in_mode = ERROR_CALC_DEFAULT; + float best_errorval_in_scb = scb.errorval; + + for (unsigned int i = 0; i < candidate_count; i++) + { + TRACE_NODE(node0, "candidate"); + + const int bm_packed_index = block_mode_index[i]; + assert(bm_packed_index >= 0 && bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_selected)); + const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; + + int decimation_mode = qw_bm.decimation_mode; + const auto& di = bsd.get_decimation_info(decimation_mode); + promise(di.weight_count > 0); + + trace_add_data("weight_x", di.weight_x); + trace_add_data("weight_y", di.weight_y); + trace_add_data("weight_z", di.weight_z); + trace_add_data("weight_quant", qw_bm.quant_mode); + + // Recompute the ideal color endpoints before storing them + vfloat4 rgbs_colors[BLOCK_MAX_PARTITIONS]; + vfloat4 rgbo_colors[BLOCK_MAX_PARTITIONS]; + + symbolic_compressed_block workscb; + endpoints workep = ei.ep; + + uint8_t* u8_weight_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index; + + for (unsigned int j = 0; j < di.weight_count; j++) + { + workscb.weights[j] = u8_weight_src[j]; + } + + for (unsigned int l = 0; l < config.tune_refinement_limit; l++) + { + recompute_ideal_colors_1plane( + blk, pi, di, workscb.weights, + workep, rgbs_colors, rgbo_colors); + + // Quantize the chosen color, tracking if worth trying the mod value + bool all_same = color_quant_level[i] != color_quant_level_mod[i]; + for (unsigned int j = 0; j < partition_count; j++) + { + workscb.color_formats[j] = 
pack_color_endpoints(
+					workep.endpt0[j],
+					workep.endpt1[j],
+					rgbs_colors[j],
+					rgbo_colors[j],
+					partition_format_specifiers[i][j],
+					workscb.color_values[j],
+					color_quant_level[i]);
+
+				all_same = all_same && workscb.color_formats[j] == workscb.color_formats[0];
+			}
+
+			// If all the color endpoint modes are the same, we get a few more bits to store colors;
+			// let's see if we can take advantage of this: requantize all the colors and see if the
+			// endpoint modes remain the same.
+			workscb.color_formats_matched = 0;
+			if (partition_count >= 2 && all_same)
+			{
+				uint8_t colorvals[BLOCK_MAX_PARTITIONS][12];
+				uint8_t color_formats_mod[BLOCK_MAX_PARTITIONS] { 0 };
+				bool all_same_mod = true;
+				for (unsigned int j = 0; j < partition_count; j++)
+				{
+					color_formats_mod[j] = pack_color_endpoints(
+					    workep.endpt0[j],
+					    workep.endpt1[j],
+					    rgbs_colors[j],
+					    rgbo_colors[j],
+					    partition_format_specifiers[i][j],
+					    colorvals[j],
+					    color_quant_level_mod[i]);
+
+					// Early out as soon as it's no longer possible to use mod
+					if (color_formats_mod[j] != color_formats_mod[0])
+					{
+						all_same_mod = false;
+						break;
+					}
+				}
+
+				if (all_same_mod)
+				{
+					workscb.color_formats_matched = 1;
+					for (unsigned int j = 0; j < BLOCK_MAX_PARTITIONS; j++)
+					{
+						for (unsigned int k = 0; k < 8; k++)
+						{
+							workscb.color_values[j][k] = colorvals[j][k];
+						}
+
+						workscb.color_formats[j] = color_formats_mod[j];
+					}
+				}
+			}
+
+			// Store header fields
+			workscb.partition_count = static_cast<uint8_t>(partition_count);
+			workscb.partition_index = static_cast<uint16_t>(partition_index);
+			workscb.plane2_component = -1;
+			workscb.quant_mode = workscb.color_formats_matched ? color_quant_level_mod[i] : color_quant_level[i];
+			workscb.block_mode = qw_bm.mode_index;
+			workscb.block_type = SYM_BTYPE_NONCONST;
+
+			// Pre-realign test
+			if (l == 0)
+			{
+				float errorval = compute_difference(config, bsd, workscb, blk);
+				if (errorval == -ERROR_CALC_DEFAULT)
+				{
+					errorval = -errorval;
+					workscb.block_type = SYM_BTYPE_ERROR;
+				}
+
+				trace_add_data("error_prerealign", errorval);
+				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
+
+				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
+				// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
+				// drive a heuristic to skip blocks that are unlikely to catch up with the best
+				// block we have already.
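+				// As an illustration, with a hypothetical tune_refinement_limit of 4
+				// and l == 0 the threshold below is 0.045 * 4 + 1.08 = 1.26, so a
+				// candidate over 26% worse than the current best is abandoned early.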
+				unsigned int iters_remaining = config.tune_refinement_limit - l;
+				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
+				if (errorval > (threshold * best_errorval_in_scb))
+				{
+					break;
+				}
+
+				if (errorval < best_errorval_in_scb)
+				{
+					best_errorval_in_scb = errorval;
+					workscb.errorval = errorval;
+					scb = workscb;
+
+					if (errorval < tune_errorval_threshold)
+					{
+						// Skip remaining candidates - this is "good enough"
+						i = candidate_count;
+						break;
+					}
+				}
+			}
+
+			bool adjustments;
+			if (di.weight_count != bsd.texel_count)
+			{
+				adjustments = realign_weights_decimated(
+				    config.profile, bsd, blk, workscb);
+			}
+			else
+			{
+				adjustments = realign_weights_undecimated(
+				    config.profile, bsd, blk, workscb);
+			}
+
+			// Post-realign test
+			float errorval = compute_difference(config, bsd, workscb, blk);
+			if (errorval == -ERROR_CALC_DEFAULT)
+			{
+				errorval = -errorval;
+				workscb.block_type = SYM_BTYPE_ERROR;
+			}
+
+			trace_add_data("error_postrealign", errorval);
+			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
+
+			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
+			// unlikely to catch up with the best block we have already. Assume 4.5% per step
+			// to give the benefit of the doubt ...
+			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
+			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
+			if (errorval > (threshold * best_errorval_in_scb))
+			{
+				break;
+			}
+
+			if (errorval < best_errorval_in_scb)
+			{
+				best_errorval_in_scb = errorval;
+				workscb.errorval = errorval;
+				scb = workscb;
+
+				if (errorval < tune_errorval_threshold)
+				{
+					// Skip remaining candidates - this is "good enough"
+					i = candidate_count;
+					break;
+				}
+			}
+
+			if (!adjustments)
+			{
+				break;
+			}
+		}
+	}
+
+	return best_errorval_in_mode;
+}
+
+/**
+ * @brief Compress a block using a chosen partitioning and 2 planes of weights.
+ *
+ * @param config The compressor configuration.
+ * @param bsd The block size information.
+ * @param blk The image block color data to compress.
+ * @param tune_errorval_threshold The error value threshold.
+ * @param plane2_component The component index for the second plane of weights.
+ * @param[out] scb The symbolic compressed block output.
+ * @param[out] tmpbuf The preallocated scratch working buffers.
+ * @param quant_limit The maximum weight quantization level to consider.
+ *
+ * @return The best error value found across the trialed candidates.
+ */ +static float compress_symbolic_block_for_partition_2planes( + const astcenc_config& config, + const block_size_descriptor& bsd, + const image_block& blk, + float tune_errorval_threshold, + unsigned int plane2_component, + symbolic_compressed_block& scb, + compression_working_buffers& tmpbuf, + int quant_limit +) { + promise(config.tune_candidate_limit > 0); + promise(config.tune_refinement_limit > 0); + promise(bsd.decimation_mode_count_selected > 0); + + int max_weight_quant = astc::min(static_cast<int>(QUANT_32), quant_limit); + + // Compute ideal weights and endpoint colors, with no quantization or decimation + endpoints_and_weights& ei1 = tmpbuf.ei1; + endpoints_and_weights& ei2 = tmpbuf.ei2; + + compute_ideal_colors_and_weights_2planes(bsd, blk, plane2_component, ei1, ei2); + + // Compute ideal weights and endpoint colors for every decimation + float* dec_weights_ideal = tmpbuf.dec_weights_ideal; + uint8_t* dec_weights_uquant = tmpbuf.dec_weights_uquant; + + // For each decimation mode, compute an ideal set of weights with no quantization + for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++) + { + const auto& dm = bsd.get_decimation_mode(i); + if (!dm.is_ref_2_plane(static_cast<quant_method>(max_weight_quant))) + { + continue; + } + + const auto& di = bsd.get_decimation_info(i); + + compute_ideal_weights_for_decimation( + ei1, + di, + dec_weights_ideal + i * BLOCK_MAX_WEIGHTS); + + compute_ideal_weights_for_decimation( + ei2, + di, + dec_weights_ideal + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET); + } + + // Compute maximum colors for the endpoints and ideal weights, then for each endpoint and ideal + // weight pair, compute the smallest weight that will result in a color value greater than 1 + vfloat4 min_ep1(10.0f); + vfloat4 min_ep2(10.0f); + + vfloat4 ep1 = (vfloat4(1.0f) - ei1.ep.endpt0[0]) / (ei1.ep.endpt1[0] - ei1.ep.endpt0[0]); + vmask4 use_ep1 = (ep1 > vfloat4(0.5f)) & (ep1 < min_ep1); + min_ep1 = select(min_ep1, ep1, use_ep1); + + vfloat4 ep2 = (vfloat4(1.0f) - ei2.ep.endpt0[0]) / (ei2.ep.endpt1[0] - ei2.ep.endpt0[0]); + vmask4 use_ep2 = (ep2 > vfloat4(0.5f)) & (ep2 < min_ep2); + min_ep2 = select(min_ep2, ep2, use_ep2); + + vfloat4 err_max(ERROR_CALC_DEFAULT); + vmask4 err_mask = vint4::lane_id() == vint4(plane2_component); + + // Set the plane2 component to max error in ep1 + min_ep1 = select(min_ep1, err_max, err_mask); + + float min_wt_cutoff1 = hmin_s(min_ep1); + + // Set the minwt2 to the plane2 component min in ep2 + float min_wt_cutoff2 = hmin_s(select(err_max, min_ep2, err_mask)); + + compute_angular_endpoints_2planes( + bsd, dec_weights_ideal, max_weight_quant, tmpbuf); + + // For each mode (which specifies a decimation and a quantization): + // * Compute number of bits needed for the quantized weights + // * Generate an optimized set of quantized weights + // * Compute quantization errors for the mode + + float* weight_low_value1 = tmpbuf.weight_low_value1; + float* weight_high_value1 = tmpbuf.weight_high_value1; + float* weight_low_value2 = tmpbuf.weight_low_value2; + float* weight_high_value2 = tmpbuf.weight_high_value2; + + int8_t* qwt_bitcounts = tmpbuf.qwt_bitcounts; + float* qwt_errors = tmpbuf.qwt_errors; + + unsigned int start_2plane = bsd.block_mode_count_1plane_selected; + unsigned int end_2plane = bsd.block_mode_count_1plane_2plane_selected; + + for (unsigned int i = start_2plane; i < end_2plane; i++) + { + const block_mode& bm = bsd.block_modes[i]; + assert(bm.is_dual_plane); + + if (bm.quant_mode > max_weight_quant) + { + 
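+			// Flag the mode as unusable; the sentinel error excludes it from the
+			// candidate selection performed later.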
qwt_errors[i] = 1e38f; + continue; + } + + qwt_bitcounts[i] = static_cast<int8_t>(109 - bm.weight_bits); + + if (weight_high_value1[i] > 1.02f * min_wt_cutoff1) + { + weight_high_value1[i] = 1.0f; + } + + if (weight_high_value2[i] > 1.02f * min_wt_cutoff2) + { + weight_high_value2[i] = 1.0f; + } + + unsigned int decimation_mode = bm.decimation_mode; + const auto& di = bsd.get_decimation_info(decimation_mode); + + alignas(ASTCENC_VECALIGN) float dec_weights_uquantf[BLOCK_MAX_WEIGHTS]; + + // Generate the optimized set of weights for the mode + compute_quantized_weights_for_decimation( + di, + weight_low_value1[i], + weight_high_value1[i], + dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode, + dec_weights_uquantf, + dec_weights_uquant + BLOCK_MAX_WEIGHTS * i, + bm.get_weight_quant_mode()); + + compute_quantized_weights_for_decimation( + di, + weight_low_value2[i], + weight_high_value2[i], + dec_weights_ideal + BLOCK_MAX_WEIGHTS * decimation_mode + WEIGHTS_PLANE2_OFFSET, + dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET, + dec_weights_uquant + BLOCK_MAX_WEIGHTS * i + WEIGHTS_PLANE2_OFFSET, + bm.get_weight_quant_mode()); + + // Compute weight quantization errors for the block mode + qwt_errors[i] = compute_error_of_weight_set_2planes( + ei1, + ei2, + di, + dec_weights_uquantf, + dec_weights_uquantf + WEIGHTS_PLANE2_OFFSET); + } + + // Decide the optimal combination of color endpoint encodings and weight encodings + uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS]; + int block_mode_index[TUNE_MAX_TRIAL_CANDIDATES]; + + quant_method color_quant_level[TUNE_MAX_TRIAL_CANDIDATES]; + quant_method color_quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES]; + + endpoints epm; + merge_endpoints(ei1.ep, ei2.ep, plane2_component, epm); + + const auto& pi = bsd.get_partition_info(1, 0); + unsigned int candidate_count = compute_ideal_endpoint_formats( + pi, blk, epm, qwt_bitcounts, qwt_errors, + config.tune_candidate_limit, + bsd.block_mode_count_1plane_selected, bsd.block_mode_count_1plane_2plane_selected, + partition_format_specifiers, block_mode_index, + color_quant_level, color_quant_level_mod, tmpbuf); + + // Iterate over the N believed-to-be-best modes to find out which one is actually best + float best_errorval_in_mode = ERROR_CALC_DEFAULT; + float best_errorval_in_scb = scb.errorval; + + for (unsigned int i = 0; i < candidate_count; i++) + { + TRACE_NODE(node0, "candidate"); + + const int bm_packed_index = block_mode_index[i]; + assert(bm_packed_index >= static_cast<int>(bsd.block_mode_count_1plane_selected) && + bm_packed_index < static_cast<int>(bsd.block_mode_count_1plane_2plane_selected)); + const block_mode& qw_bm = bsd.block_modes[bm_packed_index]; + + int decimation_mode = qw_bm.decimation_mode; + const auto& di = bsd.get_decimation_info(decimation_mode); + promise(di.weight_count > 0); + + trace_add_data("weight_x", di.weight_x); + trace_add_data("weight_y", di.weight_y); + trace_add_data("weight_z", di.weight_z); + trace_add_data("weight_quant", qw_bm.quant_mode); + + vfloat4 rgbs_color; + vfloat4 rgbo_color; + + symbolic_compressed_block workscb; + endpoints workep = epm; + + uint8_t* u8_weight1_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index; + uint8_t* u8_weight2_src = dec_weights_uquant + BLOCK_MAX_WEIGHTS * bm_packed_index + WEIGHTS_PLANE2_OFFSET; + + for (int j = 0; j < di.weight_count; j++) + { + workscb.weights[j] = u8_weight1_src[j]; + workscb.weights[j + WEIGHTS_PLANE2_OFFSET] = u8_weight2_src[j]; + } + + for (unsigned int l = 0; l < 
config.tune_refinement_limit; l++)
+		{
+			recompute_ideal_colors_2planes(
+			    blk, bsd, di,
+			    workscb.weights, workscb.weights + WEIGHTS_PLANE2_OFFSET,
+			    workep, rgbs_color, rgbo_color, plane2_component);
+
+			// Quantize the chosen color
+			workscb.color_formats[0] = pack_color_endpoints(
+			    workep.endpt0[0],
+			    workep.endpt1[0],
+			    rgbs_color, rgbo_color,
+			    partition_format_specifiers[i][0],
+			    workscb.color_values[0],
+			    color_quant_level[i]);
+
+			// Store header fields
+			workscb.partition_count = 1;
+			workscb.partition_index = 0;
+			workscb.quant_mode = color_quant_level[i];
+			workscb.color_formats_matched = 0;
+			workscb.block_mode = qw_bm.mode_index;
+			workscb.plane2_component = static_cast<int8_t>(plane2_component);
+			workscb.block_type = SYM_BTYPE_NONCONST;
+
+			// Pre-realign test
+			if (l == 0)
+			{
+				float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
+				if (errorval == -ERROR_CALC_DEFAULT)
+				{
+					errorval = -errorval;
+					workscb.block_type = SYM_BTYPE_ERROR;
+				}
+
+				trace_add_data("error_prerealign", errorval);
+				best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
+
+				// Average refinement improvement is 3.5% per iteration (allow 4.5%), but the first
+				// iteration can help more so we give it an extra 8% leeway. Use this knowledge to
+				// drive a heuristic to skip blocks that are unlikely to catch up with the best
+				// block we have already.
+				unsigned int iters_remaining = config.tune_refinement_limit - l;
+				float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.08f;
+				if (errorval > (threshold * best_errorval_in_scb))
+				{
+					break;
+				}
+
+				if (errorval < best_errorval_in_scb)
+				{
+					best_errorval_in_scb = errorval;
+					workscb.errorval = errorval;
+					scb = workscb;
+
+					if (errorval < tune_errorval_threshold)
+					{
+						// Skip remaining candidates - this is "good enough"
+						i = candidate_count;
+						break;
+					}
+				}
+			}
+
+			// Perform a final pass over the weights to try to improve them.
+			bool adjustments;
+			if (di.weight_count != bsd.texel_count)
+			{
+				adjustments = realign_weights_decimated(
+				    config.profile, bsd, blk, workscb);
+			}
+			else
+			{
+				adjustments = realign_weights_undecimated(
+				    config.profile, bsd, blk, workscb);
+			}
+
+			// Post-realign test
+			float errorval = compute_symbolic_block_difference_2plane(config, bsd, workscb, blk);
+			if (errorval == -ERROR_CALC_DEFAULT)
+			{
+				errorval = -errorval;
+				workscb.block_type = SYM_BTYPE_ERROR;
+			}
+
+			trace_add_data("error_postrealign", errorval);
+			best_errorval_in_mode = astc::min(errorval, best_errorval_in_mode);
+
+			// Average refinement improvement is 3.5% per iteration, so skip blocks that are
+			// unlikely to catch up with the best block we have already. Assume 4.5% per step
+			// to give the benefit of the doubt ...
+			unsigned int iters_remaining = config.tune_refinement_limit - 1 - l;
+			float threshold = (0.045f * static_cast<float>(iters_remaining)) + 1.0f;
+			if (errorval > (threshold * best_errorval_in_scb))
+			{
+				break;
+			}
+
+			if (errorval < best_errorval_in_scb)
+			{
+				best_errorval_in_scb = errorval;
+				workscb.errorval = errorval;
+				scb = workscb;
+
+				if (errorval < tune_errorval_threshold)
+				{
+					// Skip remaining candidates - this is "good enough"
+					i = candidate_count;
+					break;
+				}
+			}
+
+			if (!adjustments)
+			{
+				break;
+			}
+		}
+	}
+
+	return best_errorval_in_mode;
+}
+
+/**
+ * @brief Determine the lowest cross-channel correlation factor.
+ *
+ * @param texels_per_block The number of texels in a block.
+ * @param blk The image block color data to compress.
+ * + * @return Return the lowest correlation factor. + */ +static float prepare_block_statistics( + int texels_per_block, + const image_block& blk +) { + // Compute covariance matrix, as a collection of 10 scalars that form the upper-triangular row + // of the matrix. The matrix is symmetric, so this is all we need for this use case. + float rs = 0.0f; + float gs = 0.0f; + float bs = 0.0f; + float as = 0.0f; + float rr_var = 0.0f; + float gg_var = 0.0f; + float bb_var = 0.0f; + float aa_var = 0.0f; + float rg_cov = 0.0f; + float rb_cov = 0.0f; + float ra_cov = 0.0f; + float gb_cov = 0.0f; + float ga_cov = 0.0f; + float ba_cov = 0.0f; + + float weight_sum = 0.0f; + + promise(texels_per_block > 0); + for (int i = 0; i < texels_per_block; i++) + { + float weight = hadd_s(blk.channel_weight) / 4.0f; + assert(weight >= 0.0f); + weight_sum += weight; + + float r = blk.data_r[i]; + float g = blk.data_g[i]; + float b = blk.data_b[i]; + float a = blk.data_a[i]; + + float rw = r * weight; + rs += rw; + rr_var += r * rw; + rg_cov += g * rw; + rb_cov += b * rw; + ra_cov += a * rw; + + float gw = g * weight; + gs += gw; + gg_var += g * gw; + gb_cov += b * gw; + ga_cov += a * gw; + + float bw = b * weight; + bs += bw; + bb_var += b * bw; + ba_cov += a * bw; + + float aw = a * weight; + as += aw; + aa_var += a * aw; + } + + float rpt = 1.0f / astc::max(weight_sum, 1e-7f); + + rr_var -= rs * (rs * rpt); + rg_cov -= gs * (rs * rpt); + rb_cov -= bs * (rs * rpt); + ra_cov -= as * (rs * rpt); + + gg_var -= gs * (gs * rpt); + gb_cov -= bs * (gs * rpt); + ga_cov -= as * (gs * rpt); + + bb_var -= bs * (bs * rpt); + ba_cov -= as * (bs * rpt); + + aa_var -= as * (as * rpt); + + // These will give a NaN if a channel is constant - these are fixed up in the next step + rg_cov *= astc::rsqrt(rr_var * gg_var); + rb_cov *= astc::rsqrt(rr_var * bb_var); + ra_cov *= astc::rsqrt(rr_var * aa_var); + gb_cov *= astc::rsqrt(gg_var * bb_var); + ga_cov *= astc::rsqrt(gg_var * aa_var); + ba_cov *= astc::rsqrt(bb_var * aa_var); + + if (astc::isnan(rg_cov)) rg_cov = 1.0f; + if (astc::isnan(rb_cov)) rb_cov = 1.0f; + if (astc::isnan(ra_cov)) ra_cov = 1.0f; + if (astc::isnan(gb_cov)) gb_cov = 1.0f; + if (astc::isnan(ga_cov)) ga_cov = 1.0f; + if (astc::isnan(ba_cov)) ba_cov = 1.0f; + + float lowest_correlation = astc::min(fabsf(rg_cov), fabsf(rb_cov)); + lowest_correlation = astc::min(lowest_correlation, fabsf(ra_cov)); + lowest_correlation = astc::min(lowest_correlation, fabsf(gb_cov)); + lowest_correlation = astc::min(lowest_correlation, fabsf(ga_cov)); + lowest_correlation = astc::min(lowest_correlation, fabsf(ba_cov)); + + // Diagnostic trace points + trace_add_data("min_r", blk.data_min.lane<0>()); + trace_add_data("max_r", blk.data_max.lane<0>()); + trace_add_data("min_g", blk.data_min.lane<1>()); + trace_add_data("max_g", blk.data_max.lane<1>()); + trace_add_data("min_b", blk.data_min.lane<2>()); + trace_add_data("max_b", blk.data_max.lane<2>()); + trace_add_data("min_a", blk.data_min.lane<3>()); + trace_add_data("max_a", blk.data_max.lane<3>()); + trace_add_data("cov_rg", fabsf(rg_cov)); + trace_add_data("cov_rb", fabsf(rb_cov)); + trace_add_data("cov_ra", fabsf(ra_cov)); + trace_add_data("cov_gb", fabsf(gb_cov)); + trace_add_data("cov_ga", fabsf(ga_cov)); + trace_add_data("cov_ba", fabsf(ba_cov)); + + return lowest_correlation; +} + +/* See header for documentation. 
*/ +void compress_block( + const astcenc_contexti& ctx, + const image_block& blk, + physical_compressed_block& pcb, + compression_working_buffers& tmpbuf) +{ + astcenc_profile decode_mode = ctx.config.profile; + symbolic_compressed_block scb; + const block_size_descriptor& bsd = *ctx.bsd; + float lowest_correl; + + TRACE_NODE(node0, "block"); + trace_add_data("pos_x", blk.xpos); + trace_add_data("pos_y", blk.ypos); + trace_add_data("pos_z", blk.zpos); + + // Set stricter block targets for luminance data as we have more bits to play with + bool block_is_l = blk.is_luminance(); + float block_is_l_scale = block_is_l ? 1.0f / 1.5f : 1.0f; + + // Set slightly stricter block targets for lumalpha data as we have more bits to play with + bool block_is_la = blk.is_luminancealpha(); + float block_is_la_scale = block_is_la ? 1.0f / 1.05f : 1.0f; + + bool block_skip_two_plane = false; + int max_partitions = ctx.config.tune_partition_count_limit; + + unsigned int requested_partition_indices[3] { + ctx.config.tune_2partition_index_limit, + ctx.config.tune_3partition_index_limit, + ctx.config.tune_4partition_index_limit + }; + + unsigned int requested_partition_trials[3] { + ctx.config.tune_2partitioning_candidate_limit, + ctx.config.tune_3partitioning_candidate_limit, + ctx.config.tune_4partitioning_candidate_limit + }; + +#if defined(ASTCENC_DIAGNOSTICS) + // Do this early in diagnostic builds so we can dump uniform metrics + // for every block. Do it later in release builds to avoid redundant work! + float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count; + float error_threshold = ctx.config.tune_db_limit + * error_weight_sum + * block_is_l_scale + * block_is_la_scale; + + lowest_correl = prepare_block_statistics(bsd.texel_count, blk); + trace_add_data("lowest_correl", lowest_correl); + trace_add_data("tune_error_threshold", error_threshold); +#endif + + // Detected a constant-color block + if (all(blk.data_min == blk.data_max)) + { + TRACE_NODE(node1, "pass"); + trace_add_data("partition_count", 0); + trace_add_data("plane_count", 1); + + scb.partition_count = 0; + + // Encode as FP16 if using HDR + if ((decode_mode == ASTCENC_PRF_HDR) || + (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A)) + { + scb.block_type = SYM_BTYPE_CONST_F16; + vint4 color_f16 = float_to_float16(blk.origin_texel); + store(color_f16, scb.constant_color); + } + // Encode as UNORM16 if NOT using HDR + else + { + scb.block_type = SYM_BTYPE_CONST_U16; + vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f; + vint4 color_u16 = float_to_int_rtn(color_f32); + store(color_u16, scb.constant_color); + } + + trace_add_data("exit", "quality hit"); + + symbolic_to_physical(bsd, scb, pcb); + return; + } + +#if !defined(ASTCENC_DIAGNOSTICS) + float error_weight_sum = hadd_s(blk.channel_weight) * bsd.texel_count; + float error_threshold = ctx.config.tune_db_limit + * error_weight_sum + * block_is_l_scale + * block_is_la_scale; +#endif + + // Set SCB and mode errors to a very high error value + scb.errorval = ERROR_CALC_DEFAULT; + scb.block_type = SYM_BTYPE_ERROR; + + float best_errorvals_for_pcount[BLOCK_MAX_PARTITIONS] { + ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT, ERROR_CALC_DEFAULT + }; + + float exit_thresholds_for_pcount[BLOCK_MAX_PARTITIONS] { + 0.0f, + ctx.config.tune_2_partition_early_out_limit_factor, + ctx.config.tune_3_partition_early_out_limit_factor, + 0.0f + }; + + // Trial using 1 plane of weights and 1 partition. 
+
+	// Most of the time we test it twice, first with a mode cutoff of 0 and then with the specified
+	// mode cutoff. This causes an early-out that speeds up encoding of easy blocks. However, this
+	// optimization is disabled for 4x4 and 5x4 blocks where it nearly always slows down the
+	// compression and slightly reduces image quality.
+
+	float errorval_mult[2] {
+		1.0f / ctx.config.tune_mse_overshoot,
+		1.0f
+	};
+
+	const float errorval_overshoot = 1.0f / ctx.config.tune_mse_overshoot;
+
+	// Only enable MODE0 fast path (trial 0) if 2D, and at least 25 texels
+	int start_trial = 1;
+	if ((bsd.texel_count >= TUNE_MIN_TEXELS_MODE0_FASTPATH) && (bsd.zdim == 1))
+	{
+		start_trial = 0;
+	}
+
+	int quant_limit = QUANT_32;
+	for (int i = start_trial; i < 2; i++)
+	{
+		TRACE_NODE(node1, "pass");
+		trace_add_data("partition_count", 1);
+		trace_add_data("plane_count", 1);
+		trace_add_data("search_mode", i);
+
+		float errorval = compress_symbolic_block_for_partition_1plane(
+		    ctx.config, bsd, blk, i == 0,
+		    error_threshold * errorval_mult[i] * errorval_overshoot,
+		    1, 0, scb, tmpbuf, QUANT_32);
+
+		// Record the quant level so we can use it to filter later searches
+		const auto& bm = bsd.get_block_mode(scb.block_mode);
+		quant_limit = bm.get_weight_quant_mode();
+
+		best_errorvals_for_pcount[0] = astc::min(best_errorvals_for_pcount[0], errorval);
+		if (errorval < (error_threshold * errorval_mult[i]))
+		{
+			trace_add_data("exit", "quality hit");
+			goto END_OF_TESTS;
+		}
+	}
+
+#if !defined(ASTCENC_DIAGNOSTICS)
+	lowest_correl = prepare_block_statistics(bsd.texel_count, blk);
+#endif
+
+	block_skip_two_plane = lowest_correl > ctx.config.tune_2_plane_early_out_limit_correlation;
+
+	// Test the four possible 1-partition, 2-planes modes. Do this in reverse, as
+	// alpha is the most likely to be non-correlated if it is present in the data.
+	for (int i = BLOCK_MAX_COMPONENTS - 1; i >= 0; i--)
+	{
+		TRACE_NODE(node1, "pass");
+		trace_add_data("partition_count", 1);
+		trace_add_data("plane_count", 2);
+		trace_add_data("plane_component", i);
+
+		if (block_skip_two_plane)
+		{
+			trace_add_data("skip", "tune_2_plane_early_out_limit_correlation");
+			continue;
+		}
+
+		if (blk.grayscale && i != 3)
+		{
+			trace_add_data("skip", "grayscale block");
+			continue;
+		}
+
+		if (blk.is_constant_channel(i))
+		{
+			trace_add_data("skip", "constant component");
+			continue;
+		}
+
+		float errorval = compress_symbolic_block_for_partition_2planes(
+		    ctx.config, bsd, blk, error_threshold * errorval_overshoot,
+		    i, scb, tmpbuf, quant_limit);
+
+		// If attempting two planes is much worse than the best one plane result
+		// then further two plane searches are unlikely to help so move on ...
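+		// With the 1.85 scale used below, a two-plane trial more than 85% worse
+		// than the best one-plane result ends the remaining two-plane searches.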
+ if (errorval > (best_errorvals_for_pcount[0] * 1.85f)) + { + break; + } + + if (errorval < error_threshold) + { + trace_add_data("exit", "quality hit"); + goto END_OF_TESTS; + } + } + + // Find best blocks for 2, 3 and 4 partitions + for (int partition_count = 2; partition_count <= max_partitions; partition_count++) + { + unsigned int partition_indices[TUNE_MAX_PARTITIONING_CANDIDATES]; + + unsigned int requested_indices = requested_partition_indices[partition_count - 2]; + + unsigned int requested_trials = requested_partition_trials[partition_count - 2]; + requested_trials = astc::min(requested_trials, requested_indices); + + unsigned int actual_trials = find_best_partition_candidates( + bsd, blk, partition_count, requested_indices, partition_indices, requested_trials); + + float best_error_in_prev = best_errorvals_for_pcount[partition_count - 2]; + + for (unsigned int i = 0; i < actual_trials; i++) + { + TRACE_NODE(node1, "pass"); + trace_add_data("partition_count", partition_count); + trace_add_data("partition_index", partition_indices[i]); + trace_add_data("plane_count", 1); + trace_add_data("search_mode", i); + + float errorval = compress_symbolic_block_for_partition_1plane( + ctx.config, bsd, blk, false, + error_threshold * errorval_overshoot, + partition_count, partition_indices[i], + scb, tmpbuf, quant_limit); + + best_errorvals_for_pcount[partition_count - 1] = astc::min(best_errorvals_for_pcount[partition_count - 1], errorval); + + // If using N partitions doesn't improve much over using N-1 partitions then skip trying + // N+1. Error can dramatically improve if the data is correlated or non-correlated and + // aligns with a partitioning that suits that encoding, so for this inner loop check add + // a large error scale because the "other" trial could be a lot better. 
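+			// Note that this inner-loop check scales the exit threshold by 1.85,
+			// while the check after the trial loop applies it unscaled.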
+ float best_error = best_errorvals_for_pcount[partition_count - 1]; + float best_error_scale = exit_thresholds_for_pcount[partition_count - 1] * 1.85f; + if (best_error > (best_error_in_prev * best_error_scale)) + { + trace_add_data("skip", "tune_partition_early_out_limit_factor"); + goto END_OF_TESTS; + } + + if (errorval < error_threshold) + { + trace_add_data("exit", "quality hit"); + goto END_OF_TESTS; + } + } + + // If using N partitions doesn't improve much over using N-1 partitions then skip trying N+1 + float best_error = best_errorvals_for_pcount[partition_count - 1]; + float best_error_scale = exit_thresholds_for_pcount[partition_count - 1]; + if (best_error > (best_error_in_prev * best_error_scale)) + { + trace_add_data("skip", "tune_partition_early_out_limit_factor"); + goto END_OF_TESTS; + } + } + + trace_add_data("exit", "quality not hit"); + +END_OF_TESTS: + // If we still have an error block then convert to something we can encode + // TODO: Do something more sensible here, such as average color block + if (scb.block_type == SYM_BTYPE_ERROR) + { +#if defined(ASTCENC_DIAGNOSTICS) + static bool printed_once = false; + if (!printed_once) + { + printed_once = true; + printf("WARN: At least one block failed to find a valid encoding.\n" + " Try increasing compression quality settings.\n\n"); + } +#endif + + scb.block_type = SYM_BTYPE_CONST_U16; + vfloat4 color_f32 = clamp(0.0f, 1.0f, blk.origin_texel) * 65535.0f; + vint4 color_u16 = float_to_int_rtn(color_f32); + store(color_u16, scb.constant_color); + } + + // Compress to a physical block + symbolic_to_physical(bsd, scb, pcb); +} + +#endif diff --git a/thirdparty/astcenc/astcenc_compute_variance.cpp b/thirdparty/astcenc/astcenc_compute_variance.cpp new file mode 100644 index 0000000000..48a4af8cef --- /dev/null +++ b/thirdparty/astcenc/astcenc_compute_variance.cpp @@ -0,0 +1,472 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/** + * @brief Functions to calculate variance per component in a NxN footprint. + * + * We need N to be parametric, so the routine below uses summed area tables in order to execute in + * O(1) time independent of how big N is. + * + * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first + * perform a binary reduction, and then distributes the results. This method means that there is no + * serial dependency between a given element and the next one, and also significantly improves + * numerical stability allowing us to use floats rather than doubles. + */ + +#include "astcenc_internal.h" + +#include <cassert> + +/** + * @brief Generate a prefix-sum array using the Brent-Kung algorithm. + * + * This will take an input array of the form: + * v0, v1, v2, ... + * ... 
and modify in-place to turn it into a prefix-sum array of the form: + * v0, v0+v1, v0+v1+v2, ... + * + * @param d The array to prefix-sum. + * @param items The number of items in the array. + * @param stride The item spacing in the array; i.e. dense arrays should use 1. + */ +static void brent_kung_prefix_sum( + vfloat4* d, + size_t items, + int stride +) { + if (items < 2) + return; + + size_t lc_stride = 2; + size_t log2_stride = 1; + + // The reduction-tree loop + do { + size_t step = lc_stride >> 1; + size_t start = lc_stride - 1; + size_t iters = items >> log2_stride; + + vfloat4 *da = d + (start * stride); + ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride); + size_t ofs_stride = stride << log2_stride; + + while (iters) + { + *da = *da + da[ofs]; + da += ofs_stride; + iters--; + } + + log2_stride += 1; + lc_stride <<= 1; + } while (lc_stride <= items); + + // The expansion-tree loop + do { + log2_stride -= 1; + lc_stride >>= 1; + + size_t step = lc_stride >> 1; + size_t start = step + lc_stride - 1; + size_t iters = (items - step) >> log2_stride; + + vfloat4 *da = d + (start * stride); + ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride); + size_t ofs_stride = stride << log2_stride; + + while (iters) + { + *da = *da + da[ofs]; + da += ofs_stride; + iters--; + } + } while (lc_stride > 2); +} + +/* See header for documentation. */ +void compute_pixel_region_variance( + astcenc_contexti& ctx, + const pixel_region_args& arg +) { + // Unpack the memory structure into local variables + const astcenc_image* img = arg.img; + astcenc_swizzle swz = arg.swz; + bool have_z = arg.have_z; + + int size_x = arg.size_x; + int size_y = arg.size_y; + int size_z = arg.size_z; + + int offset_x = arg.offset_x; + int offset_y = arg.offset_y; + int offset_z = arg.offset_z; + + int alpha_kernel_radius = arg.alpha_kernel_radius; + + float* input_alpha_averages = ctx.input_alpha_averages; + vfloat4* work_memory = arg.work_memory; + + // Compute memory sizes and dimensions that we need + int kernel_radius = alpha_kernel_radius; + int kerneldim = 2 * kernel_radius + 1; + int kernel_radius_xy = kernel_radius; + int kernel_radius_z = have_z ? kernel_radius : 0; + + int padsize_x = size_x + kerneldim; + int padsize_y = size_y + kerneldim; + int padsize_z = size_z + (have_z ? kerneldim : 0); + int sizeprod = padsize_x * padsize_y * padsize_z; + + int zd_start = have_z ? 
1 : 0; + + vfloat4 *varbuf1 = work_memory; + vfloat4 *varbuf2 = work_memory + sizeprod; + + // Scaling factors to apply to Y and Z for accesses into the work buffers + int yst = padsize_x; + int zst = padsize_x * padsize_y; + + // Scaling factors to apply to Y and Z for accesses into result buffers + int ydt = img->dim_x; + int zdt = img->dim_x * img->dim_y; + + // Macros to act as accessor functions for the work-memory + #define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x] + #define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x] + + // Load N and N^2 values into the work buffers + if (img->data_type == ASTCENC_TYPE_U8) + { + // Swizzle data structure 4 = ZERO, 5 = ONE + uint8_t data[6]; + data[ASTCENC_SWZ_0] = 0; + data[ASTCENC_SWZ_1] = 255; + + for (int z = zd_start; z < padsize_z; z++) + { + int z_src = (z - zd_start) + offset_z - kernel_radius_z; + z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1)); + uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]); + + for (int y = 1; y < padsize_y; y++) + { + int y_src = (y - 1) + offset_y - kernel_radius_xy; + y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1)); + + for (int x = 1; x < padsize_x; x++) + { + int x_src = (x - 1) + offset_x - kernel_radius_xy; + x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1)); + + data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src )]; + data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; + data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)]; + data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)]; + + uint8_t r = data[swz.r]; + uint8_t g = data[swz.g]; + uint8_t b = data[swz.b]; + uint8_t a = data[swz.a]; + + vfloat4 d = vfloat4 (r * (1.0f / 255.0f), + g * (1.0f / 255.0f), + b * (1.0f / 255.0f), + a * (1.0f / 255.0f)); + + VARBUF1(z, y, x) = d; + VARBUF2(z, y, x) = d * d; + } + } + } + } + else if (img->data_type == ASTCENC_TYPE_F16) + { + // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16) + uint16_t data[6]; + data[ASTCENC_SWZ_0] = 0; + data[ASTCENC_SWZ_1] = 0x3C00; + + for (int z = zd_start; z < padsize_z; z++) + { + int z_src = (z - zd_start) + offset_z - kernel_radius_z; + z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1)); + uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]); + + for (int y = 1; y < padsize_y; y++) + { + int y_src = (y - 1) + offset_y - kernel_radius_xy; + y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1)); + + for (int x = 1; x < padsize_x; x++) + { + int x_src = (x - 1) + offset_x - kernel_radius_xy; + x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1)); + + data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src )]; + data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; + data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)]; + data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)]; + + vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); + vfloat4 d = float16_to_float(di); + + VARBUF1(z, y, x) = d; + VARBUF2(z, y, x) = d * d; + } + } + } + } + else // if (img->data_type == ASTCENC_TYPE_F32) + { + assert(img->data_type == ASTCENC_TYPE_F32); + + // Swizzle data structure 4 = ZERO, 5 = ONE (in FP16) + float data[6]; + data[ASTCENC_SWZ_0] = 0.0f; + data[ASTCENC_SWZ_1] = 1.0f; + + for (int z = zd_start; z < padsize_z; z++) + { + int z_src = (z - zd_start) + offset_z - kernel_radius_z; + z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1)); + float* data32 = static_cast<float*>(img->data[z_src]); + + for (int y = 1; y < padsize_y; 
y++) + { + int y_src = (y - 1) + offset_y - kernel_radius_xy; + y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1)); + + for (int x = 1; x < padsize_x; x++) + { + int x_src = (x - 1) + offset_x - kernel_radius_xy; + x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1)); + + data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src )]; + data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)]; + data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)]; + data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)]; + + float r = data[swz.r]; + float g = data[swz.g]; + float b = data[swz.b]; + float a = data[swz.a]; + + vfloat4 d(r, g, b, a); + + VARBUF1(z, y, x) = d; + VARBUF2(z, y, x) = d * d; + } + } + } + } + + // Pad with an extra layer of 0s; this forms the edge of the SAT tables + vfloat4 vbz = vfloat4::zero(); + for (int z = 0; z < padsize_z; z++) + { + for (int y = 0; y < padsize_y; y++) + { + VARBUF1(z, y, 0) = vbz; + VARBUF2(z, y, 0) = vbz; + } + + for (int x = 0; x < padsize_x; x++) + { + VARBUF1(z, 0, x) = vbz; + VARBUF2(z, 0, x) = vbz; + } + } + + if (have_z) + { + for (int y = 0; y < padsize_y; y++) + { + for (int x = 0; x < padsize_x; x++) + { + VARBUF1(0, y, x) = vbz; + VARBUF2(0, y, x) = vbz; + } + } + } + + // Generate summed-area tables for N and N^2; this is done in-place, using + // a Brent-Kung parallel-prefix based algorithm to minimize precision loss + for (int z = zd_start; z < padsize_z; z++) + { + for (int y = 1; y < padsize_y; y++) + { + brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1); + brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1); + } + } + + for (int z = zd_start; z < padsize_z; z++) + { + for (int x = 1; x < padsize_x; x++) + { + brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst); + brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst); + } + } + + if (have_z) + { + for (int y = 1; y < padsize_y; y++) + { + for (int x = 1; x < padsize_x; x++) + { + brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst); + brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst); + } + } + } + + // Compute a few constants used in the variance-calculation. 
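+	// (Editorial note: alpha_rsamples below is the reciprocal of the kernel sample
+	// count - kdim^2 in 2D, kdim^3 in 3D - so multiplying a summed-area lookup by it
+	// turns a neighborhood sum into a neighborhood average.)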
+ float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1); + float alpha_rsamples; + + if (have_z) + { + alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim); + } + else + { + alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim); + } + + // Use the summed-area tables to compute variance for each neighborhood + if (have_z) + { + for (int z = 0; z < size_z; z++) + { + int z_src = z + kernel_radius_z; + int z_dst = z + offset_z; + int z_low = z_src - alpha_kernel_radius; + int z_high = z_src + alpha_kernel_radius + 1; + + for (int y = 0; y < size_y; y++) + { + int y_src = y + kernel_radius_xy; + int y_dst = y + offset_y; + int y_low = y_src - alpha_kernel_radius; + int y_high = y_src + alpha_kernel_radius + 1; + + for (int x = 0; x < size_x; x++) + { + int x_src = x + kernel_radius_xy; + int x_dst = x + offset_x; + int x_low = x_src - alpha_kernel_radius; + int x_high = x_src + alpha_kernel_radius + 1; + + // Summed-area table lookups for alpha average + float vasum = ( VARBUF1(z_high, y_low, x_low).lane<3>() + - VARBUF1(z_high, y_low, x_high).lane<3>() + - VARBUF1(z_high, y_high, x_low).lane<3>() + + VARBUF1(z_high, y_high, x_high).lane<3>()) - + ( VARBUF1(z_low, y_low, x_low).lane<3>() + - VARBUF1(z_low, y_low, x_high).lane<3>() + - VARBUF1(z_low, y_high, x_low).lane<3>() + + VARBUF1(z_low, y_high, x_high).lane<3>()); + + int out_index = z_dst * zdt + y_dst * ydt + x_dst; + input_alpha_averages[out_index] = (vasum * alpha_rsamples); + } + } + } + } + else + { + for (int y = 0; y < size_y; y++) + { + int y_src = y + kernel_radius_xy; + int y_dst = y + offset_y; + int y_low = y_src - alpha_kernel_radius; + int y_high = y_src + alpha_kernel_radius + 1; + + for (int x = 0; x < size_x; x++) + { + int x_src = x + kernel_radius_xy; + int x_dst = x + offset_x; + int x_low = x_src - alpha_kernel_radius; + int x_high = x_src + alpha_kernel_radius + 1; + + // Summed-area table lookups for alpha average + float vasum = VARBUF1(0, y_low, x_low).lane<3>() + - VARBUF1(0, y_low, x_high).lane<3>() + - VARBUF1(0, y_high, x_low).lane<3>() + + VARBUF1(0, y_high, x_high).lane<3>(); + + int out_index = y_dst * ydt + x_dst; + input_alpha_averages[out_index] = (vasum * alpha_rsamples); + } + } + } +} + +/* See header for documentation. */ +unsigned int init_compute_averages( + const astcenc_image& img, + unsigned int alpha_kernel_radius, + const astcenc_swizzle& swz, + avg_args& ag +) { + unsigned int size_x = img.dim_x; + unsigned int size_y = img.dim_y; + unsigned int size_z = img.dim_z; + + // Compute maximum block size and from that the working memory buffer size + unsigned int kernel_radius = alpha_kernel_radius; + unsigned int kerneldim = 2 * kernel_radius + 1; + + bool have_z = (size_z > 1); + unsigned int max_blk_size_xy = have_z ? 16 : 32; + unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u); + + unsigned int max_padsize_xy = max_blk_size_xy + kerneldim; + unsigned int max_padsize_z = max_blk_size_z + (have_z ? 
kerneldim : 0); + + // Perform block-wise averages calculations across the image + // Initialize fields which are not populated until later + ag.arg.size_x = 0; + ag.arg.size_y = 0; + ag.arg.size_z = 0; + ag.arg.offset_x = 0; + ag.arg.offset_y = 0; + ag.arg.offset_z = 0; + ag.arg.work_memory = nullptr; + + ag.arg.img = &img; + ag.arg.swz = swz; + ag.arg.have_z = have_z; + ag.arg.alpha_kernel_radius = alpha_kernel_radius; + + ag.img_size_x = size_x; + ag.img_size_y = size_y; + ag.img_size_z = size_z; + ag.blk_size_xy = max_blk_size_xy; + ag.blk_size_z = max_blk_size_z; + ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z; + + // The parallel task count + unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z; + unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy; + return z_tasks * y_tasks; +} + +#endif diff --git a/thirdparty/astcenc/astcenc_decompress_symbolic.cpp b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp new file mode 100644 index 0000000000..39e5525c3b --- /dev/null +++ b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp @@ -0,0 +1,623 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions to decompress a symbolic block. + */ + +#include "astcenc_internal.h" + +#include <stdio.h> +#include <assert.h> + +/** + * @brief Compute the integer linear interpolation of two color endpoints. + * + * @param decode_mode The ASTC profile (linear or sRGB) + * @param color0 The endpoint0 color. + * @param color1 The endpoint1 color. + * @param weights The interpolation weight (between 0 and 64). + * + * @return The interpolated color. + */ +static vint4 lerp_color_int( + astcenc_profile decode_mode, + vint4 color0, + vint4 color1, + vint4 weights +) { + vint4 weight1 = weights; + vint4 weight0 = vint4(64) - weight1; + + if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + color0 = asr<8>(color0); + color1 = asr<8>(color1); + } + + vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32); + color = asr<6>(color); + + if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + color = color * vint4(257); + } + + return color; +} + + +/** + * @brief Convert integer color value into a float value for the decoder. + * + * @param data The integer color value post-interpolation. + * @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16). + * + * @return The float color value. 
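+ *
+ * (Editorial example, not upstream wording: a lane with @c lns_mask clear is decoded
+ * as unorm16, so a post-interpolation value of 65535 maps to 1.0f; a lane with the
+ * mask set is decoded as 16-bit LNS-encoded HDR data via @c lns_to_sf16().)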
+ */ +static inline vfloat4 decode_texel( + vint4 data, + vmask4 lns_mask +) { + vint4 color_lns = vint4::zero(); + vint4 color_unorm = vint4::zero(); + + if (any(lns_mask)) + { + color_lns = lns_to_sf16(data); + } + + if (!all(lns_mask)) + { + color_unorm = unorm16_to_sf16(data); + } + + // Pick components and then convert to FP16 + vint4 datai = select(color_unorm, color_lns, lns_mask); + return float16_to_float(datai); +} + +/* See header for documentation. */ +void unpack_weights( + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + const decimation_info& di, + bool is_dual_plane, + int weights_plane1[BLOCK_MAX_TEXELS], + int weights_plane2[BLOCK_MAX_TEXELS] +) { + // Safe to overshoot as all arrays are allocated to full size + if (!is_dual_plane) + { + // Build full 64-entry weight lookup table + vint4 tab0(reinterpret_cast<const int*>(scb.weights + 0)); + vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16)); + vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32)); + vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48)); + + vint tab0p, tab1p, tab2p, tab3p; + vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); + + for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint summed_value(8); + vint weight_count(di.texel_weight_count + i); + int max_weight_count = hmax(weight_count).lane<0>(); + + promise(max_weight_count > 0); + for (int j = 0; j < max_weight_count; j++) + { + vint texel_weights(di.texel_weights_tr[j] + i); + vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); + + summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int; + } + + store(lsr<4>(summed_value), weights_plane1 + i); + } + } + else + { + // Build a 32-entry weight lookup table per plane + // Plane 1 + vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights + 0)); + vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16)); + vint tab0_plane1p, tab1_plane1p; + vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); + + // Plane 2 + vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32)); + vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48)); + vint tab0_plane2p, tab1_plane2p; + vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); + + for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint sum_plane1(8); + vint sum_plane2(8); + + vint weight_count(di.texel_weight_count + i); + int max_weight_count = hmax(weight_count).lane<0>(); + + promise(max_weight_count > 0); + for (int j = 0; j < max_weight_count; j++) + { + vint texel_weights(di.texel_weights_tr[j] + i); + vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); + + sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int; + sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int; + } + + store(lsr<4>(sum_plane1), weights_plane1 + i); + store(lsr<4>(sum_plane2), weights_plane2 + i); + } + } +} + +/** + * @brief Return an FP32 NaN value for use in error colors. + * + * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN. + * + * @return The float color value. + */ +static float error_color_nan() +{ + if32 v; + v.u = 0xFFFFE000U; + return v.f; +} + +/* See header for documentation. 
*/ +void decompress_symbolic_block( + astcenc_profile decode_mode, + const block_size_descriptor& bsd, + int xpos, + int ypos, + int zpos, + const symbolic_compressed_block& scb, + image_block& blk +) { + blk.xpos = xpos; + blk.ypos = ypos; + blk.zpos = zpos; + + blk.data_min = vfloat4::zero(); + blk.data_mean = vfloat4::zero(); + blk.data_max = vfloat4::zero(); + blk.grayscale = false; + + // If we detected an error-block, blow up immediately. + if (scb.block_type == SYM_BTYPE_ERROR) + { + for (unsigned int i = 0; i < bsd.texel_count; i++) + { + blk.data_r[i] = error_color_nan(); + blk.data_g[i] = error_color_nan(); + blk.data_b[i] = error_color_nan(); + blk.data_a[i] = error_color_nan(); + blk.rgb_lns[i] = 0; + blk.alpha_lns[i] = 0; + } + + return; + } + + if ((scb.block_type == SYM_BTYPE_CONST_F16) || + (scb.block_type == SYM_BTYPE_CONST_U16)) + { + vfloat4 color; + uint8_t use_lns = 0; + + // UNORM16 constant color block + if (scb.block_type == SYM_BTYPE_CONST_U16) + { + vint4 colori(scb.constant_color); + + // For sRGB decoding a real decoder would just use the top 8 bits for color conversion. + // We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range. + if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + colori = asr<8>(colori) * 257; + } + + vint4 colorf16 = unorm16_to_sf16(colori); + color = float16_to_float(colorf16); + } + // FLOAT16 constant color block + else + { + switch (decode_mode) + { + case ASTCENC_PRF_LDR_SRGB: + case ASTCENC_PRF_LDR: + color = vfloat4(error_color_nan()); + break; + case ASTCENC_PRF_HDR_RGB_LDR_A: + case ASTCENC_PRF_HDR: + // Constant-color block; unpack from FP16 to FP32. + color = float16_to_float(vint4(scb.constant_color)); + use_lns = 1; + break; + } + } + + for (unsigned int i = 0; i < bsd.texel_count; i++) + { + blk.data_r[i] = color.lane<0>(); + blk.data_g[i] = color.lane<1>(); + blk.data_b[i] = color.lane<2>(); + blk.data_a[i] = color.lane<3>(); + blk.rgb_lns[i] = use_lns; + blk.alpha_lns[i] = use_lns; + } + + return; + } + + // Get the appropriate partition-table entry + int partition_count = scb.partition_count; + const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); + + // Get the appropriate block descriptors + const auto& bm = bsd.get_block_mode(scb.block_mode); + const auto& di = bsd.get_decimation_info(bm.decimation_mode); + + bool is_dual_plane = static_cast<bool>(bm.is_dual_plane); + + // Unquantize and undecimate the weights + int plane1_weights[BLOCK_MAX_TEXELS]; + int plane2_weights[BLOCK_MAX_TEXELS]; + unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights); + + // Now that we have endpoint colors and weights, we can unpack texel colors + int plane2_component = scb.plane2_component; + vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component); + + for (int i = 0; i < partition_count; i++) + { + // Decode the color endpoints for this partition + vint4 ep0; + vint4 ep1; + bool rgb_lns; + bool a_lns; + + unpack_color_endpoints(decode_mode, + scb.color_formats[i], + scb.color_values[i], + rgb_lns, a_lns, + ep0, ep1); + + vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns); + + int texel_count = pi.partition_texel_count[i]; + for (int j = 0; j < texel_count; j++) + { + int tix = pi.texels_of_partition[i][j]; + vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask); + vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight); + vfloat4 colorf = decode_texel(color, lns_mask); + + blk.data_r[tix] = colorf.lane<0>(); + 
blk.data_g[tix] = colorf.lane<1>(); + blk.data_b[tix] = colorf.lane<2>(); + blk.data_a[tix] = colorf.lane<3>(); + } + } +} + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/* See header for documentation. */ +float compute_symbolic_block_difference_2plane( + const astcenc_config& config, + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + const image_block& blk +) { + // If we detected an error-block, blow up immediately. + if (scb.block_type == SYM_BTYPE_ERROR) + { + return ERROR_CALC_DEFAULT; + } + + assert(scb.block_mode >= 0); + assert(scb.partition_count == 1); + assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1); + + // Get the appropriate block descriptor + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); + + // Unquantize and undecimate the weights + int plane1_weights[BLOCK_MAX_TEXELS]; + int plane2_weights[BLOCK_MAX_TEXELS]; + unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights); + + vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component); + + vfloat4 summa = vfloat4::zero(); + + // Decode the color endpoints for this partition + vint4 ep0; + vint4 ep1; + bool rgb_lns; + bool a_lns; + + unpack_color_endpoints(config.profile, + scb.color_formats[0], + scb.color_values[0], + rgb_lns, a_lns, + ep0, ep1); + + // Unpack and compute error for each texel in the partition + unsigned int texel_count = bsd.texel_count; + for (unsigned int i = 0; i < texel_count; i++) + { + vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask); + vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight); + + vfloat4 color = int_to_float(colori); + vfloat4 oldColor = blk.texel(i); + + // Compare error using a perceptual decode metric for RGBM textures + if (config.flags & ASTCENC_FLG_MAP_RGBM) + { + // Fail encodings that result in zero weight M pixels. Note that this can cause + // "interesting" artifacts if we reject all useful encodings - we typically get max + // brightness encodings instead which look just as bad. We recommend users apply a + // bias to their stored M value, limiting the lower value to 16 or 32 to avoid + // getting small M values post-quantization, but we can't prove it would never + // happen, especially at low bit rates ... + if (color.lane<3>() == 0.0f) + { + return -ERROR_CALC_DEFAULT; + } + + // Compute error based on decoded RGBM color + color = vfloat4( + color.lane<0>() * color.lane<3>() * config.rgbm_m_scale, + color.lane<1>() * color.lane<3>() * config.rgbm_m_scale, + color.lane<2>() * color.lane<3>() * config.rgbm_m_scale, + 1.0f + ); + + oldColor = vfloat4( + oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale, + oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale, + oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale, + 1.0f + ); + } + + vfloat4 error = oldColor - color; + error = min(abs(error), 1e15f); + error = error * error; + + summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT); + } + + return summa.lane<0>(); +} + +/* See header for documentation. */ +float compute_symbolic_block_difference_1plane( + const astcenc_config& config, + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + const image_block& blk +) { + assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0); + + // If we detected an error-block, blow up immediately. 
+ if (scb.block_type == SYM_BTYPE_ERROR) + { + return ERROR_CALC_DEFAULT; + } + + assert(scb.block_mode >= 0); + + // Get the appropriate partition-table entry + unsigned int partition_count = scb.partition_count; + const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); + + // Get the appropriate block descriptor + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); + + // Unquantize and undecimate the weights + int plane1_weights[BLOCK_MAX_TEXELS]; + unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); + + vfloat4 summa = vfloat4::zero(); + for (unsigned int i = 0; i < partition_count; i++) + { + // Decode the color endpoints for this partition + vint4 ep0; + vint4 ep1; + bool rgb_lns; + bool a_lns; + + unpack_color_endpoints(config.profile, + scb.color_formats[i], + scb.color_values[i], + rgb_lns, a_lns, + ep0, ep1); + + // Unpack and compute error for each texel in the partition + unsigned int texel_count = pi.partition_texel_count[i]; + for (unsigned int j = 0; j < texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + vint4 colori = lerp_color_int(config.profile, ep0, ep1, + vint4(plane1_weights[tix])); + + vfloat4 color = int_to_float(colori); + vfloat4 oldColor = blk.texel(tix); + + // Compare error using a perceptual decode metric for RGBM textures + if (config.flags & ASTCENC_FLG_MAP_RGBM) + { + // Fail encodings that result in zero weight M pixels. Note that this can cause + // "interesting" artifacts if we reject all useful encodings - we typically get max + // brightness encodings instead which look just as bad. We recommend users apply a + // bias to their stored M value, limiting the lower value to 16 or 32 to avoid + // getting small M values post-quantization, but we can't prove it would never + // happen, especially at low bit rates ... + if (color.lane<3>() == 0.0f) + { + return -ERROR_CALC_DEFAULT; + } + + // Compute error based on decoded RGBM color + color = vfloat4( + color.lane<0>() * color.lane<3>() * config.rgbm_m_scale, + color.lane<1>() * color.lane<3>() * config.rgbm_m_scale, + color.lane<2>() * color.lane<3>() * config.rgbm_m_scale, + 1.0f + ); + + oldColor = vfloat4( + oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale, + oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale, + oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale, + 1.0f + ); + } + + vfloat4 error = oldColor - color; + error = min(abs(error), 1e15f); + error = error * error; + + summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT); + } + } + + return summa.lane<0>(); +} + +/* See header for documentation. */ +float compute_symbolic_block_difference_1plane_1partition( + const astcenc_config& config, + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + const image_block& blk +) { + // If we detected an error-block, blow up immediately. 
+ if (scb.block_type == SYM_BTYPE_ERROR) + { + return ERROR_CALC_DEFAULT; + } + + assert(scb.block_mode >= 0); + assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1); + + // Get the appropriate block descriptor + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); + + // Unquantize and undecimate the weights + alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS]; + unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); + + // Decode the color endpoints for this partition + vint4 ep0; + vint4 ep1; + bool rgb_lns; + bool a_lns; + + unpack_color_endpoints(config.profile, + scb.color_formats[0], + scb.color_values[0], + rgb_lns, a_lns, + ep0, ep1); + + + // Pre-shift sRGB so things round correctly + if (config.profile == ASTCENC_PRF_LDR_SRGB) + { + ep0 = asr<8>(ep0); + ep1 = asr<8>(ep1); + } + + // Unpack and compute error for each texel in the partition + vfloatacc summav = vfloatacc::zero(); + + vint lane_id = vint::lane_id(); + vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1); + + unsigned int texel_count = bsd.texel_count; + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + // Compute EP1 contribution + vint weight1 = vint::loada(plane1_weights + i); + vint ep1_r = vint(ep1.lane<0>()) * weight1; + vint ep1_g = vint(ep1.lane<1>()) * weight1; + vint ep1_b = vint(ep1.lane<2>()) * weight1; + vint ep1_a = vint(ep1.lane<3>()) * weight1; + + // Compute EP0 contribution + vint weight0 = vint(64) - weight1; + vint ep0_r = vint(ep0.lane<0>()) * weight0; + vint ep0_g = vint(ep0.lane<1>()) * weight0; + vint ep0_b = vint(ep0.lane<2>()) * weight0; + vint ep0_a = vint(ep0.lane<3>()) * weight0; + + // Shift so things round correctly + vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale; + vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale; + vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale; + vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale; + + // Compute color diff + vfloat color_r = int_to_float(colori_r); + vfloat color_g = int_to_float(colori_g); + vfloat color_b = int_to_float(colori_b); + vfloat color_a = int_to_float(colori_a); + + vfloat color_orig_r = loada(blk.data_r + i); + vfloat color_orig_g = loada(blk.data_g + i); + vfloat color_orig_b = loada(blk.data_b + i); + vfloat color_orig_a = loada(blk.data_a + i); + + vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f)); + vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f)); + vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f)); + vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f)); + + // Compute squared error metric + color_error_r = color_error_r * color_error_r; + color_error_g = color_error_g * color_error_g; + color_error_b = color_error_b * color_error_b; + color_error_a = color_error_a * color_error_a; + + vfloat metric = color_error_r * blk.channel_weight.lane<0>() + + color_error_g * blk.channel_weight.lane<1>() + + color_error_b * blk.channel_weight.lane<2>() + + color_error_a * blk.channel_weight.lane<3>(); + + // Mask off bad lanes + vmask mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + haccumulate(summav, metric, mask); + } + + return hadd_s(summav); +} + +#endif diff --git a/thirdparty/astcenc/astcenc_diagnostic_trace.cpp b/thirdparty/astcenc/astcenc_diagnostic_trace.cpp new file mode 100644 index 
0000000000..7fa7ab1a8b
--- /dev/null
+++ b/thirdparty/astcenc/astcenc_diagnostic_trace.cpp
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for the diagnostic trace utilities.
+ */
+
+#if defined(ASTCENC_DIAGNOSTICS)
+
+#include <cassert>
+#include <cstdarg>
+#include <cstdio>
+#include <string>
+
+#include "astcenc_diagnostic_trace.h"
+
+/** @brief The global trace logger. */
+static TraceLog* g_TraceLog = nullptr;
+
+/** @brief The JSON indentation level. */
+static const size_t g_trace_indent = 2;
+
+TraceLog::TraceLog(
+	const char* file_name):
+	m_file(file_name, std::ofstream::out | std::ofstream::binary)
+{
+	assert(!g_TraceLog);
+	g_TraceLog = this;
+	m_root = new TraceNode("root");
+}
+
+/* See header for documentation. */
+TraceNode* TraceLog::get_current_leaf()
+{
+	if (m_stack.size())
+	{
+		return m_stack.back();
+	}
+
+	return nullptr;
+}
+
+/* See header for documentation. */
+size_t TraceLog::get_depth()
+{
+	return m_stack.size();
+}
+
+/* See header for documentation. */
+TraceLog::~TraceLog()
+{
+	assert(g_TraceLog == this);
+	delete m_root;
+	g_TraceLog = nullptr;
+}
+
+/* See header for documentation. */
+TraceNode::TraceNode(
+	const char* format,
+	...
+) {
+	// Format the name string
+	constexpr size_t bufsz = 256;
+	char buffer[bufsz];
+
+	va_list args;
+	va_start (args, format);
+	vsnprintf (buffer, bufsz, format, args);
+	va_end (args);
+
+	// Guarantee there is a nul terminator
+	buffer[bufsz - 1] = 0;
+
+	// Generate the node
+	TraceNode* parent = g_TraceLog->get_current_leaf();
+	size_t depth = g_TraceLog->get_depth();
+	g_TraceLog->m_stack.push_back(this);
+
+	bool comma = parent && parent->m_attrib_count;
+	auto& out = g_TraceLog->m_file;
+
+	if (parent)
+	{
+		parent->m_attrib_count++;
+	}
+
+	if (comma)
+	{
+		out << ',';
+	}
+
+	if (depth)
+	{
+		out << '\n';
+	}
+
+	size_t out_indent = (depth * 2) * g_trace_indent;
+	size_t in_indent = (depth * 2 + 1) * g_trace_indent;
+
+	std::string out_indents("");
+	if (out_indent)
+	{
+		out_indents = std::string(out_indent, ' ');
+	}
+
+	std::string in_indents(in_indent, ' ');
+
+	out << out_indents << "[ \"node\", \"" << buffer << "\",\n";
+	out << in_indents << "[";
+}
+
+/* See header for documentation. */
+void TraceNode::add_attrib(
+	std::string type,
+	std::string key,
+	std::string value
+) {
+	(void)type;
+
+	size_t depth = g_TraceLog->get_depth();
+	size_t indent = (depth * 2) * g_trace_indent;
+	auto& out = g_TraceLog->m_file;
+	bool comma = m_attrib_count;
+	m_attrib_count++;
+
+	if (comma)
+	{
+		out << ',';
+	}
+
+	out << '\n';
+	out << std::string(indent, ' ') << "[ "
+	    << "\"" << key << "\", "
+	    << value << " ]";
+}
+
+/* See header for documentation.
 */
+TraceNode::~TraceNode()
+{
+	g_TraceLog->m_stack.pop_back();
+
+	auto& out = g_TraceLog->m_file;
+	size_t depth = g_TraceLog->get_depth();
+	size_t out_indent = (depth * 2) * g_trace_indent;
+	size_t in_indent = (depth * 2 + 1) * g_trace_indent;
+
+	std::string out_indents("");
+	if (out_indent)
+	{
+		out_indents = std::string(out_indent, ' ');
+	}
+
+	std::string in_indents(in_indent, ' ');
+
+	if (m_attrib_count)
+	{
+		out << "\n" << in_indents;
+	}
+	out << "]\n";
+
+	out << out_indents << "]";
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	const char* format,
+	...
+) {
+	constexpr size_t bufsz = 256;
+	char buffer[bufsz];
+
+	va_list args;
+	va_start (args, format);
+	vsnprintf (buffer, bufsz, format, args);
+	va_end (args);
+
+	// Guarantee there is a nul terminator
+	buffer[bufsz - 1] = 0;
+
+	std::string value = "\"" + std::string(buffer) + "\"";
+
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("str", key, value);
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	float value
+) {
+	char buffer[256];
+	sprintf(buffer, "%.20g", (double)value);
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("float", key, buffer);
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	int value
+) {
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("int", key, std::to_string(value));
+}
+
+/* See header for documentation. */
+void trace_add_data(
+	const char* key,
+	unsigned int value
+) {
+	TraceNode* node = g_TraceLog->get_current_leaf();
+	node->add_attrib("int", key, std::to_string(value));
+}
+
+#endif
diff --git a/thirdparty/astcenc/astcenc_diagnostic_trace.h b/thirdparty/astcenc/astcenc_diagnostic_trace.h
new file mode 100644
index 0000000000..f5586b0ad5
--- /dev/null
+++ b/thirdparty/astcenc/astcenc_diagnostic_trace.h
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2021-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief This module provides a set of diagnostic tracing utilities.
+ *
+ * Overview
+ * ========
+ *
+ * The built-in diagnostic trace tool generates a hierarchical JSON tree structure. The tree
+ * hierarchy contains three levels:
+ *
+ *  - block
+ *  - pass
+ *  - candidate
+ *
+ * One block node exists for each compressed block in the image. One pass node exists for each major
+ * pass (N partition, M planes, O components) applied to a block. One candidate node exists for each
+ * encoding candidate trialed for a pass.
+ *
+ * Each node contains not only the hierarchy but also a number of attributes which explain its behavior.
+ * For example, the block node contains the block coordinates in the image, the pass explains the
+ * pass configuration, and the candidate will explain the candidate encoding such as weight
+ * decimation, refinement error, etc.
+ *
+ * Trace Nodes are designed as scope-managed C++ objects with stack-like push/pop behavior.
+ * Constructing a trace node on the stack will automatically add it to the current node as a child,
+ * and then make it the current node. Destroying the current node will pop the stack and make its
+ * parent the current node. This provides a robust mechanism for ensuring reliable nesting in the
+ * tree structure.
+ *
+ * A set of utility macros is provided to add attribute annotations to the current trace node.
+ *
+ * Usage
+ * =====
+ *
+ * Create Trace Nodes on the stack using the @c TRACE_NODE() macro. This will compile out completely
+ * in builds with diagnostics disabled.
+ *
+ * Add annotations to the current trace node using the @c trace_add_data() macro. This will
+ * similarly compile out completely in builds with diagnostics disabled.
+ *
+ * If you need to add additional code to support diagnostics-only behavior, wrap
+ * it in preprocessor guards:
+ *
+ *     #if defined(ASTCENC_DIAGNOSTICS)
+ *     #endif
+ */
+
+#ifndef ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
+#define ASTCENC_DIAGNOSTIC_TRACE_INCLUDED
+
+#if defined(ASTCENC_DIAGNOSTICS)
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+
+/**
+ * @brief Class representing a single node in the trace hierarchy.
+ */
+class TraceNode
+{
+public:
+	/**
+	 * @brief Construct a new node.
+	 *
+	 * Constructing a node will push to the top of the stack, automatically making it a child of
+	 * the current node, and then setting it to become the current node.
+	 *
+	 * @param format The format template for the node name.
+	 * @param ...    The format parameters.
+	 */
+	TraceNode(const char* format, ...);
+
+	/**
+	 * @brief Add an attribute to this node.
+	 *
+	 * Note that no quoting is applied to the @c value, so if quoting is needed it must be done by
+	 * the caller.
+	 *
+	 * @param type  The type of the attribute.
+	 * @param key   The key of the attribute.
+	 * @param value The value of the attribute.
+	 */
+	void add_attrib(std::string type, std::string key, std::string value);
+
+	/**
+	 * @brief Destroy this node.
+	 *
+	 * Destroying a node will pop it from the top of the stack, making its parent the current node.
+	 * It is invalid behavior to destroy a node that is not the current node; usage must conform to
+	 * stack push-pop semantics.
+	 */
+	~TraceNode();
+
+	/**
+	 * @brief The number of attributes and child nodes in this node.
+	 */
+	unsigned int m_attrib_count { 0 };
+};
+
+/**
+ * @brief Class representing the trace log file being written.
+ */
+class TraceLog
+{
+public:
+	/**
+	 * @brief Create a new trace log.
+	 *
+	 * The trace log is global; there can be only one at a time.
+	 *
+	 * @param file_name The name of the file to write.
+	 */
+	TraceLog(const char* file_name);
+
+	/**
+	 * @brief Destroy the trace log.
+	 *
+	 * Trace logs MUST be cleanly destroyed to ensure the file gets written.
+	 */
+	~TraceLog();
+
+	/**
+	 * @brief Get the current child node.
+	 *
+	 * @return The current leaf node.
+	 */
+	TraceNode* get_current_leaf();
+
+	/**
+	 * @brief Get the stack depth of the current child node.
+	 *
+	 * @return The current leaf node stack depth.
+	 */
+	size_t get_depth();
+
+	/**
+	 * @brief The file stream to write to.
+	 */
+	std::ofstream m_file;
+
+	/**
+	 * @brief The stack of nodes (newest at the back).
+	 */
+	std::vector<TraceNode*> m_stack;
+
+private:
+	/**
+	 * @brief The root node in the JSON file.
+	 */
+	TraceNode* m_root;
+};
+
+/**
+ * @brief Utility macro to create a trace node on the stack.
+ *
+ * @param name The variable name to use.
+ * @param ...  The name template and format parameters.
+ */
+#define TRACE_NODE(name, ...) TraceNode name(__VA_ARGS__);
+
+/**
+ * @brief Add a string annotation to the current node.
+ *
+ * @param key    The name of the attribute.
+ * @param format The format template for the attribute value.
+ * @param ...    The format parameters.
+ */
+void trace_add_data(const char* key, const char* format, ...);
+
+/**
+ * @brief Add a float annotation to the current node.
+ *
+ * @param key   The name of the attribute.
+ * @param value The value of the attribute.
+ */
+void trace_add_data(const char* key, float value);
+
+/**
+ * @brief Add an integer annotation to the current node.
+ *
+ * @param key   The name of the attribute.
+ * @param value The value of the attribute.
+ */
+void trace_add_data(const char* key, int value);
+
+/**
+ * @brief Add an unsigned integer annotation to the current node.
+ *
+ * @param key   The name of the attribute.
+ * @param value The value of the attribute.
+ */
+void trace_add_data(const char* key, unsigned int value);
+
+#else
+
+#define TRACE_NODE(name, ...)
+
+#define trace_add_data(...)
+
+#endif
+
+#endif
diff --git a/thirdparty/astcenc/astcenc_entry.cpp b/thirdparty/astcenc/astcenc_entry.cpp
new file mode 100644
index 0000000000..e59f1fe61a
--- /dev/null
+++ b/thirdparty/astcenc/astcenc_entry.cpp
@@ -0,0 +1,1427 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions for the library entrypoint.
+ */
+
+#include <array>
+#include <cstring>
+#include <new>
+
+#include "astcenc.h"
+#include "astcenc_internal_entry.h"
+#include "astcenc_diagnostic_trace.h"
+
+/**
+ * @brief Record of the quality tuning parameter values.
+ *
+ * See the @c astcenc_config structure for detailed parameter documentation.
+ *
+ * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
+ * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
+ * for the more thorough search presets because the underlying db_limit is so much higher.
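+ *
+ * (Editorial gloss, not upstream wording: an mse_overshoot entry of 3.5, as used by the
+ * faster presets below, lets the trial compression passes accept a candidate at up to
+ * 3.5x the MSE implied by the db_limit, leaving later refinement to close the gap.)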
+ */ +struct astcenc_preset_config +{ + float quality; + unsigned int tune_partition_count_limit; + unsigned int tune_2partition_index_limit; + unsigned int tune_3partition_index_limit; + unsigned int tune_4partition_index_limit; + unsigned int tune_block_mode_limit; + unsigned int tune_refinement_limit; + unsigned int tune_candidate_limit; + unsigned int tune_2partitioning_candidate_limit; + unsigned int tune_3partitioning_candidate_limit; + unsigned int tune_4partitioning_candidate_limit; + float tune_db_limit_a_base; + float tune_db_limit_b_base; + float tune_mse_overshoot; + float tune_2_partition_early_out_limit_factor; + float tune_3_partition_early_out_limit_factor; + float tune_2_plane_early_out_limit_correlation; +}; + +/** + * @brief The static presets for high bandwidth encodings (x < 25 texels per block). + */ +static const std::array<astcenc_preset_config, 6> preset_configs_high {{ + { + ASTCENC_PRE_FASTEST, + 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f + }, { + ASTCENC_PRE_FAST, + 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f + }, { + ASTCENC_PRE_MEDIUM, + 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f + }, { + ASTCENC_PRE_THOROUGH, + 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f + }, { + ASTCENC_PRE_VERYTHOROUGH, + 4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f + }, { + ASTCENC_PRE_EXHAUSTIVE, + 4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f + } +}}; + +/** + * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block). + */ +static const std::array<astcenc_preset_config, 6> preset_configs_mid {{ + { + ASTCENC_PRE_FASTEST, + 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f + }, { + ASTCENC_PRE_FAST, + 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f + }, { + ASTCENC_PRE_MEDIUM, + 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f + }, { + ASTCENC_PRE_THOROUGH, + 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f + }, { + ASTCENC_PRE_VERYTHOROUGH, + 4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f + }, { + ASTCENC_PRE_EXHAUSTIVE, + 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f + } +}}; + +/** + * @brief The static presets for low bandwidth encodings (64 <= x texels per block). + */ +static const std::array<astcenc_preset_config, 6> preset_configs_low {{ + { + ASTCENC_PRE_FASTEST, + 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f + }, { + ASTCENC_PRE_FAST, + 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f + }, { + ASTCENC_PRE_MEDIUM, + 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f + }, { + ASTCENC_PRE_THOROUGH, + 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f + }, { + ASTCENC_PRE_VERYTHOROUGH, + 4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f + }, { + ASTCENC_PRE_EXHAUSTIVE, + 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f + } +}}; + +/** + * @brief Validate CPU floating point meets assumptions made in the codec. + * + * The codec is written with the assumption that a float threaded through the @c if32 union will be + * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. 
This is always the + * case in an IEEE-754 compliant system, however not every system or compilation mode is actually + * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled. + * + * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. + */ +static astcenc_error validate_cpu_float() +{ + if32 p; + volatile float xprec_testval = 2.51f; + p.f = xprec_testval + 12582912.0f; + float q = p.f - 12582912.0f; + + if (q != 3.0f) + { + return ASTCENC_ERR_BAD_CPU_FLOAT; + } + + return ASTCENC_SUCCESS; +} + +/** + * @brief Validate CPU ISA support meets the requirements of this build of the library. + * + * Each library build is statically compiled for a particular set of CPU ISA features, such as the + * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU + * actually supports everything this build needs. + * + * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. + */ +static astcenc_error validate_cpu_isa() +{ + #if ASTCENC_SSE >= 41 + if (!cpu_supports_sse41()) + { + return ASTCENC_ERR_BAD_CPU_ISA; + } + #endif + + #if ASTCENC_POPCNT >= 1 + if (!cpu_supports_popcnt()) + { + return ASTCENC_ERR_BAD_CPU_ISA; + } + #endif + + #if ASTCENC_F16C >= 1 + if (!cpu_supports_f16c()) + { + return ASTCENC_ERR_BAD_CPU_ISA; + } + #endif + + #if ASTCENC_AVX >= 2 + if (!cpu_supports_avx2()) + { + return ASTCENC_ERR_BAD_CPU_ISA; + } + #endif + + return ASTCENC_SUCCESS; +} + +/** + * @brief Validate config profile. + * + * @param profile The profile to check. + * + * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. + */ +static astcenc_error validate_profile( + astcenc_profile profile +) { + // Values in this enum are from an external user, so not guaranteed to be + // bounded to the enum values + switch (static_cast<int>(profile)) + { + case ASTCENC_PRF_LDR_SRGB: + case ASTCENC_PRF_LDR: + case ASTCENC_PRF_HDR_RGB_LDR_A: + case ASTCENC_PRF_HDR: + return ASTCENC_SUCCESS; + default: + return ASTCENC_ERR_BAD_PROFILE; + } +} + +/** + * @brief Validate block size. + * + * @param block_x The block x dimensions. + * @param block_y The block y dimensions. + * @param block_z The block z dimensions. + * + * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. + */ +static astcenc_error validate_block_size( + unsigned int block_x, + unsigned int block_y, + unsigned int block_z +) { + // Test if this is a legal block size at all + bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) || + ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z))); + if (!is_legal) + { + return ASTCENC_ERR_BAD_BLOCK_SIZE; + } + + // Test if this build has sufficient capacity for this block size + bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS; + if (!have_capacity) + { + return ASTCENC_ERR_NOT_IMPLEMENTED; + } + + return ASTCENC_SUCCESS; +} + +/** + * @brief Validate flags. + * + * @param flags The flags to check. + * + * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. 
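+ *
+ * (Editorial example: passing ASTCENC_FLG_MAP_NORMAL together with ASTCENC_FLG_MAP_RGBM
+ * sets two map-type bits, so the popcount test below fails with ASTCENC_ERR_BAD_FLAGS.)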
+ */
+static astcenc_error validate_flags(
+	unsigned int flags
+) {
+	// Flags field must not contain any unknown flag bits
+	unsigned int exMask = ~ASTCENC_ALL_FLAGS;
+	if (popcount(flags & exMask) != 0)
+	{
+		return ASTCENC_ERR_BAD_FLAGS;
+	}
+
+	// Flags field must only contain at most a single map type
+	exMask = ASTCENC_FLG_MAP_NORMAL
+	       | ASTCENC_FLG_MAP_RGBM;
+	if (popcount(flags & exMask) > 1)
+	{
+		return ASTCENC_ERR_BAD_FLAGS;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Validate single channel compression swizzle.
+ *
+ * @param swizzle The swizzle to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_compression_swz(
+	astcenc_swz swizzle
+) {
+	// Not all enum values are handled; SWZ_Z is invalid for compression
+	switch (static_cast<int>(swizzle))
+	{
+	case ASTCENC_SWZ_R:
+	case ASTCENC_SWZ_G:
+	case ASTCENC_SWZ_B:
+	case ASTCENC_SWZ_A:
+	case ASTCENC_SWZ_0:
+	case ASTCENC_SWZ_1:
+		return ASTCENC_SUCCESS;
+	default:
+		return ASTCENC_ERR_BAD_SWIZZLE;
+	}
+}
+
+/**
+ * @brief Validate overall compression swizzle.
+ *
+ * @param swizzle The swizzle to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_compression_swizzle(
+	const astcenc_swizzle& swizzle
+) {
+	if (validate_compression_swz(swizzle.r) ||
+	    validate_compression_swz(swizzle.g) ||
+	    validate_compression_swz(swizzle.b) ||
+	    validate_compression_swz(swizzle.a))
+	{
+		return ASTCENC_ERR_BAD_SWIZZLE;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+#endif
+
+/**
+ * @brief Validate single channel decompression swizzle.
+ *
+ * @param swizzle The swizzle to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_decompression_swz(
+	astcenc_swz swizzle
+) {
+	// Values in this enum are from an external user, so not guaranteed to be
+	// bounded to the enum values
+	switch (static_cast<int>(swizzle))
+	{
+	case ASTCENC_SWZ_R:
+	case ASTCENC_SWZ_G:
+	case ASTCENC_SWZ_B:
+	case ASTCENC_SWZ_A:
+	case ASTCENC_SWZ_0:
+	case ASTCENC_SWZ_1:
+	case ASTCENC_SWZ_Z:
+		return ASTCENC_SUCCESS;
+	default:
+		return ASTCENC_ERR_BAD_SWIZZLE;
+	}
+}
+
+/**
+ * @brief Validate overall decompression swizzle.
+ *
+ * @param swizzle The swizzle to check.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
+ */
+static astcenc_error validate_decompression_swizzle(
+	const astcenc_swizzle& swizzle
+) {
+	if (validate_decompression_swz(swizzle.r) ||
+	    validate_decompression_swz(swizzle.g) ||
+	    validate_decompression_swz(swizzle.b) ||
+	    validate_decompression_swz(swizzle.a))
+	{
+		return ASTCENC_ERR_BAD_SWIZZLE;
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+/**
+ * Validate that an incoming configuration is in-spec.
+ *
+ * This function can respond in two ways:
+ *
+ *   * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is
+ *     thrown for out-of-range inputs in this case.
+ *   * Numerical and logic inputs that are logically invalid, and which make no sense
+ *     algorithmically, will return an error.
+ *
+ * @param[in,out] config   The input compressor configuration.
+ *
+ * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
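+ *
+ * (Editorial example: a tune_partition_count_limit of 9 is silently clamped into the
+ * supported 1..4 range, whereas a configuration with all four cw_*_weight values set
+ * to zero is rejected with ASTCENC_ERR_BAD_PARAM.)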
+ */ +static astcenc_error validate_config( + astcenc_config &config +) { + astcenc_error status; + + status = validate_profile(config.profile); + if (status != ASTCENC_SUCCESS) + { + return status; + } + + status = validate_flags(config.flags); + if (status != ASTCENC_SUCCESS) + { + return status; + } + + status = validate_block_size(config.block_x, config.block_y, config.block_z); + if (status != ASTCENC_SUCCESS) + { + return status; + } + +#if defined(ASTCENC_DECOMPRESS_ONLY) + // Decompress-only builds only support decompress-only contexts + if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)) + { + return ASTCENC_ERR_BAD_PARAM; + } +#endif + + config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f); + + config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u); + config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); + config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); + config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); + config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u); + config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u); + config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES); + config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); + config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); + config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); + config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f); + config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f); + config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f); + config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f); + config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f); + + // Specifying a zero weight color component is not allowed; force to small value + float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight), + astc::max(config.cw_b_weight, config.cw_a_weight)); + if (max_weight > 0.0f) + { + max_weight /= 1000.0f; + config.cw_r_weight = astc::max(config.cw_r_weight, max_weight); + config.cw_g_weight = astc::max(config.cw_g_weight, max_weight); + config.cw_b_weight = astc::max(config.cw_b_weight, max_weight); + config.cw_a_weight = astc::max(config.cw_a_weight, max_weight); + } + // If all color components error weights are zero then return an error + else + { + return ASTCENC_ERR_BAD_PARAM; + } + + return ASTCENC_SUCCESS; +} + +/* See header for documentation. */ +astcenc_error astcenc_config_init( + astcenc_profile profile, + unsigned int block_x, + unsigned int block_y, + unsigned int block_z, + float quality, + unsigned int flags, + astcenc_config* configp +) { + astcenc_error status; + + // Check basic library compatibility options here so they are checked early. 
+	// are repeated in context_alloc for cases where callers use a manually defined config struct
+	status = validate_cpu_isa();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	status = validate_cpu_float();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	// Zero init all config fields; although most of them will be overwritten
+	astcenc_config& config = *configp;
+	std::memset(&config, 0, sizeof(config));
+
+	// Process the block size
+	block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
+	status = validate_block_size(block_x, block_y, block_z);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	config.block_x = block_x;
+	config.block_y = block_y;
+	config.block_z = block_z;
+
+	float texels = static_cast<float>(block_x * block_y * block_z);
+	float ltexels = logf(texels) / logf(10.0f);
+
+	// Process the performance quality level or preset; note that this must be done before we
+	// process any additional settings, such as color profile and flags, which may replace some of
+	// these settings with more use case tuned values
+	if (quality < ASTCENC_PRE_FASTEST ||
+	    quality > ASTCENC_PRE_EXHAUSTIVE)
+	{
+		return ASTCENC_ERR_BAD_QUALITY;
+	}
+
+	static const std::array<astcenc_preset_config, 6>* preset_configs;
+	int texels_int = block_x * block_y * block_z;
+	if (texels_int < 25)
+	{
+		preset_configs = &preset_configs_high;
+	}
+	else if (texels_int < 64)
+	{
+		preset_configs = &preset_configs_mid;
+	}
+	else
+	{
+		preset_configs = &preset_configs_low;
+	}
+
+	// Determine which preset to use, or which pair to interpolate
+	size_t start;
+	size_t end;
+	for (end = 0; end < preset_configs->size(); end++)
+	{
+		if ((*preset_configs)[end].quality >= quality)
+		{
+			break;
+		}
+	}
+
+	start = end == 0 ? 0 : end - 1;
+
+	// Start and end node are the same - so just transfer the values.
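+	// If the quality level hits a preset node exactly the values are copied
+	// verbatim; otherwise the two neighboring nodes are blended with weights
+	// proportional to the distance in quality, i.e. wt_node_a = (q_b - quality) / (q_b - q_a)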
+	if (start == end)
+	{
+		config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
+		config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
+		config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
+		config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
+		config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
+		config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
+		config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
+		config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
+		config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
+		config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES);
+		config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
+		                                 (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
+
+		config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
+
+		config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
+		config.tune_3_partition_early_out_limit_factor = (*preset_configs)[start].tune_3_partition_early_out_limit_factor;
+		config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
+	}
+	// Start and end node are not the same - so interpolate between them
+	else
+	{
+		auto& node_a = (*preset_configs)[start];
+		auto& node_b = (*preset_configs)[end];
+
+		float wt_range = node_b.quality - node_a.quality;
+		assert(wt_range > 0);
+
+		// Compute interpolation factors
+		float wt_node_a = (node_b.quality - quality) / wt_range;
+		float wt_node_b = (quality - node_a.quality) / wt_range;
+
+		#define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
+		#define LERPI(param) astc::flt2int_rtn(\
+		                         (static_cast<float>(node_a.param) * wt_node_a) + \
+		                         (static_cast<float>(node_b.param) * wt_node_b))
+		#define LERPUI(param) static_cast<unsigned int>(LERPI(param))
+
+		config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
+		config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
+		config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
+		config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
+		config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
+		config.tune_refinement_limit = LERPI(tune_refinement_limit);
+		config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
+		                                        TUNE_MAX_TRIAL_CANDIDATES);
+		config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
+		config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
+		config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
+		                                                      BLOCK_MAX_PARTITIONINGS);
+		config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
+		                                 LERP(tune_db_limit_b_base) - 19 * ltexels);
+
+		config.tune_mse_overshoot = LERP(tune_mse_overshoot);
+
+		config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
+		config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
+		config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
+		#undef LERP
+		#undef LERPI
+		#undef LERPUI
+	}
+
+	// Set heuristics to the defaults for each color profile
+	config.cw_r_weight = 1.0f;
+	config.cw_g_weight = 1.0f;
+	config.cw_b_weight = 1.0f;
+	config.cw_a_weight = 1.0f;
+
+	config.a_scale_radius = 0;
+
+	config.rgbm_m_scale = 0.0f;
+
+	config.profile = profile;
+
+	// Values in this enum are from an external user, so not guaranteed to be
+	// bounded to the enum values
+	switch (static_cast<int>(profile))
+	{
+	case ASTCENC_PRF_LDR:
+	case ASTCENC_PRF_LDR_SRGB:
+		break;
+	case ASTCENC_PRF_HDR_RGB_LDR_A:
+	case ASTCENC_PRF_HDR:
+		config.tune_db_limit = 999.0f;
+		break;
+	default:
+		return ASTCENC_ERR_BAD_PROFILE;
+	}
+
+	// Flags field must not contain any unknown flag bits
+	status = validate_flags(flags);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	if (flags & ASTCENC_FLG_MAP_NORMAL)
+	{
+		// Normal map encoding uses L+A blocks, so allow one more partitioning
+		// than normal. We need fewer bits for endpoints, so we are more likely
+		// to be able to use more partitions than an RGB/RGBA block
+		config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
+
+		config.cw_g_weight = 0.0f;
+		config.cw_b_weight = 0.0f;
+		config.tune_2_partition_early_out_limit_factor *= 1.5f;
+		config.tune_3_partition_early_out_limit_factor *= 1.5f;
+		config.tune_2_plane_early_out_limit_correlation = 0.99f;
+
+		// Normals are prone to blocking artifacts on smooth curves
+		// so force compressor to try harder here ...
+		config.tune_db_limit *= 1.03f;
+	}
+	else if (flags & ASTCENC_FLG_MAP_RGBM)
+	{
+		config.rgbm_m_scale = 5.0f;
+		config.cw_a_weight = 2.0f * config.rgbm_m_scale;
+	}
+	else // (This is color data)
+	{
+		// This is a very basic perceptual metric for RGB color data, which weights error
+		// significance by the perceptual luminance contribution of each color channel. For
+		// luminance the usual weights to compute luminance from a linear RGB value are as
+		// follows:
+		//
+		//     l = r * 0.3 + g * 0.59 + b * 0.11
+		//
+		// ... but we scale these up to keep a better balance between color and alpha. Note
+		// that if the content is using alpha we'd recommend using the -a option to weight
+		// the color contribution by the alpha transparency.
+		if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
+		{
+			config.cw_r_weight = 0.30f * 2.25f;
+			config.cw_g_weight = 0.59f * 2.25f;
+			config.cw_b_weight = 0.11f * 2.25f;
+		}
+	}
+	config.flags = flags;
+
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation. */
+astcenc_error astcenc_context_alloc(
+	const astcenc_config* configp,
+	unsigned int thread_count,
+	astcenc_context** context
+) {
+	astcenc_error status;
+	const astcenc_config& config = *configp;
+
+	status = validate_cpu_isa();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	status = validate_cpu_float();
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	if (thread_count == 0)
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	// Force single threaded compressor use in diagnostic mode.
+	if (thread_count != 1)
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+#endif
+
+	astcenc_context* ctxo = new astcenc_context;
+	astcenc_contexti* ctx = &ctxo->context;
+	ctx->thread_count = thread_count;
+	ctx->config = config;
+	ctx->working_buffers = nullptr;
+
+	// These are allocated per-compress, as they depend on image size
+	ctx->input_alpha_averages = nullptr;
+
+	// Copy the config first and validate the copy (we may modify it)
+	status = validate_config(ctx->config);
+	if (status != ASTCENC_SUCCESS)
+	{
+		delete ctxo;
+		return status;
+	}
+
+	ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
+	bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
+	init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
+	                           can_omit_modes,
+	                           config.tune_partition_count_limit,
+	                           static_cast<float>(config.tune_block_mode_limit) / 100.0f,
+	                           *ctx->bsd);
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	// Do setup only needed by compression
+	if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
+	{
+		// Turn a dB limit into a per-texel error for faster use later
+		if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
+		{
+			ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
+		}
+		else
+		{
+			ctx->config.tune_db_limit = 0.0f;
+		}
+
+		size_t worksize = sizeof(compression_working_buffers) * thread_count;
+		ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
+		static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
+		              "compression_working_buffers size must be multiple of vector alignment");
+		if (!ctx->working_buffers)
+		{
+			aligned_free<block_size_descriptor>(ctx->bsd);
+			delete ctxo;
+			*context = nullptr;
+			return ASTCENC_ERR_OUT_OF_MEM;
+		}
+	}
+#endif
+
+#if defined(ASTCENC_DIAGNOSTICS)
+	ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
+	if (!ctx->trace_log->m_file)
+	{
+		return ASTCENC_ERR_DTRACE_FAILURE;
+	}
+
+	trace_add_data("block_x", config.block_x);
+	trace_add_data("block_y", config.block_y);
+	trace_add_data("block_z", config.block_z);
+#endif
+
+	*context = ctxo;
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+	prepare_angular_tables();
+#endif
+
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation. */
+void astcenc_context_free(
+	astcenc_context* ctxo
+) {
+	if (ctxo)
+	{
+		astcenc_contexti* ctx = &ctxo->context;
+		aligned_free<compression_working_buffers>(ctx->working_buffers);
+		aligned_free<block_size_descriptor>(ctx->bsd);
+#if defined(ASTCENC_DIAGNOSTICS)
+		delete ctx->trace_log;
+#endif
+		delete ctxo;
+	}
+}
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Compress an image, after any preflight has completed.
+ *
+ * @param[out] ctxo The compressor context.
+ * @param thread_index The thread index.
+ * @param image The input image.
+ * @param swizzle The input swizzle.
+ * @param[out] buffer The output array for the compressed data.
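+ *
+ * The output buffer must provide at least 16 bytes of storage per block,
+ * with block counts rounded up, i.e. ceil(dim / block_dim) per axis.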
+ */ +static void compress_image( + astcenc_context& ctxo, + unsigned int thread_index, + const astcenc_image& image, + const astcenc_swizzle& swizzle, + uint8_t* buffer +) { + astcenc_contexti& ctx = ctxo.context; + const block_size_descriptor& bsd = *ctx.bsd; + astcenc_profile decode_mode = ctx.config.profile; + + image_block blk; + + int block_x = bsd.xdim; + int block_y = bsd.ydim; + int block_z = bsd.zdim; + blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z); + + int dim_x = image.dim_x; + int dim_y = image.dim_y; + int dim_z = image.dim_z; + + int xblocks = (dim_x + block_x - 1) / block_x; + int yblocks = (dim_y + block_y - 1) / block_y; + int zblocks = (dim_z + block_z - 1) / block_z; + int block_count = zblocks * yblocks * xblocks; + + int row_blocks = xblocks; + int plane_blocks = xblocks * yblocks; + + // Populate the block channel weights + blk.channel_weight = vfloat4(ctx.config.cw_r_weight, + ctx.config.cw_g_weight, + ctx.config.cw_b_weight, + ctx.config.cw_a_weight); + + // Use preallocated scratch buffer + auto& temp_buffers = ctx.working_buffers[thread_index]; + + // Only the first thread actually runs the initializer + ctxo.manage_compress.init(block_count); + + // Determine if we can use an optimized load function + bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || + (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A); + + bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) || + (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A); + + bool use_fast_load = !needs_swz && !needs_hdr && + block_z == 1 && image.data_type == ASTCENC_TYPE_U8; + + auto load_func = load_image_block; + if (use_fast_load) + { + load_func = load_image_block_fast_ldr; + } + + // All threads run this processing loop until there is no work remaining + while (true) + { + unsigned int count; + unsigned int base = ctxo.manage_compress.get_task_assignment(16, count); + if (!count) + { + break; + } + + for (unsigned int i = base; i < base + count; i++) + { + // Decode i into x, y, z block indices + int z = i / plane_blocks; + unsigned int rem = i - (z * plane_blocks); + int y = rem / row_blocks; + int x = rem - (y * row_blocks); + + // Test if we can apply some basic alpha-scale RDO + bool use_full_block = true; + if (ctx.config.a_scale_radius != 0 && block_z == 1) + { + int start_x = x * block_x; + int end_x = astc::min(dim_x, start_x + block_x); + + int start_y = y * block_y; + int end_y = astc::min(dim_y, start_y + block_y); + + // SATs accumulate error, so don't test exactly zero. Test for + // less than 1 alpha in the expanded block footprint that + // includes the alpha radius. + int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1); + + int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1); + + float footprint = static_cast<float>(x_footprint * y_footprint); + float threshold = 0.9f / (255.0f * footprint); + + // Do we have any alpha values? 
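+				// Assume fully transparent unless an average above the
+				// threshold is found; only a block whose entire footprint
+				// stays below the threshold takes the constant-color path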
+ use_full_block = false; + for (int ay = start_y; ay < end_y; ay++) + { + for (int ax = start_x; ax < end_x; ax++) + { + float a_avg = ctx.input_alpha_averages[ay * dim_x + ax]; + if (a_avg > threshold) + { + use_full_block = true; + ax = end_x; + ay = end_y; + } + } + } + } + + // Fetch the full block for compression + if (use_full_block) + { + load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle); + + // Scale RGB error contribution by the maximum alpha in the block + // This encourages preserving alpha accuracy in regions with high + // transparency, and can buy up to 0.5 dB PSNR. + if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT) + { + float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f); + blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale, + ctx.config.cw_g_weight * alpha_scale, + ctx.config.cw_b_weight * alpha_scale, + ctx.config.cw_a_weight); + } + } + // Apply alpha scale RDO - substitute constant color block + else + { + blk.origin_texel = vfloat4::zero(); + blk.data_min = vfloat4::zero(); + blk.data_mean = vfloat4::zero(); + blk.data_max = vfloat4::zero(); + blk.grayscale = true; + } + + int offset = ((z * yblocks + y) * xblocks + x) * 16; + uint8_t *bp = buffer + offset; + physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp); + compress_block(ctx, blk, *pcb, temp_buffers); + } + + ctxo.manage_compress.complete_task_assignment(count); + } +} + +/** + * @brief Compute regional averages in an image. + * + * This function can be called by multiple threads, but only after a single + * thread calls the setup function @c init_compute_averages(). + * + * Results are written back into @c img->input_alpha_averages. + * + * @param[out] ctx The context. + * @param ag The average and variance arguments created during setup. + */ +static void compute_averages( + astcenc_context& ctx, + const avg_args &ag +) { + pixel_region_args arg = ag.arg; + arg.work_memory = new vfloat4[ag.work_memory_size]; + + int size_x = ag.img_size_x; + int size_y = ag.img_size_y; + int size_z = ag.img_size_z; + + int step_xy = ag.blk_size_xy; + int step_z = ag.blk_size_z; + + int y_tasks = (size_y + step_xy - 1) / step_xy; + + // All threads run this processing loop until there is no work remaining + while (true) + { + unsigned int count; + unsigned int base = ctx.manage_avg.get_task_assignment(16, count); + if (!count) + { + break; + } + + for (unsigned int i = base; i < base + count; i++) + { + int z = (i / (y_tasks)) * step_z; + int y = (i - (z * y_tasks)) * step_xy; + + arg.size_z = astc::min(step_z, size_z - z); + arg.offset_z = z; + + arg.size_y = astc::min(step_xy, size_y - y); + arg.offset_y = y; + + for (int x = 0; x < size_x; x += step_xy) + { + arg.size_x = astc::min(step_xy, size_x - x); + arg.offset_x = x; + compute_pixel_region_variance(ctx.context, arg); + } + } + + ctx.manage_avg.complete_task_assignment(count); + } + + delete[] arg.work_memory; +} + +#endif + +/* See header for documentation. 
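+ *
+ * As a non-normative usage sketch (error handling omitted; image_data, dim_x,
+ * dim_y, buffer, and buffer_len are assumed caller-provided, and the preset
+ * and type names are those declared in the public astcenc.h header), a
+ * minimal single-threaded LDR compression looks like:
+ *
+ *     astcenc_config config;
+ *     astcenc_config_init(ASTCENC_PRF_LDR, 6, 6, 1, ASTCENC_PRE_MEDIUM, 0, &config);
+ *
+ *     astcenc_context* context;
+ *     astcenc_context_alloc(&config, 1, &context);
+ *
+ *     // One 2D uint8_t RGBA slice in, 16 bytes per 6x6 block out
+ *     uint8_t* slices[1] { image_data };
+ *     astcenc_image image { dim_x, dim_y, 1, ASTCENC_TYPE_U8,
+ *                           reinterpret_cast<void**>(slices) };
+ *     astcenc_swizzle swizzle { ASTCENC_SWZ_R, ASTCENC_SWZ_G,
+ *                               ASTCENC_SWZ_B, ASTCENC_SWZ_A };
+ *
+ *     astcenc_compress_image(context, &image, &swizzle, buffer, buffer_len, 0);
+ *     astcenc_context_free(context);
+ *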
*/ +astcenc_error astcenc_compress_image( + astcenc_context* ctxo, + astcenc_image* imagep, + const astcenc_swizzle* swizzle, + uint8_t* data_out, + size_t data_len, + unsigned int thread_index +) { +#if defined(ASTCENC_DECOMPRESS_ONLY) + (void)ctxo; + (void)imagep; + (void)swizzle; + (void)data_out; + (void)data_len; + (void)thread_index; + return ASTCENC_ERR_BAD_CONTEXT; +#else + astcenc_contexti* ctx = &ctxo->context; + astcenc_error status; + astcenc_image& image = *imagep; + + if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY) + { + return ASTCENC_ERR_BAD_CONTEXT; + } + + status = validate_compression_swizzle(*swizzle); + if (status != ASTCENC_SUCCESS) + { + return status; + } + + if (thread_index >= ctx->thread_count) + { + return ASTCENC_ERR_BAD_PARAM; + } + + unsigned int block_x = ctx->config.block_x; + unsigned int block_y = ctx->config.block_y; + unsigned int block_z = ctx->config.block_z; + + unsigned int xblocks = (image.dim_x + block_x - 1) / block_x; + unsigned int yblocks = (image.dim_y + block_y - 1) / block_y; + unsigned int zblocks = (image.dim_z + block_z - 1) / block_z; + + // Check we have enough output space (16 bytes per block) + size_t size_needed = xblocks * yblocks * zblocks * 16; + if (data_len < size_needed) + { + return ASTCENC_ERR_OUT_OF_MEM; + } + + // If context thread count is one then implicitly reset + if (ctx->thread_count == 1) + { + astcenc_compress_reset(ctxo); + } + + if (ctx->config.a_scale_radius != 0) + { + // First thread to enter will do setup, other threads will subsequently + // enter the critical section but simply skip over the initialization + auto init_avg = [ctx, &image, swizzle]() { + // Perform memory allocations for the destination buffers + size_t texel_count = image.dim_x * image.dim_y * image.dim_z; + ctx->input_alpha_averages = new float[texel_count]; + + return init_compute_averages( + image, ctx->config.a_scale_radius, *swizzle, + ctx->avg_preprocess_args); + }; + + // Only the first thread actually runs the initializer + ctxo->manage_avg.init(init_avg); + + // All threads will enter this function and dynamically grab work + compute_averages(*ctxo, ctx->avg_preprocess_args); + } + + // Wait for compute_averages to complete before compressing + ctxo->manage_avg.wait(); + + compress_image(*ctxo, thread_index, image, *swizzle, data_out); + + // Wait for compress to complete before freeing memory + ctxo->manage_compress.wait(); + + auto term_compress = [ctx]() { + delete[] ctx->input_alpha_averages; + ctx->input_alpha_averages = nullptr; + }; + + // Only the first thread to arrive actually runs the term + ctxo->manage_compress.term(term_compress); + + return ASTCENC_SUCCESS; +#endif +} + +/* See header for documentation. */ +astcenc_error astcenc_compress_reset( + astcenc_context* ctxo +) { +#if defined(ASTCENC_DECOMPRESS_ONLY) + (void)ctxo; + return ASTCENC_ERR_BAD_CONTEXT; +#else + astcenc_contexti* ctx = &ctxo->context; + if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY) + { + return ASTCENC_ERR_BAD_CONTEXT; + } + + ctxo->manage_avg.reset(); + ctxo->manage_compress.reset(); + return ASTCENC_SUCCESS; +#endif +} + +/* See header for documentation. 
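+ *
+ * A matching decompression sketch, under the same assumptions as the
+ * compression example above (a decompress-only context suffices here):
+ *
+ *     astcenc_config config;
+ *     astcenc_config_init(ASTCENC_PRF_LDR, 6, 6, 1, ASTCENC_PRE_MEDIUM,
+ *                         ASTCENC_FLG_DECOMPRESS_ONLY, &config);
+ *
+ *     astcenc_context* context;
+ *     astcenc_context_alloc(&config, 1, &context);
+ *
+ *     uint8_t* slices[1] { decoded_pixels };
+ *     astcenc_image image { dim_x, dim_y, 1, ASTCENC_TYPE_U8,
+ *                           reinterpret_cast<void**>(slices) };
+ *     astcenc_swizzle swizzle { ASTCENC_SWZ_R, ASTCENC_SWZ_G,
+ *                               ASTCENC_SWZ_B, ASTCENC_SWZ_A };
+ *
+ *     astcenc_decompress_image(context, compressed, compressed_len,
+ *                              &image, &swizzle, 0);
+ *     astcenc_context_free(context);
+ *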
*/
+astcenc_error astcenc_decompress_image(
+	astcenc_context* ctxo,
+	const uint8_t* data,
+	size_t data_len,
+	astcenc_image* image_outp,
+	const astcenc_swizzle* swizzle,
+	unsigned int thread_index
+) {
+	astcenc_error status;
+	astcenc_image& image_out = *image_outp;
+	astcenc_contexti* ctx = &ctxo->context;
+
+	// Today this doesn't matter (working set on stack) but might in future ...
+	if (thread_index >= ctx->thread_count)
+	{
+		return ASTCENC_ERR_BAD_PARAM;
+	}
+
+	status = validate_decompression_swizzle(*swizzle);
+	if (status != ASTCENC_SUCCESS)
+	{
+		return status;
+	}
+
+	unsigned int block_x = ctx->config.block_x;
+	unsigned int block_y = ctx->config.block_y;
+	unsigned int block_z = ctx->config.block_z;
+
+	unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
+	unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
+	unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
+
+	int row_blocks = xblocks;
+	int plane_blocks = xblocks * yblocks;
+
+	// Check we have enough input data (16 bytes per block)
+	size_t size_needed = xblocks * yblocks * zblocks * 16;
+	if (data_len < size_needed)
+	{
+		return ASTCENC_ERR_OUT_OF_MEM;
+	}
+
+	image_block blk;
+	blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
+
+	// If context thread count is one then implicitly reset
+	if (ctx->thread_count == 1)
+	{
+		astcenc_decompress_reset(ctxo);
+	}
+
+	// Only the first thread actually runs the initializer
+	ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
+
+	// All threads run this processing loop until there is no work remaining
+	while (true)
+	{
+		unsigned int count;
+		unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
+		if (!count)
+		{
+			break;
+		}
+
+		for (unsigned int i = base; i < base + count; i++)
+		{
+			// Decode i into x, y, z block indices
+			int z = i / plane_blocks;
+			unsigned int rem = i - (z * plane_blocks);
+			int y = rem / row_blocks;
+			int x = rem - (y * row_blocks);
+
+			unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
+			const uint8_t* bp = data + offset;
+
+			const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
+			symbolic_compressed_block scb;
+
+			physical_to_symbolic(*ctx->bsd, pcb, scb);
+
+			decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
+			                          x * block_x, y * block_y, z * block_z,
+			                          scb, blk);
+
+			store_image_block(image_out, blk, *ctx->bsd,
+			                  x * block_x, y * block_y, z * block_z, *swizzle);
+		}
+
+		ctxo->manage_decompress.complete_task_assignment(count);
+	}
+
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation. */
+astcenc_error astcenc_decompress_reset(
+	astcenc_context* ctxo
+) {
+	ctxo->manage_decompress.reset();
+	return ASTCENC_SUCCESS;
+}
+
+/* See header for documentation.
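+ *
+ * A sketch of inspecting one 16 byte block (block_bytes is an assumed pointer
+ * into a compressed buffer; the context is allocated as in the examples above):
+ *
+ *     astcenc_block_info info;
+ *     if (astcenc_get_block_info(context, block_bytes, &info) == ASTCENC_SUCCESS &&
+ *         !info.is_error_block && !info.is_constant_block)
+ *     {
+ *         // e.g. info.partition_count, info.is_dual_plane_block, and the
+ *         // unpacked endpoint colors are now valid to read
+ *     }
+ *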
*/
+astcenc_error astcenc_get_block_info(
+	astcenc_context* ctxo,
+	const uint8_t data[16],
+	astcenc_block_info* info
+) {
+#if defined(ASTCENC_DECOMPRESS_ONLY)
+	(void)ctxo;
+	(void)data;
+	(void)info;
+	return ASTCENC_ERR_BAD_CONTEXT;
+#else
+	astcenc_contexti* ctx = &ctxo->context;
+
+	// Decode the compressed data into a symbolic form
+	const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(data);
+	symbolic_compressed_block scb;
+	physical_to_symbolic(*ctx->bsd, pcb, scb);
+
+	// Fetch the appropriate partition and decimation tables
+	block_size_descriptor& bsd = *ctx->bsd;
+
+	// Start from a clean slate
+	memset(info, 0, sizeof(*info));
+
+	// Basic info we can always populate
+	info->profile = ctx->config.profile;
+
+	info->block_x = ctx->config.block_x;
+	info->block_y = ctx->config.block_y;
+	info->block_z = ctx->config.block_z;
+	info->texel_count = bsd.texel_count;
+
+	// Check for error blocks first
+	info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
+	if (info->is_error_block)
+	{
+		return ASTCENC_SUCCESS;
+	}
+
+	// Check for constant color blocks second
+	info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
+	                          scb.block_type == SYM_BTYPE_CONST_U16;
+	if (info->is_constant_block)
+	{
+		return ASTCENC_SUCCESS;
+	}
+
+	// Otherwise handle a full block; known to be valid after conditions above have been checked
+	int partition_count = scb.partition_count;
+	const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
+
+	const block_mode& bm = bsd.get_block_mode(scb.block_mode);
+	const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	info->weight_x = di.weight_x;
+	info->weight_y = di.weight_y;
+	info->weight_z = di.weight_z;
+
+	info->is_dual_plane_block = bm.is_dual_plane != 0;
+
+	info->partition_count = scb.partition_count;
+	info->partition_index = scb.partition_index;
+	info->dual_plane_component = scb.plane2_component;
+
+	info->color_level_count = get_quant_level(scb.get_color_quant_mode());
+	info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
+
+	// Unpack color endpoints for each active partition
+	for (unsigned int i = 0; i < scb.partition_count; i++)
+	{
+		bool rgb_hdr;
+		bool a_hdr;
+		vint4 endpnt[2];
+
+		unpack_color_endpoints(ctx->config.profile,
+		                       scb.color_formats[i],
+		                       scb.color_values[i],
+		                       rgb_hdr, a_hdr,
+		                       endpnt[0], endpnt[1]);
+
+		// Store the color endpoint mode info
+		info->color_endpoint_modes[i] = scb.color_formats[i];
+		info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
+
+		// Store the unpacked and decoded color endpoint
+		vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
+		for (int j = 0; j < 2; j++)
+		{
+			vint4 color_lns = lns_to_sf16(endpnt[j]);
+			vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
+			vint4 datai = select(color_unorm, color_lns, hdr_mask);
+			store(float16_to_float(datai), info->color_endpoints[i][j]);
+		}
+	}
+
+	// Unpack weights for each texel
+	int weight_plane1[BLOCK_MAX_TEXELS];
+	int weight_plane2[BLOCK_MAX_TEXELS];
+
+	unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
+	for (unsigned int i = 0; i < bsd.texel_count; i++)
+	{
+		info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
+		if (info->is_dual_plane_block)
+		{
+			info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
+		}
+	}
+
+	// Unpack partition assignments for each texel
+	for (unsigned int i = 0; i < bsd.texel_count; i++)
+
{ + info->partition_assignment[i] = pi.partition_of_texel[i]; + } + + return ASTCENC_SUCCESS; +#endif +} + +/* See header for documentation. */ +const char* astcenc_get_error_string( + astcenc_error status +) { + // Values in this enum are from an external user, so not guaranteed to be + // bounded to the enum values + switch (static_cast<int>(status)) + { + case ASTCENC_SUCCESS: + return "ASTCENC_SUCCESS"; + case ASTCENC_ERR_OUT_OF_MEM: + return "ASTCENC_ERR_OUT_OF_MEM"; + case ASTCENC_ERR_BAD_CPU_FLOAT: + return "ASTCENC_ERR_BAD_CPU_FLOAT"; + case ASTCENC_ERR_BAD_CPU_ISA: + return "ASTCENC_ERR_BAD_CPU_ISA"; + case ASTCENC_ERR_BAD_PARAM: + return "ASTCENC_ERR_BAD_PARAM"; + case ASTCENC_ERR_BAD_BLOCK_SIZE: + return "ASTCENC_ERR_BAD_BLOCK_SIZE"; + case ASTCENC_ERR_BAD_PROFILE: + return "ASTCENC_ERR_BAD_PROFILE"; + case ASTCENC_ERR_BAD_QUALITY: + return "ASTCENC_ERR_BAD_QUALITY"; + case ASTCENC_ERR_BAD_FLAGS: + return "ASTCENC_ERR_BAD_FLAGS"; + case ASTCENC_ERR_BAD_SWIZZLE: + return "ASTCENC_ERR_BAD_SWIZZLE"; + case ASTCENC_ERR_BAD_CONTEXT: + return "ASTCENC_ERR_BAD_CONTEXT"; + case ASTCENC_ERR_NOT_IMPLEMENTED: + return "ASTCENC_ERR_NOT_IMPLEMENTED"; +#if defined(ASTCENC_DIAGNOSTICS) + case ASTCENC_ERR_DTRACE_FAILURE: + return "ASTCENC_ERR_DTRACE_FAILURE"; +#endif + default: + return nullptr; + } +} diff --git a/thirdparty/astcenc/astcenc_find_best_partitioning.cpp b/thirdparty/astcenc/astcenc_find_best_partitioning.cpp new file mode 100644 index 0000000000..ffde3c7060 --- /dev/null +++ b/thirdparty/astcenc/astcenc_find_best_partitioning.cpp @@ -0,0 +1,780 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/** + * @brief Functions for finding best partition for a block. + * + * The partition search operates in two stages. The first pass uses kmeans clustering to group + * texels into an ideal partitioning for the requested partition count, and then compares that + * against the 1024 partitionings generated by the ASTC partition hash function. The generated + * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal + * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and + * partitionings that actually generate fewer than the requested partition count, but only the top + * N candidates are actually put through a more detailed search. N is determined by the compressor + * quality preset. + * + * For the detailed search, each candidate is checked against two possible encoding methods: + * + * - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints). + * - The best partitioning assuming same chroma colors (RGB + scale endpoints). 
+ *
+ * This is implemented by computing the mean color and dominant direction for each
+ * partition. This defines two lines, both of which go through the mean color value.
+ *
+ * - One line has a direction defined by the dominant direction; this is used to assess the error
+ *   from using an uncorrelated color representation.
+ * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
+ *   (RGB + scale) color representation.
+ *
+ * The best candidate is selected by computing the squared-errors that result from using these
+ * lines for endpoint selection.
+ */
+
+#include <limits>
+#include "astcenc_internal.h"
+
+/**
+ * @brief Pick some initial kmeans cluster centers.
+ *
+ * @param blk The image block color data to compress.
+ * @param texel_count The number of texels in the block.
+ * @param partition_count The number of partitions in the block.
+ * @param[out] cluster_centers The initial partition cluster center colors.
+ */
+static void kmeans_init(
+	const image_block& blk,
+	unsigned int texel_count,
+	unsigned int partition_count,
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
+) {
+	promise(texel_count > 0);
+	promise(partition_count > 0);
+
+	unsigned int clusters_selected = 0;
+	float distances[BLOCK_MAX_TEXELS];
+
+	// Pick a random sample as first cluster center; 145897 from random.org
+	unsigned int sample = 145897 % texel_count;
+	vfloat4 center_color = blk.texel(sample);
+	cluster_centers[clusters_selected] = center_color;
+	clusters_selected++;
+
+	// Compute the distance to the first cluster center
+	float distance_sum = 0.0f;
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		vfloat4 color = blk.texel(i);
+		vfloat4 diff = color - center_color;
+		float distance = dot_s(diff * diff, blk.channel_weight);
+		distance_sum += distance;
+		distances[i] = distance;
+	}
+
+	// More numbers from random.org for weighted-random center selection
+	const float cluster_cutoffs[9] {
+		0.626220f, 0.932770f, 0.275454f,
+		0.318558f, 0.240113f, 0.009190f,
+		0.347661f, 0.731960f, 0.156391f
+	};
+
+	unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
+
+	// Pick the remaining samples as needed
+	while (true)
+	{
+		// Pick the next center in a weighted-random fashion.
+		float summa = 0.0f;
+		float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
+		for (sample = 0; sample < texel_count; sample++)
+		{
+			summa += distances[sample];
+			if (summa >= distance_cutoff)
+			{
+				break;
+			}
+		}
+
+		// Clamp to a valid range and store the selected cluster center
+		sample = astc::min(sample, texel_count - 1);
+
+		center_color = blk.texel(sample);
+		cluster_centers[clusters_selected++] = center_color;
+		if (clusters_selected >= partition_count)
+		{
+			break;
+		}
+
+		// Compute the distance to the new cluster center, keep the min dist
+		distance_sum = 0.0f;
+		for (unsigned int i = 0; i < texel_count; i++)
+		{
+			vfloat4 color = blk.texel(i);
+			vfloat4 diff = color - center_color;
+			float distance = dot_s(diff * diff, blk.channel_weight);
+			distance = astc::min(distance, distances[i]);
+			distance_sum += distance;
+			distances[i] = distance;
+		}
+	}
+}
+
+/**
+ * @brief Assign texels to clusters, based on a set of chosen center points.
+ *
+ * @param blk The image block color data to compress.
+ * @param texel_count The number of texels in the block.
+ * @param partition_count The number of partitions in the block.
+ * @param cluster_centers The partition cluster center colors.
+ * @param[out] partition_of_texel The partition assigned for each texel. + */ +static void kmeans_assign( + const image_block& blk, + unsigned int texel_count, + unsigned int partition_count, + const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], + uint8_t partition_of_texel[BLOCK_MAX_TEXELS] +) { + promise(texel_count > 0); + promise(partition_count > 0); + + uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; + + // Find the best partition for every texel + for (unsigned int i = 0; i < texel_count; i++) + { + float best_distance = std::numeric_limits<float>::max(); + unsigned int best_partition = 0; + + vfloat4 color = blk.texel(i); + for (unsigned int j = 0; j < partition_count; j++) + { + vfloat4 diff = color - cluster_centers[j]; + float distance = dot_s(diff * diff, blk.channel_weight); + if (distance < best_distance) + { + best_distance = distance; + best_partition = j; + } + } + + partition_of_texel[i] = static_cast<uint8_t>(best_partition); + partition_texel_count[best_partition]++; + } + + // It is possible to get a situation where a partition ends up without any texels. In this case, + // assign texel N to partition N. This is silly, but ensures that every partition retains at + // least one texel. Reassigning a texel in this manner may cause another partition to go empty, + // so if we actually did a reassignment, run the whole loop over again. + bool problem_case; + do + { + problem_case = false; + for (unsigned int i = 0; i < partition_count; i++) + { + if (partition_texel_count[i] == 0) + { + partition_texel_count[partition_of_texel[i]]--; + partition_texel_count[i]++; + partition_of_texel[i] = static_cast<uint8_t>(i); + problem_case = true; + } + } + } while (problem_case); +} + +/** + * @brief Compute new cluster centers based on their center of gravity. + * + * @param blk The image block color data to compress. + * @param texel_count The number of texels in the block. + * @param partition_count The number of partitions in the block. + * @param[out] cluster_centers The new cluster center colors. + * @param partition_of_texel The partition assigned for each texel. + */ +static void kmeans_update( + const image_block& blk, + unsigned int texel_count, + unsigned int partition_count, + vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS], + const uint8_t partition_of_texel[BLOCK_MAX_TEXELS] +) { + promise(texel_count > 0); + promise(partition_count > 0); + + vfloat4 color_sum[BLOCK_MAX_PARTITIONS] { + vfloat4::zero(), + vfloat4::zero(), + vfloat4::zero(), + vfloat4::zero() + }; + + uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 }; + + // Find the center-of-gravity in each cluster + for (unsigned int i = 0; i < texel_count; i++) + { + uint8_t partition = partition_of_texel[i]; + color_sum[partition] += blk.texel(i); + partition_texel_count[partition]++; + } + + // Set the center of gravity to be the new cluster center + for (unsigned int i = 0; i < partition_count; i++) + { + float scale = 1.0f / static_cast<float>(partition_texel_count[i]); + cluster_centers[i] = color_sum[i] * scale; + } +} + +/** + * @brief Compute bit-mismatch for partitioning in 2-partition mode. + * + * @param a The texel assignment bitvector for the block. + * @param b The texel assignment bitvector for the partition table. + * + * @return The number of bit mismatches. 
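+ *
+ * Partition labels are arbitrary, so the mismatch is the minimum over the
+ * possible label pairings; for two partitions only the direct and swapped
+ * pairings exist.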
+ */ +static inline unsigned int partition_mismatch2( + const uint64_t a[2], + const uint64_t b[2] +) { + int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]); + int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]); + return astc::min(v1, v2); +} + +/** + * @brief Compute bit-mismatch for partitioning in 3-partition mode. + * + * @param a The texel assignment bitvector for the block. + * @param b The texel assignment bitvector for the partition table. + * + * @return The number of bit mismatches. + */ +static inline unsigned int partition_mismatch3( + const uint64_t a[3], + const uint64_t b[3] +) { + int p00 = popcount(a[0] ^ b[0]); + int p01 = popcount(a[0] ^ b[1]); + int p02 = popcount(a[0] ^ b[2]); + + int p10 = popcount(a[1] ^ b[0]); + int p11 = popcount(a[1] ^ b[1]); + int p12 = popcount(a[1] ^ b[2]); + + int p20 = popcount(a[2] ^ b[0]); + int p21 = popcount(a[2] ^ b[1]); + int p22 = popcount(a[2] ^ b[2]); + + int s0 = p11 + p22; + int s1 = p12 + p21; + int v0 = astc::min(s0, s1) + p00; + + int s2 = p10 + p22; + int s3 = p12 + p20; + int v1 = astc::min(s2, s3) + p01; + + int s4 = p10 + p21; + int s5 = p11 + p20; + int v2 = astc::min(s4, s5) + p02; + + return astc::min(v0, v1, v2); +} + +/** + * @brief Compute bit-mismatch for partitioning in 4-partition mode. + * + * @param a The texel assignment bitvector for the block. + * @param b The texel assignment bitvector for the partition table. + * + * @return The number of bit mismatches. + */ +static inline unsigned int partition_mismatch4( + const uint64_t a[4], + const uint64_t b[4] +) { + int p00 = popcount(a[0] ^ b[0]); + int p01 = popcount(a[0] ^ b[1]); + int p02 = popcount(a[0] ^ b[2]); + int p03 = popcount(a[0] ^ b[3]); + + int p10 = popcount(a[1] ^ b[0]); + int p11 = popcount(a[1] ^ b[1]); + int p12 = popcount(a[1] ^ b[2]); + int p13 = popcount(a[1] ^ b[3]); + + int p20 = popcount(a[2] ^ b[0]); + int p21 = popcount(a[2] ^ b[1]); + int p22 = popcount(a[2] ^ b[2]); + int p23 = popcount(a[2] ^ b[3]); + + int p30 = popcount(a[3] ^ b[0]); + int p31 = popcount(a[3] ^ b[1]); + int p32 = popcount(a[3] ^ b[2]); + int p33 = popcount(a[3] ^ b[3]); + + int mx23 = astc::min(p22 + p33, p23 + p32); + int mx13 = astc::min(p21 + p33, p23 + p31); + int mx12 = astc::min(p21 + p32, p22 + p31); + int mx03 = astc::min(p20 + p33, p23 + p30); + int mx02 = astc::min(p20 + p32, p22 + p30); + int mx01 = astc::min(p21 + p30, p20 + p31); + + int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12); + int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02); + int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01); + int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12); + + return astc::min(v0, v1, v2, v3); +} + +using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*); + +/** + * @brief Count the partition table mismatches vs the data clustering. + * + * @param bsd The block size information. + * @param partition_count The number of partitions in the block. + * @param bitmaps The block texel partition assignment patterns. + * @param[out] mismatch_counts The array storing per partitioning mismatch counts. 
+ */
+static void count_partition_mismatch_bits(
+	const block_size_descriptor& bsd,
+	unsigned int partition_count,
+	const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
+	unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
+) {
+	unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
+	promise(active_count > 0);
+
+	if (partition_count == 2)
+	{
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
+		}
+	}
+	else if (partition_count == 3)
+	{
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
+		}
+	}
+	else
+	{
+		for (unsigned int i = 0; i < active_count; i++)
+		{
+			mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
+		}
+	}
+}
+
+/**
+ * @brief Use counting sort on the mismatch array to sort partition candidates.
+ *
+ * @param partitioning_count The number of packed partitionings.
+ * @param mismatch_count Partitioning mismatch counts, in index order.
+ * @param[out] partition_ordering Partition index values, in mismatch order.
+ *
+ * @return The number of active partitions in this selection.
+ */
+static unsigned int get_partition_ordering_by_mismatch_bits(
+	unsigned int partitioning_count,
+	const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
+	unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+	promise(partitioning_count > 0);
+	unsigned int mscount[256] { 0 };
+
+	// Create the histogram of mismatch counts
+	for (unsigned int i = 0; i < partitioning_count; i++)
+	{
+		mscount[mismatch_count[i]]++;
+	}
+
+	unsigned int active_count = partitioning_count - mscount[255];
+
+	// Create a running sum from the histogram array
+	// Cells store previous values only; i.e. exclude self after sum
+	unsigned int summa = 0;
+	for (unsigned int i = 0; i < 256; i++)
+	{
+		unsigned int cnt = mscount[i];
+		mscount[i] = summa;
+		summa += cnt;
+	}
+
+	// Use the running sum as the index, incrementing after read to allow
+	// sequential entries with the same count
+	for (unsigned int i = 0; i < partitioning_count; i++)
+	{
+		unsigned int idx = mscount[mismatch_count[i]]++;
+		partition_ordering[idx] = i;
+	}
+
+	return active_count;
+}
+
+/**
+ * @brief Use k-means clustering to compute a partition ordering for a block.
+ *
+ * @param bsd The block size information.
+ * @param blk The image block color data to compress.
+ * @param partition_count The desired number of partitions in the block.
+ * @param[out] partition_ordering The list of recommended partition indices, in priority order.
+ *
+ * @return The number of active partitionings in this selection.
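+ *
+ * Mismatch counts are bounded, so an O(N) counting sort over a 256-entry
+ * histogram is enough to rank the candidates; partitionings recorded with a
+ * mismatch count of 255 are excluded from the active total.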
+ */
+static unsigned int compute_kmeans_partition_ordering(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+	vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
+	uint8_t texel_partitions[BLOCK_MAX_TEXELS];
+
+	// Use three passes of k-means clustering to partition the block data
+	for (unsigned int i = 0; i < 3; i++)
+	{
+		if (i == 0)
+		{
+			kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
+		}
+		else
+		{
+			kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
+		}
+
+		kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
+	}
+
+	// Construct the block bitmaps of texel assignments to each partition
+	uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
+	unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
+	promise(texels_to_process > 0);
+	for (unsigned int i = 0; i < texels_to_process; i++)
+	{
+		unsigned int idx = bsd.kmeans_texels[i];
+		bitmaps[texel_partitions[idx]] |= 1ULL << i;
+	}
+
+	// Count the mismatch between the block and the format's partition tables
+	unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
+	count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
+
+	// Sort the partitions based on the number of mismatched bits
+	return get_partition_ordering_by_mismatch_bits(
+	    bsd.partitioning_count_selected[partition_count - 1],
+	    mismatch_counts, partition_ordering);
+}
+
+/**
+ * @brief Insert a partitioning into an ordered list of results, sorted by error.
+ *
+ * @param max_values The max number of entries in the best result arrays.
+ * @param this_error The error of the new entry.
+ * @param this_partition The partition ID of the new entry.
+ * @param[out] best_errors The array of best error values.
+ * @param[out] best_partitions The array of best partition values.
+ */
+static void insert_result(
+	unsigned int max_values,
+	float this_error,
+	unsigned int this_partition,
+	float* best_errors,
+	unsigned int* best_partitions)
+{
+	promise(max_values > 0);
+
+	// Don't bother searching if the current worst error beats the new error
+	if (this_error >= best_errors[max_values - 1])
+	{
+		return;
+	}
+
+	// Else insert into the list in error-order
+	for (unsigned int i = 0; i < max_values; i++)
+	{
+		// Existing result is better - move on ...
+		if (this_error > best_errors[i])
+		{
+			continue;
+		}
+
+		// Move existing results down one
+		for (unsigned int j = max_values - 1; j > i; j--)
+		{
+			best_errors[j] = best_errors[j - 1];
+			best_partitions[j] = best_partitions[j - 1];
+		}
+
+		// Insert new result
+		best_errors[i] = this_error;
+		best_partitions[i] = this_partition;
+		break;
+	}
+}
+
+/* See header for documentation. */
+unsigned int find_best_partition_candidates(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_search_limit,
+	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
+	unsigned int requested_candidates
+) {
+	// Constant used to estimate quantization error for a given partitioning; the optimal value for
+	// this depends on bitrate. These values have been determined empirically.
+ unsigned int texels_per_block = bsd.texel_count; + float weight_imprecision_estim = 0.055f; + if (texels_per_block <= 20) + { + weight_imprecision_estim = 0.03f; + } + else if (texels_per_block <= 31) + { + weight_imprecision_estim = 0.04f; + } + else if (texels_per_block <= 41) + { + weight_imprecision_estim = 0.05f; + } + + promise(partition_count > 0); + promise(partition_search_limit > 0); + + weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim; + + unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS]; + unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence); + partition_search_limit = astc::min(partition_search_limit, sequence_len); + requested_candidates = astc::min(partition_search_limit, requested_candidates); + + bool uses_alpha = !blk.is_constant_channel(3); + + // Partitioning errors assuming uncorrelated-chrominance endpoints + float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; + unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES]; + + // Partitioning errors assuming same-chrominance endpoints + float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES]; + unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES]; + + for (unsigned int i = 0; i < requested_candidates; i++) + { + uncor_best_errors[i] = ERROR_CALC_DEFAULT; + samec_best_errors[i] = ERROR_CALC_DEFAULT; + } + + if (uses_alpha) + { + for (unsigned int i = 0; i < partition_search_limit; i++) + { + unsigned int partition = partition_sequence[i]; + const auto& pi = bsd.get_raw_partition_info(partition_count, partition); + + // Compute weighting to give to each component in each partition + partition_metrics pms[BLOCK_MAX_PARTITIONS]; + + compute_avgs_and_dirs_4_comp(pi, blk, pms); + + line4 uncor_lines[BLOCK_MAX_PARTITIONS]; + line4 samec_lines[BLOCK_MAX_PARTITIONS]; + + processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS]; + processed_line4 samec_plines[BLOCK_MAX_PARTITIONS]; + + float uncor_line_lens[BLOCK_MAX_PARTITIONS]; + float samec_line_lens[BLOCK_MAX_PARTITIONS]; + + for (unsigned int j = 0; j < partition_count; j++) + { + partition_metrics& pm = pms[j]; + + uncor_lines[j].a = pm.avg; + uncor_lines[j].b = normalize_safe(pm.dir, unit4()); + + uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b); + uncor_plines[j].bs = uncor_lines[j].b; + + samec_lines[j].a = vfloat4::zero(); + samec_lines[j].b = normalize_safe(pm.avg, unit4()); + + samec_plines[j].amod = vfloat4::zero(); + samec_plines[j].bs = samec_lines[j].b; + } + + float uncor_error = 0.0f; + float samec_error = 0.0f; + + compute_error_squared_rgba(pi, + blk, + uncor_plines, + samec_plines, + uncor_line_lens, + samec_line_lens, + uncor_error, + samec_error); + + // Compute an estimate of error introduced by weight quantization imprecision. + // This error is computed as follows, for each partition + // 1: compute the principal-axis vector (full length) in error-space + // 2: convert the principal-axis vector to regular RGB-space + // 3: scale the vector by a constant that estimates average quantization error + // 4: for each texel, square the vector, then do a dot-product with the texel's + // error weight; sum up the results across all texels. + // 4(optimized): square the vector once, then do a dot-product with the average + // texel error, then multiply by the number of texels. 
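+			// In effect each partition contributes
+			//     texel_count * imprecision_estimate^2 * dot(axis_vector, axis_vector)
+			// to the running error total for that encoding method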
+ + for (unsigned int j = 0; j < partition_count; j++) + { + float tpp = static_cast<float>(pi.partition_texel_count[j]); + vfloat4 error_weights(tpp * weight_imprecision_estim); + + vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j]; + vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j]; + + uncor_error += dot_s(uncor_vector * uncor_vector, error_weights); + samec_error += dot_s(samec_vector * samec_vector, error_weights); + } + + insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); + insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); + } + } + else + { + for (unsigned int i = 0; i < partition_search_limit; i++) + { + unsigned int partition = partition_sequence[i]; + const auto& pi = bsd.get_raw_partition_info(partition_count, partition); + + // Compute weighting to give to each component in each partition + partition_metrics pms[BLOCK_MAX_PARTITIONS]; + compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); + + partition_lines3 plines[BLOCK_MAX_PARTITIONS]; + + for (unsigned int j = 0; j < partition_count; j++) + { + partition_metrics& pm = pms[j]; + partition_lines3& pl = plines[j]; + + pl.uncor_line.a = pm.avg; + pl.uncor_line.b = normalize_safe(pm.dir, unit3()); + + pl.samec_line.a = vfloat4::zero(); + pl.samec_line.b = normalize_safe(pm.avg, unit3()); + + pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b); + pl.uncor_pline.bs = pl.uncor_line.b; + + pl.samec_pline.amod = vfloat4::zero(); + pl.samec_pline.bs = pl.samec_line.b; + } + + float uncor_error = 0.0f; + float samec_error = 0.0f; + + compute_error_squared_rgb(pi, + blk, + plines, + uncor_error, + samec_error); + + // Compute an estimate of error introduced by weight quantization imprecision. + // This error is computed as follows, for each partition + // 1: compute the principal-axis vector (full length) in error-space + // 2: convert the principal-axis vector to regular RGB-space + // 3: scale the vector by a constant that estimates average quantization error + // 4: for each texel, square the vector, then do a dot-product with the texel's + // error weight; sum up the results across all texels. + // 4(optimized): square the vector once, then do a dot-product with the average + // texel error, then multiply by the number of texels. 
+ + for (unsigned int j = 0; j < partition_count; j++) + { + partition_lines3& pl = plines[j]; + + float tpp = static_cast<float>(pi.partition_texel_count[j]); + vfloat4 error_weights(tpp * weight_imprecision_estim); + + vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len; + vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len; + + uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights); + samec_error += dot3_s(samec_vector * samec_vector, error_weights); + } + + insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions); + insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions); + } + } + + bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0]; + + unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES]; + for (unsigned int i = 0; i < requested_candidates; i++) + { + if (best_is_uncor) + { + interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index; + interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index; + } + else + { + interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index; + interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index; + } + } + + uint64_t bitmasks[1024/64] { 0 }; + unsigned int emitted = 0; + + // Deduplicate the first "requested" entries + for (unsigned int i = 0; i < requested_candidates * 2; i++) + { + unsigned int partition = interleave[i]; + + unsigned int word = partition / 64; + unsigned int bit = partition % 64; + + bool written = bitmasks[word] & (1ull << bit); + + if (!written) + { + best_partitions[emitted] = partition; + bitmasks[word] |= 1ull << bit; + emitted++; + + if (emitted == requested_candidates) + { + break; + } + } + } + + return emitted; +} + +#endif diff --git a/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp b/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp new file mode 100644 index 0000000000..5145e08693 --- /dev/null +++ b/thirdparty/astcenc/astcenc_ideal_endpoints_and_weights.cpp @@ -0,0 +1,1663 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/** + * @brief Functions for computing color endpoints and texel weights. + */ + +#include <cassert> + +#include "astcenc_internal.h" +#include "astcenc_vecmathlib.h" + +/** + * @brief Compute the infilled weight for N texel indices in a decimated grid. + * + * @param di The weight grid decimation to use. + * @param weights The decimated weight values to use. + * @param index The first texel index to interpolate. 
+ *
+ * @return The interpolated weight for the given set of SIMD_WIDTH texels.
+ */
+static vfloat bilinear_infill_vla(
+	const decimation_info& di,
+	const float* weights,
+	unsigned int index
+) {
+	// Load the bilinear filter texel weight indexes in the decimated grid
+	vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
+	vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
+	vint weight_idx2 = vint(di.texel_weights_tr[2] + index);
+	vint weight_idx3 = vint(di.texel_weights_tr[3] + index);
+
+	// Load the bilinear filter weights from the decimated grid
+	vfloat weight_val0 = gatherf(weights, weight_idx0);
+	vfloat weight_val1 = gatherf(weights, weight_idx1);
+	vfloat weight_val2 = gatherf(weights, weight_idx2);
+	vfloat weight_val3 = gatherf(weights, weight_idx3);
+
+	// Load the weight contribution factors for each decimated weight
+	vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
+	vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
+	vfloat tex_weight_float2 = loada(di.texel_weight_contribs_float_tr[2] + index);
+	vfloat tex_weight_float3 = loada(di.texel_weight_contribs_float_tr[3] + index);
+
+	// Compute the bilinear interpolation to generate the per-texel weight
+	return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1) +
+	       (weight_val2 * tex_weight_float2 + weight_val3 * tex_weight_float3);
+}
+
+/**
+ * @brief Compute the infilled weight for N texel indices in a decimated grid.
+ *
+ * This is a specialized version which computes only two weights per texel for
+ * encodings that are only decimated in a single axis.
+ *
+ * @param di The weight grid decimation to use.
+ * @param weights The decimated weight values to use.
+ * @param index The first texel index to interpolate.
+ *
+ * @return The interpolated weight for the given set of SIMD_WIDTH texels.
+ */
+static vfloat bilinear_infill_vla_2(
+	const decimation_info& di,
+	const float* weights,
+	unsigned int index
+) {
+	// Load the bilinear filter texel weight indexes in the decimated grid
+	vint weight_idx0 = vint(di.texel_weights_tr[0] + index);
+	vint weight_idx1 = vint(di.texel_weights_tr[1] + index);
+
+	// Load the bilinear filter weights from the decimated grid
+	vfloat weight_val0 = gatherf(weights, weight_idx0);
+	vfloat weight_val1 = gatherf(weights, weight_idx1);
+
+	// Load the weight contribution factors for each decimated weight
+	vfloat tex_weight_float0 = loada(di.texel_weight_contribs_float_tr[0] + index);
+	vfloat tex_weight_float1 = loada(di.texel_weight_contribs_float_tr[1] + index);
+
+	// Compute the bilinear interpolation to generate the per-texel weight
+	return (weight_val0 * tex_weight_float0 + weight_val1 * tex_weight_float1);
+}
+
+/**
+ * @brief Compute the ideal endpoints and weights for 1 color component.
+ *
+ * @param blk The image block color data to compress.
+ * @param pi The partition info for the current trial.
+ * @param[out] ei The computed ideal endpoints and weights.
+ * @param component The color component to compute.
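+ *
+ * For a single component the ideal weights are just each texel's value
+ * remapped into a 0-1 range between the partition minimum and maximum, and
+ * those extremes become the endpoint values for the selected component.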
+ */ +static void compute_ideal_colors_and_weights_1_comp( + const image_block& blk, + const partition_info& pi, + endpoints_and_weights& ei, + unsigned int component +) { + unsigned int partition_count = pi.partition_count; + ei.ep.partition_count = partition_count; + promise(partition_count > 0); + + unsigned int texel_count = blk.texel_count; + promise(texel_count > 0); + + float error_weight; + const float* data_vr = nullptr; + + assert(component < BLOCK_MAX_COMPONENTS); + switch (component) + { + case 0: + error_weight = blk.channel_weight.lane<0>(); + data_vr = blk.data_r; + break; + case 1: + error_weight = blk.channel_weight.lane<1>(); + data_vr = blk.data_g; + break; + case 2: + error_weight = blk.channel_weight.lane<2>(); + data_vr = blk.data_b; + break; + default: + assert(component == 3); + error_weight = blk.channel_weight.lane<3>(); + data_vr = blk.data_a; + break; + } + + vmask4 sep_mask = vint4::lane_id() == vint4(component); + bool is_constant_wes { true }; + float partition0_len_sq { 0.0f }; + + for (unsigned int i = 0; i < partition_count; i++) + { + float lowvalue { 1e10f }; + float highvalue { -1e10f }; + + unsigned int partition_texel_count = pi.partition_texel_count[i]; + for (unsigned int j = 0; j < partition_texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + float value = data_vr[tix]; + lowvalue = astc::min(value, lowvalue); + highvalue = astc::max(value, highvalue); + } + + if (highvalue <= lowvalue) + { + lowvalue = 0.0f; + highvalue = 1e-7f; + } + + float length = highvalue - lowvalue; + float length_squared = length * length; + float scale = 1.0f / length; + + if (i == 0) + { + partition0_len_sq = length_squared; + } + else + { + is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; + } + + for (unsigned int j = 0; j < partition_texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + float value = (data_vr[tix] - lowvalue) * scale; + value = astc::clamp1f(value); + + ei.weights[tix] = value; + ei.weight_error_scale[tix] = length_squared * error_weight; + assert(!astc::isnan(ei.weight_error_scale[tix])); + } + + ei.ep.endpt0[i] = select(blk.data_min, vfloat4(lowvalue), sep_mask); + ei.ep.endpt1[i] = select(blk.data_max, vfloat4(highvalue), sep_mask); + } + + // Zero initialize any SIMD over-fetch + unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); + for (unsigned int i = texel_count; i < texel_count_simd; i++) + { + ei.weights[i] = 0.0f; + ei.weight_error_scale[i] = 0.0f; + } + + ei.is_constant_weight_error_scale = is_constant_wes; +} + +/** + * @brief Compute the ideal endpoints and weights for 2 color components. + * + * @param blk The image block color data to compress. + * @param pi The partition info for the current trial. + * @param[out] ei The computed ideal endpoints and weights. + * @param component1 The first color component to compute. + * @param component2 The second color component to compute. 
+ */ +static void compute_ideal_colors_and_weights_2_comp( + const image_block& blk, + const partition_info& pi, + endpoints_and_weights& ei, + int component1, + int component2 +) { + unsigned int partition_count = pi.partition_count; + ei.ep.partition_count = partition_count; + promise(partition_count > 0); + + unsigned int texel_count = blk.texel_count; + promise(texel_count > 0); + + partition_metrics pms[BLOCK_MAX_PARTITIONS]; + + float error_weight; + const float* data_vr = nullptr; + const float* data_vg = nullptr; + + if (component1 == 0 && component2 == 1) + { + error_weight = hadd_s(blk.channel_weight.swz<0, 1>()) / 2.0f; + + data_vr = blk.data_r; + data_vg = blk.data_g; + } + else if (component1 == 0 && component2 == 2) + { + error_weight = hadd_s(blk.channel_weight.swz<0, 2>()) / 2.0f; + + data_vr = blk.data_r; + data_vg = blk.data_b; + } + else // (component1 == 1 && component2 == 2) + { + assert(component1 == 1 && component2 == 2); + + error_weight = hadd_s(blk.channel_weight.swz<1, 2>()) / 2.0f; + + data_vr = blk.data_g; + data_vg = blk.data_b; + } + + compute_avgs_and_dirs_2_comp(pi, blk, component1, component2, pms); + + bool is_constant_wes { true }; + float partition0_len_sq { 0.0f }; + + vmask4 comp1_mask = vint4::lane_id() == vint4(component1); + vmask4 comp2_mask = vint4::lane_id() == vint4(component2); + + for (unsigned int i = 0; i < partition_count; i++) + { + vfloat4 dir = pms[i].dir; + if (hadd_s(dir) < 0.0f) + { + dir = vfloat4::zero() - dir; + } + + line2 line { pms[i].avg, normalize_safe(dir, unit2()) }; + float lowparam { 1e10f }; + float highparam { -1e10f }; + + unsigned int partition_texel_count = pi.partition_texel_count[i]; + for (unsigned int j = 0; j < partition_texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + vfloat4 point = vfloat2(data_vr[tix], data_vg[tix]); + float param = dot_s(point - line.a, line.b); + ei.weights[tix] = param; + + lowparam = astc::min(param, lowparam); + highparam = astc::max(param, highparam); + } + + // It is possible for a uniform-color partition to produce length=0; + // this causes NaN issues so set to small value to avoid this problem + if (highparam <= lowparam) + { + lowparam = 0.0f; + highparam = 1e-7f; + } + + float length = highparam - lowparam; + float length_squared = length * length; + float scale = 1.0f / length; + + if (i == 0) + { + partition0_len_sq = length_squared; + } + else + { + is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; + } + + for (unsigned int j = 0; j < partition_texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + float idx = (ei.weights[tix] - lowparam) * scale; + idx = astc::clamp1f(idx); + + ei.weights[tix] = idx; + ei.weight_error_scale[tix] = length_squared * error_weight; + assert(!astc::isnan(ei.weight_error_scale[tix])); + } + + vfloat4 lowvalue = line.a + line.b * lowparam; + vfloat4 highvalue = line.a + line.b * highparam; + + vfloat4 ep0 = select(blk.data_min, vfloat4(lowvalue.lane<0>()), comp1_mask); + vfloat4 ep1 = select(blk.data_max, vfloat4(highvalue.lane<0>()), comp1_mask); + + ei.ep.endpt0[i] = select(ep0, vfloat4(lowvalue.lane<1>()), comp2_mask); + ei.ep.endpt1[i] = select(ep1, vfloat4(highvalue.lane<1>()), comp2_mask); + } + + // Zero initialize any SIMD over-fetch + unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); + for (unsigned int i = texel_count; i < texel_count_simd; i++) + { + ei.weights[i] = 0.0f; + ei.weight_error_scale[i] = 0.0f; + } + + 
ei.is_constant_weight_error_scale = is_constant_wes; +} + +/** + * @brief Compute the ideal endpoints and weights for 3 color components. + * + * @param blk The image block color data to compress. + * @param pi The partition info for the current trial. + * @param[out] ei The computed ideal endpoints and weights. + * @param omitted_component The color component excluded from the calculation. + */ +static void compute_ideal_colors_and_weights_3_comp( + const image_block& blk, + const partition_info& pi, + endpoints_and_weights& ei, + unsigned int omitted_component +) { + unsigned int partition_count = pi.partition_count; + ei.ep.partition_count = partition_count; + promise(partition_count > 0); + + unsigned int texel_count = blk.texel_count; + promise(texel_count > 0); + + partition_metrics pms[BLOCK_MAX_PARTITIONS]; + + float error_weight; + const float* data_vr = nullptr; + const float* data_vg = nullptr; + const float* data_vb = nullptr; + if (omitted_component == 0) + { + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); + data_vr = blk.data_g; + data_vg = blk.data_b; + data_vb = blk.data_a; + } + else if (omitted_component == 1) + { + error_weight = hadd_s(blk.channel_weight.swz<0, 2, 3>()); + data_vr = blk.data_r; + data_vg = blk.data_b; + data_vb = blk.data_a; + } + else if (omitted_component == 2) + { + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 3>()); + data_vr = blk.data_r; + data_vg = blk.data_g; + data_vb = blk.data_a; + } + else + { + assert(omitted_component == 3); + + error_weight = hadd_s(blk.channel_weight.swz<0, 1, 2>()); + data_vr = blk.data_r; + data_vg = blk.data_g; + data_vb = blk.data_b; + } + + error_weight = error_weight * (1.0f / 3.0f); + + if (omitted_component == 3) + { + compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); + } + else + { + compute_avgs_and_dirs_3_comp(pi, blk, omitted_component, pms); + } + + bool is_constant_wes { true }; + float partition0_len_sq { 0.0f }; + + for (unsigned int i = 0; i < partition_count; i++) + { + vfloat4 dir = pms[i].dir; + if (hadd_rgb_s(dir) < 0.0f) + { + dir = vfloat4::zero() - dir; + } + + line3 line { pms[i].avg, normalize_safe(dir, unit3()) }; + float lowparam { 1e10f }; + float highparam { -1e10f }; + + unsigned int partition_texel_count = pi.partition_texel_count[i]; + for (unsigned int j = 0; j < partition_texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + vfloat4 point = vfloat3(data_vr[tix], data_vg[tix], data_vb[tix]); + float param = dot3_s(point - line.a, line.b); + ei.weights[tix] = param; + + lowparam = astc::min(param, lowparam); + highparam = astc::max(param, highparam); + } + + // It is possible for a uniform-color partition to produce length=0; + // this causes NaN issues so set to small value to avoid this problem + if (highparam <= lowparam) + { + lowparam = 0.0f; + highparam = 1e-7f; + } + + float length = highparam - lowparam; + float length_squared = length * length; + float scale = 1.0f / length; + + if (i == 0) + { + partition0_len_sq = length_squared; + } + else + { + is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; + } + + for (unsigned int j = 0; j < partition_texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + float idx = (ei.weights[tix] - lowparam) * scale; + idx = astc::clamp1f(idx); + + ei.weights[tix] = idx; + ei.weight_error_scale[tix] = length_squared * error_weight; + assert(!astc::isnan(ei.weight_error_scale[tix])); + } + + vfloat4 ep0 = line.a + line.b * lowparam; + vfloat4 ep1 = line.a + line.b * 
highparam; + + vfloat4 bmin = blk.data_min; + vfloat4 bmax = blk.data_max; + + assert(omitted_component < BLOCK_MAX_COMPONENTS); + switch (omitted_component) + { + case 0: + ei.ep.endpt0[i] = vfloat4(bmin.lane<0>(), ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>()); + ei.ep.endpt1[i] = vfloat4(bmax.lane<0>(), ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>()); + break; + case 1: + ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), bmin.lane<1>(), ep0.lane<1>(), ep0.lane<2>()); + ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), bmax.lane<1>(), ep1.lane<1>(), ep1.lane<2>()); + break; + case 2: + ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), bmin.lane<2>(), ep0.lane<2>()); + ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), bmax.lane<2>(), ep1.lane<2>()); + break; + default: + ei.ep.endpt0[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), bmin.lane<3>()); + ei.ep.endpt1[i] = vfloat4(ep1.lane<0>(), ep1.lane<1>(), ep1.lane<2>(), bmax.lane<3>()); + break; + } + } + + // Zero initialize any SIMD over-fetch + unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); + for (unsigned int i = texel_count; i < texel_count_simd; i++) + { + ei.weights[i] = 0.0f; + ei.weight_error_scale[i] = 0.0f; + } + + ei.is_constant_weight_error_scale = is_constant_wes; +} + +/** + * @brief Compute the ideal endpoints and weights for 4 color components. + * + * @param blk The image block color data to compress. + * @param pi The partition info for the current trial. + * @param[out] ei The computed ideal endpoints and weights. + */ +static void compute_ideal_colors_and_weights_4_comp( + const image_block& blk, + const partition_info& pi, + endpoints_and_weights& ei +) { + const float error_weight = hadd_s(blk.channel_weight) / 4.0f; + + unsigned int partition_count = pi.partition_count; + + unsigned int texel_count = blk.texel_count; + promise(texel_count > 0); + promise(partition_count > 0); + + partition_metrics pms[BLOCK_MAX_PARTITIONS]; + + compute_avgs_and_dirs_4_comp(pi, blk, pms); + + bool is_constant_wes { true }; + float partition0_len_sq { 0.0f }; + + for (unsigned int i = 0; i < partition_count; i++) + { + vfloat4 dir = pms[i].dir; + if (hadd_rgb_s(dir) < 0.0f) + { + dir = vfloat4::zero() - dir; + } + + line4 line { pms[i].avg, normalize_safe(dir, unit4()) }; + float lowparam { 1e10f }; + float highparam { -1e10f }; + + unsigned int partition_texel_count = pi.partition_texel_count[i]; + for (unsigned int j = 0; j < partition_texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + vfloat4 point = blk.texel(tix); + float param = dot_s(point - line.a, line.b); + ei.weights[tix] = param; + + lowparam = astc::min(param, lowparam); + highparam = astc::max(param, highparam); + } + + // It is possible for a uniform-color partition to produce length=0; + // this causes NaN issues so set to small value to avoid this problem + if (highparam <= lowparam) + { + lowparam = 0.0f; + highparam = 1e-7f; + } + + float length = highparam - lowparam; + float length_squared = length * length; + float scale = 1.0f / length; + + if (i == 0) + { + partition0_len_sq = length_squared; + } + else + { + is_constant_wes = is_constant_wes && length_squared == partition0_len_sq; + } + + ei.ep.endpt0[i] = line.a + line.b * lowparam; + ei.ep.endpt1[i] = line.a + line.b * highparam; + + for (unsigned int j = 0; j < partition_texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + float idx = (ei.weights[tix] - lowparam) * scale; + idx = astc::clamp1f(idx); + + ei.weights[tix] = idx; + 
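+			// Error significance grows with the squared length of the partition's color line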
ei.weight_error_scale[tix] = length_squared * error_weight; + assert(!astc::isnan(ei.weight_error_scale[tix])); + } + } + + // Zero initialize any SIMD over-fetch + unsigned int texel_count_simd = round_up_to_simd_multiple_vla(texel_count); + for (unsigned int i = texel_count; i < texel_count_simd; i++) + { + ei.weights[i] = 0.0f; + ei.weight_error_scale[i] = 0.0f; + } + + ei.is_constant_weight_error_scale = is_constant_wes; +} + +/* See header for documentation. */ +void compute_ideal_colors_and_weights_1plane( + const image_block& blk, + const partition_info& pi, + endpoints_and_weights& ei +) { + bool uses_alpha = !blk.is_constant_channel(3); + + if (uses_alpha) + { + compute_ideal_colors_and_weights_4_comp(blk, pi, ei); + } + else + { + compute_ideal_colors_and_weights_3_comp(blk, pi, ei, 3); + } +} + +/* See header for documentation. */ +void compute_ideal_colors_and_weights_2planes( + const block_size_descriptor& bsd, + const image_block& blk, + unsigned int plane2_component, + endpoints_and_weights& ei1, + endpoints_and_weights& ei2 +) { + const auto& pi = bsd.get_partition_info(1, 0); + bool uses_alpha = !blk.is_constant_channel(3); + + assert(plane2_component < BLOCK_MAX_COMPONENTS); + switch (plane2_component) + { + case 0: // Separate weights for red + if (uses_alpha) + { + compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 0); + } + else + { + compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 1, 2); + } + compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 0); + break; + + case 1: // Separate weights for green + if (uses_alpha) + { + compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 1); + } + else + { + compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 2); + } + compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 1); + break; + + case 2: // Separate weights for blue + if (uses_alpha) + { + compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 2); + } + else + { + compute_ideal_colors_and_weights_2_comp(blk, pi, ei1, 0, 1); + } + compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 2); + break; + + default: // Separate weights for alpha + assert(uses_alpha); + compute_ideal_colors_and_weights_3_comp(blk, pi, ei1, 3); + compute_ideal_colors_and_weights_1_comp(blk, pi, ei2, 3); + break; + } +} + +/* See header for documentation. 
*/ +float compute_error_of_weight_set_1plane( + const endpoints_and_weights& eai, + const decimation_info& di, + const float* dec_weight_quant_uvalue +) { + vfloatacc error_summav = vfloatacc::zero(); + unsigned int texel_count = di.texel_count; + promise(texel_count > 0); + + // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized + if (di.max_texel_weight_count > 2) + { + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + // Compute the bilinear interpolation of the decimated weight grid + vfloat current_values = bilinear_infill_vla(di, dec_weight_quant_uvalue, i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values = loada(eai.weights + i); + vfloat diff = current_values - actual_values; + vfloat significance = loada(eai.weight_error_scale + i); + vfloat error = diff * diff * significance; + + haccumulate(error_summav, error); + } + } + else if (di.max_texel_weight_count > 1) + { + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + // Compute the bilinear interpolation of the decimated weight grid + vfloat current_values = bilinear_infill_vla_2(di, dec_weight_quant_uvalue, i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values = loada(eai.weights + i); + vfloat diff = current_values - actual_values; + vfloat significance = loada(eai.weight_error_scale + i); + vfloat error = diff * diff * significance; + + haccumulate(error_summav, error); + } + } + else + { + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + // Load the weight set directly, without interpolation + vfloat current_values = loada(dec_weight_quant_uvalue + i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values = loada(eai.weights + i); + vfloat diff = current_values - actual_values; + vfloat significance = loada(eai.weight_error_scale + i); + vfloat error = diff * diff * significance; + + haccumulate(error_summav, error); + } + } + + // Resolve the final scalar accumulator sum + return hadd_s(error_summav); +} + +/* See header for documentation. 
*/ +float compute_error_of_weight_set_2planes( + const endpoints_and_weights& eai1, + const endpoints_and_weights& eai2, + const decimation_info& di, + const float* dec_weight_quant_uvalue_plane1, + const float* dec_weight_quant_uvalue_plane2 +) { + vfloatacc error_summav = vfloatacc::zero(); + unsigned int texel_count = di.texel_count; + promise(texel_count > 0); + + // Process SIMD-width chunks, safe to over-fetch - the extra space is zero initialized + if (di.max_texel_weight_count > 2) + { + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + // Plane 1 + // Compute the bilinear interpolation of the decimated weight grid + vfloat current_values1 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane1, i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values1 = loada(eai1.weights + i); + vfloat diff = current_values1 - actual_values1; + vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); + + // Plane 2 + // Compute the bilinear interpolation of the decimated weight grid + vfloat current_values2 = bilinear_infill_vla(di, dec_weight_quant_uvalue_plane2, i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values2 = loada(eai2.weights + i); + diff = current_values2 - actual_values2; + vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); + + haccumulate(error_summav, error1 + error2); + } + } + else if (di.max_texel_weight_count > 1) + { + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + // Plane 1 + // Compute the bilinear interpolation of the decimated weight grid + vfloat current_values1 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane1, i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values1 = loada(eai1.weights + i); + vfloat diff = current_values1 - actual_values1; + vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); + + // Plane 2 + // Compute the bilinear interpolation of the decimated weight grid + vfloat current_values2 = bilinear_infill_vla_2(di, dec_weight_quant_uvalue_plane2, i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values2 = loada(eai2.weights + i); + diff = current_values2 - actual_values2; + vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); + + haccumulate(error_summav, error1 + error2); + } + } + else + { + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + // Plane 1 + // Load the weight set directly, without interpolation + vfloat current_values1 = loada(dec_weight_quant_uvalue_plane1 + i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values1 = loada(eai1.weights + i); + vfloat diff = current_values1 - actual_values1; + vfloat error1 = diff * diff * loada(eai1.weight_error_scale + i); + + // Plane 2 + // Load the weight set directly, without interpolation + vfloat current_values2 = loada(dec_weight_quant_uvalue_plane2 + i); + + // Compute the error between the computed value and the ideal weight + vfloat actual_values2 = loada(eai2.weights + i); + diff = current_values2 - actual_values2; + vfloat error2 = diff * diff * loada(eai2.weight_error_scale + i); + + haccumulate(error_summav, error1 + error2); + } + } + + // Resolve the final scalar accumulator sum + return hadd_s(error_summav); +} + +/* See header for documentation. 
*/
+void compute_ideal_weights_for_decimation(
+	const endpoints_and_weights& ei,
+	const decimation_info& di,
+	float* dec_weight_ideal_value
+) {
+	unsigned int texel_count = di.texel_count;
+	unsigned int weight_count = di.weight_count;
+	bool is_direct = texel_count == weight_count;
+	promise(texel_count > 0);
+	promise(weight_count > 0);
+
+	// Ensure that the end of the output arrays that are used for SIMD paths later are filled so we
+	// can safely run SIMD elsewhere without a loop tail. Note that this is always safe as weight
+	// arrays always contain space for 64 elements
+	unsigned int prev_weight_count_simd = round_down_to_simd_multiple_vla(weight_count - 1);
+	storea(vfloat::zero(), dec_weight_ideal_value + prev_weight_count_simd);
+
+	// If we have a 1:1 mapping just shortcut the computation. Transfer enough to also copy the
+	// zero-initialized SIMD over-fetch region
+	if (is_direct)
+	{
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight(ei.weights + i);
+			storea(weight, dec_weight_ideal_value + i);
+		}
+
+		return;
+	}
+
+	// Otherwise compute an estimate and perform a single refinement iteration
+	alignas(ASTCENC_VECALIGN) float infilled_weights[BLOCK_MAX_TEXELS];
+
+	// Compute an initial average for each decimated weight
+	bool constant_wes = ei.is_constant_weight_error_scale;
+	vfloat weight_error_scale(ei.weight_error_scale[0]);
+
+	// This overshoots - this is OK as we initialize the array tails in the
+	// decimation table structures to safe values ...
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		// Start with a small value to avoid div-by-zero later
+		vfloat weight_weight(1e-10f);
+		vfloat initial_weight = vfloat::zero();
+
+		// Accumulate error weighting of all the texels using this weight
+		vint weight_texel_count(di.weight_texel_count + i);
+		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		promise(max_texel_count > 0);
+
+		for (unsigned int j = 0; j < max_texel_count; j++)
+		{
+			vint texel(di.weight_texels_tr[j] + i);
+			vfloat weight = loada(di.weights_texel_contribs_tr[j] + i);
+
+			if (!constant_wes)
+			{
+				weight_error_scale = gatherf(ei.weight_error_scale, texel);
+			}
+
+			vfloat contrib_weight = weight * weight_error_scale;
+
+			weight_weight += contrib_weight;
+			initial_weight += gatherf(ei.weights, texel) * contrib_weight;
+		}
+
+		storea(initial_weight / weight_weight, dec_weight_ideal_value + i);
+	}
+
+	// Populate the interpolated weight grid based on the initial average
+	// Process SIMD-width texel coordinates at a time while we can. Safe to
+	// over-process full SIMD vectors - the tail is zeroed.
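+	// A grid decimated along a single axis maps each texel to at most two
+	// stored weights, so the cheaper two-tap infill path can be used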
+	if (di.max_texel_weight_count <= 2)
+	{
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight = bilinear_infill_vla_2(di, dec_weight_ideal_value, i);
+			storea(weight, infilled_weights + i);
+		}
+	}
+	else
+	{
+		for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH)
+		{
+			vfloat weight = bilinear_infill_vla(di, dec_weight_ideal_value, i);
+			storea(weight, infilled_weights + i);
+		}
+	}
+
+	// Perform a single iteration of refinement
+	// Empirically determined step size; larger values don't help but smaller values drop image quality
+	constexpr float stepsize = 0.25f;
+	constexpr float chd_scale = -WEIGHTS_TEXEL_SUM;
+
+	for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
+	{
+		vfloat weight_val = loada(dec_weight_ideal_value + i);
+
+		// Accumulate the error change deltas for all the texels using this weight
+		// Start with a small value to avoid div-by-zero later
+		vfloat error_change0(1e-10f);
+		vfloat error_change1(0.0f);
+
+		// Accumulate error weighting of all the texels using this weight
+		vint weight_texel_count(di.weight_texel_count + i);
+		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		promise(max_texel_count > 0);
+
+		for (unsigned int j = 0; j < max_texel_count; j++)
+		{
+			vint texel(di.weight_texels_tr[j] + i);
+			vfloat contrib_weight = loada(di.weights_texel_contribs_tr[j] + i);
+
+			if (!constant_wes)
+			{
+				weight_error_scale = gatherf(ei.weight_error_scale, texel);
+			}
+
+			vfloat scale = weight_error_scale * contrib_weight;
+			vfloat old_weight = gatherf(infilled_weights, texel);
+			vfloat ideal_weight = gatherf(ei.weights, texel);
+
+			error_change0 += contrib_weight * scale;
+			error_change1 += (old_weight - ideal_weight) * scale;
+		}
+
+		vfloat step = (error_change1 * chd_scale) / error_change0;
+		step = clamp(-stepsize, stepsize, step);
+
+		// Update the weight; note this can store negative values
+		storea(weight_val + step, dec_weight_ideal_value + i);
+	}
+}
+
+/* See header for documentation. */
+void compute_quantized_weights_for_decimation(
+	const decimation_info& di,
+	float low_bound,
+	float high_bound,
+	const float* dec_weight_ideal_value,
+	float* weight_set_out,
+	uint8_t* quantized_weight_set,
+	quant_method quant_level
+) {
+	int weight_count = di.weight_count;
+	promise(weight_count > 0);
+	const quant_and_transfer_table& qat = quant_and_xfer_tables[quant_level];
+
+	// The available quant levels, stored with a minus 1 bias
+	static const float quant_levels_m1[12] {
+		1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 7.0f, 9.0f, 11.0f, 15.0f, 19.0f, 23.0f, 31.0f
+	};
+
+	vint steps_m1(get_quant_level(quant_level) - 1);
+	float quant_level_m1 = quant_levels_m1[quant_level];
+
+	// Quantize the weight set using both the specified low/high bounds and standard 0..1 bounds
+
+	// TODO: Oddity to investigate; triggered by test in issue #265.
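+	// A degenerate range would make the reciprocal scale below divide by zero,
+	// so fall back to quantizing against the full 0..1 bounds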
+ if (high_bound <= low_bound) + { + low_bound = 0.0f; + high_bound = 1.0f; + } + + float rscale = high_bound - low_bound; + float scale = 1.0f / rscale; + + float scaled_low_bound = low_bound * scale; + rscale *= 1.0f / 64.0f; + + vfloat scalev(scale); + vfloat scaled_low_boundv(scaled_low_bound); + vfloat quant_level_m1v(quant_level_m1); + vfloat rscalev(rscale); + vfloat low_boundv(low_bound); + + // This runs to the rounded-up SIMD size, which is safe as the loop tail is filled with known + // safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements + if (get_quant_level(quant_level) <= 16) + { + vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant)); + vint tab0p; + vtable_prepare(tab0, tab0p); + + for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; + ix = clampzo(ix); + + // Look up the two closest indexes and return the one that was closest + vfloat ix1 = ix * quant_level_m1v; + + vint weightl = float_to_int(ix1); + vint weighth = min(weightl + vint(1), steps_m1); + + vint ixli = vtable_8bt_32bi(tab0p, weightl); + vint ixhi = vtable_8bt_32bi(tab0p, weighth); + + vfloat ixl = int_to_float(ixli); + vfloat ixh = int_to_float(ixhi); + + vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); + vint weight = select(ixli, ixhi, mask); + ixl = select(ixl, ixh, mask); + + // Invert the weight-scaling that was done initially + storea(ixl * rscalev + low_boundv, weight_set_out + i); + vint scn = pack_low_bytes(weight); + store_nbytes(scn, quantized_weight_set + i); + } + } + else + { + vint4 tab0(reinterpret_cast<const int*>(qat.quant_to_unquant)); + vint4 tab1(reinterpret_cast<const int*>(qat.quant_to_unquant + 16)); + vint tab0p, tab1p; + vtable_prepare(tab0, tab1, tab0p, tab1p); + + for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat ix = loada(dec_weight_ideal_value + i) * scalev - scaled_low_boundv; + ix = clampzo(ix); + + // Look up the two closest indexes and return the one that was closest + vfloat ix1 = ix * quant_level_m1v; + + vint weightl = float_to_int(ix1); + vint weighth = min(weightl + vint(1), steps_m1); + + vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl); + vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth); + + vfloat ixl = int_to_float(ixli); + vfloat ixh = int_to_float(ixhi); + + vmask mask = (ixl + ixh) < (vfloat(128.0f) * ix); + vint weight = select(ixli, ixhi, mask); + ixl = select(ixl, ixh, mask); + + // Invert the weight-scaling that was done initially + storea(ixl * rscalev + low_boundv, weight_set_out + i); + vint scn = pack_low_bytes(weight); + store_nbytes(scn, quantized_weight_set + i); + } + } +} + +/** + * @brief Compute the RGB + offset for a HDR endpoint mode #7. + * + * Since the matrix needed has a regular structure we can simplify the inverse calculation. This + * gives us ~24 multiplications vs. 96 for a generic inverse. + * + * mat[0] = vfloat4(rgba_ws.x, 0.0f, 0.0f, wght_ws.x); + * mat[1] = vfloat4( 0.0f, rgba_ws.y, 0.0f, wght_ws.y); + * mat[2] = vfloat4( 0.0f, 0.0f, rgba_ws.z, wght_ws.z); + * mat[3] = vfloat4(wght_ws.x, wght_ws.y, wght_ws.z, psum); + * mat = invert(mat); + * + * @param rgba_weight_sum Sum of partition component error weights. + * @param weight_weight_sum Sum of partition component error weights * texel weight. + * @param rgbq_sum Sum of partition component error weights * texel weight * color data. + * @param psum Sum of RGB color weights * texel weight^2. 
+ */ +static inline vfloat4 compute_rgbo_vector( + vfloat4 rgba_weight_sum, + vfloat4 weight_weight_sum, + vfloat4 rgbq_sum, + float psum +) { + float X = rgba_weight_sum.lane<0>(); + float Y = rgba_weight_sum.lane<1>(); + float Z = rgba_weight_sum.lane<2>(); + float P = weight_weight_sum.lane<0>(); + float Q = weight_weight_sum.lane<1>(); + float R = weight_weight_sum.lane<2>(); + float S = psum; + + float PP = P * P; + float QQ = Q * Q; + float RR = R * R; + + float SZmRR = S * Z - RR; + float DT = SZmRR * Y - Z * QQ; + float YP = Y * P; + float QX = Q * X; + float YX = Y * X; + float mZYP = -Z * YP; + float mZQX = -Z * QX; + float mRYX = -R * YX; + float ZQP = Z * Q * P; + float RYP = R * YP; + float RQX = R * QX; + + // Compute the reciprocal of matrix determinant + float rdet = 1.0f / (DT * X + mZYP * P); + + // Actually compute the adjugate, and then apply 1/det separately + vfloat4 mat0(DT, ZQP, RYP, mZYP); + vfloat4 mat1(ZQP, SZmRR * X - Z * PP, RQX, mZQX); + vfloat4 mat2(RYP, RQX, (S * Y - QQ) * X - Y * PP, mRYX); + vfloat4 mat3(mZYP, mZQX, mRYX, Z * YX); + vfloat4 vect = rgbq_sum * rdet; + + return vfloat4(dot_s(mat0, vect), + dot_s(mat1, vect), + dot_s(mat2, vect), + dot_s(mat3, vect)); +} + +/* See header for documentation. */ +void recompute_ideal_colors_1plane( + const image_block& blk, + const partition_info& pi, + const decimation_info& di, + const uint8_t* dec_weights_uquant, + endpoints& ep, + vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS], + vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS] +) { + unsigned int weight_count = di.weight_count; + unsigned int total_texel_count = blk.texel_count; + unsigned int partition_count = pi.partition_count; + + promise(weight_count > 0); + promise(total_texel_count > 0); + promise(partition_count > 0); + + alignas(ASTCENC_VECALIGN) float dec_weight[BLOCK_MAX_WEIGHTS]; + for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) + { + vint unquant_value(dec_weights_uquant + i); + vfloat unquant_valuef = int_to_float(unquant_value) * vfloat(1.0f / 64.0f); + storea(unquant_valuef, dec_weight + i); + } + + alignas(ASTCENC_VECALIGN) float undec_weight[BLOCK_MAX_TEXELS]; + float* undec_weight_ref; + if (di.max_texel_weight_count == 1) + { + undec_weight_ref = dec_weight; + } + else if (di.max_texel_weight_count <= 2) + { + for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat weight = bilinear_infill_vla_2(di, dec_weight, i); + storea(weight, undec_weight + i); + } + + undec_weight_ref = undec_weight; + } + else + { + for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat weight = bilinear_infill_vla(di, dec_weight, i); + storea(weight, undec_weight + i); + } + + undec_weight_ref = undec_weight; + } + + vfloat4 rgba_sum(blk.data_mean * static_cast<float>(blk.texel_count)); + + for (unsigned int i = 0; i < partition_count; i++) + { + unsigned int texel_count = pi.partition_texel_count[i]; + const uint8_t *texel_indexes = pi.texels_of_partition[i]; + + // Only compute a partition mean if more than one partition + if (partition_count > 1) + { + rgba_sum = vfloat4::zero(); + promise(texel_count > 0); + for (unsigned int j = 0; j < texel_count; j++) + { + unsigned int tix = texel_indexes[j]; + rgba_sum += blk.texel(tix); + } + } + + rgba_sum = rgba_sum * blk.channel_weight; + vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f); + vfloat4 scale_dir = normalize((rgba_sum / rgba_weight_sum).swz<0, 1, 2>()); + + float scale_max = 0.0f; + float 
scale_min = 1e10f; + + float wmin1 = 1.0f; + float wmax1 = 0.0f; + + float left_sum_s = 0.0f; + float middle_sum_s = 0.0f; + float right_sum_s = 0.0f; + + vfloat4 color_vec_x = vfloat4::zero(); + vfloat4 color_vec_y = vfloat4::zero(); + + vfloat4 scale_vec = vfloat4::zero(); + + float weight_weight_sum_s = 1e-17f; + + vfloat4 color_weight = blk.channel_weight; + float ls_weight = hadd_rgb_s(color_weight); + + for (unsigned int j = 0; j < texel_count; j++) + { + unsigned int tix = texel_indexes[j]; + vfloat4 rgba = blk.texel(tix); + + float idx0 = undec_weight_ref[tix]; + + float om_idx0 = 1.0f - idx0; + wmin1 = astc::min(idx0, wmin1); + wmax1 = astc::max(idx0, wmax1); + + float scale = dot3_s(scale_dir, rgba); + scale_min = astc::min(scale, scale_min); + scale_max = astc::max(scale, scale_max); + + left_sum_s += om_idx0 * om_idx0; + middle_sum_s += om_idx0 * idx0; + right_sum_s += idx0 * idx0; + weight_weight_sum_s += idx0; + + vfloat4 color_idx(idx0); + vfloat4 cwprod = rgba; + vfloat4 cwiprod = cwprod * color_idx; + + color_vec_y += cwiprod; + color_vec_x += cwprod - cwiprod; + + scale_vec += vfloat2(om_idx0, idx0) * (scale * ls_weight); + } + + vfloat4 left_sum = vfloat4(left_sum_s) * color_weight; + vfloat4 middle_sum = vfloat4(middle_sum_s) * color_weight; + vfloat4 right_sum = vfloat4(right_sum_s) * color_weight; + vfloat4 lmrs_sum = vfloat3(left_sum_s, middle_sum_s, right_sum_s) * ls_weight; + + color_vec_x = color_vec_x * color_weight; + color_vec_y = color_vec_y * color_weight; + + // Initialize the luminance and scale vectors with a reasonable default + float scalediv = scale_min / astc::max(scale_max, 1e-10f); + scalediv = astc::clamp1f(scalediv); + + vfloat4 sds = scale_dir * scale_max; + + rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); + + if (wmin1 >= wmax1 * 0.999f) + { + // If all weights in the partition were equal, then just take average of all colors in + // the partition and use that as both endpoint colors + vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; + + vmask4 notnan_mask = avg == avg; + ep.endpt0[i] = select(ep.endpt0[i], avg, notnan_mask); + ep.endpt1[i] = select(ep.endpt1[i], avg, notnan_mask); + + rgbs_vectors[i] = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f); + } + else + { + // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given + // set of texel weights and pixel colors + vfloat4 color_det1 = (left_sum * right_sum) - (middle_sum * middle_sum); + vfloat4 color_rdet1 = 1.0f / color_det1; + + float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); + float ls_rdet1 = 1.0f / ls_det1; + + vfloat4 color_mss1 = (left_sum * left_sum) + + (2.0f * middle_sum * middle_sum) + + (right_sum * right_sum); + + float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) + + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) + + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); + + vfloat4 ep0 = (right_sum * color_vec_x - middle_sum * color_vec_y) * color_rdet1; + vfloat4 ep1 = (left_sum * color_vec_y - middle_sum * color_vec_x) * color_rdet1; + + vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f); + vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); + vmask4 full_mask = det_mask & notnan_mask; + + ep.endpt0[i] = select(ep.endpt0[i], ep0, full_mask); + ep.endpt1[i] = select(ep.endpt1[i], ep1, full_mask); + + float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; + float scale_ep1 = 
(lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; + + if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) + { + float scalediv2 = scale_ep0 / scale_ep1; + vfloat4 sdsm = scale_dir * scale_ep1; + rgbs_vectors[i] = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); + } + } + + // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR + if (blk.rgb_lns[0] || blk.alpha_lns[0]) + { + vfloat4 weight_weight_sum = vfloat4(weight_weight_sum_s) * color_weight; + float psum = right_sum_s * hadd_rgb_s(color_weight); + + vfloat4 rgbq_sum = color_vec_x + color_vec_y; + rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); + + vfloat4 rgbovec = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); + rgbo_vectors[i] = rgbovec; + + // We can get a failure due to the use of a singular (non-invertible) matrix + // If it failed, compute rgbo_vectors[] with a different method ... + if (astc::isnan(dot_s(rgbovec, rgbovec))) + { + vfloat4 v0 = ep.endpt0[i]; + vfloat4 v1 = ep.endpt1[i]; + + float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); + avgdif = astc::max(avgdif, 0.0f); + + vfloat4 avg = (v0 + v1) * 0.5f; + vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; + rgbo_vectors[i] = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); + } + } + } +} + +/* See header for documentation. */ +void recompute_ideal_colors_2planes( + const image_block& blk, + const block_size_descriptor& bsd, + const decimation_info& di, + const uint8_t* dec_weights_uquant_plane1, + const uint8_t* dec_weights_uquant_plane2, + endpoints& ep, + vfloat4& rgbs_vector, + vfloat4& rgbo_vector, + int plane2_component +) { + unsigned int weight_count = di.weight_count; + unsigned int total_texel_count = blk.texel_count; + + promise(total_texel_count > 0); + promise(weight_count > 0); + + alignas(ASTCENC_VECALIGN) float dec_weight_plane1[BLOCK_MAX_WEIGHTS_2PLANE]; + alignas(ASTCENC_VECALIGN) float dec_weight_plane2[BLOCK_MAX_WEIGHTS_2PLANE]; + + assert(weight_count <= BLOCK_MAX_WEIGHTS_2PLANE); + + for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) + { + vint unquant_value1(dec_weights_uquant_plane1 + i); + vfloat unquant_value1f = int_to_float(unquant_value1) * vfloat(1.0f / 64.0f); + storea(unquant_value1f, dec_weight_plane1 + i); + + vint unquant_value2(dec_weights_uquant_plane2 + i); + vfloat unquant_value2f = int_to_float(unquant_value2) * vfloat(1.0f / 64.0f); + storea(unquant_value2f, dec_weight_plane2 + i); + } + + alignas(ASTCENC_VECALIGN) float undec_weight_plane1[BLOCK_MAX_TEXELS]; + alignas(ASTCENC_VECALIGN) float undec_weight_plane2[BLOCK_MAX_TEXELS]; + + float* undec_weight_plane1_ref; + float* undec_weight_plane2_ref; + + if (di.max_texel_weight_count == 1) + { + undec_weight_plane1_ref = dec_weight_plane1; + undec_weight_plane2_ref = dec_weight_plane2; + } + else if (di.max_texel_weight_count <= 2) + { + for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat weight = bilinear_infill_vla_2(di, dec_weight_plane1, i); + storea(weight, undec_weight_plane1 + i); + + weight = bilinear_infill_vla_2(di, dec_weight_plane2, i); + storea(weight, undec_weight_plane2 + i); + } + + undec_weight_plane1_ref = undec_weight_plane1; + undec_weight_plane2_ref = undec_weight_plane2; + } + else + { + for (unsigned int i = 0; i < total_texel_count; i += ASTCENC_SIMD_WIDTH) + { + vfloat weight = bilinear_infill_vla(di, dec_weight_plane1, i); 
+ storea(weight, undec_weight_plane1 + i); + + weight = bilinear_infill_vla(di, dec_weight_plane2, i); + storea(weight, undec_weight_plane2 + i); + } + + undec_weight_plane1_ref = undec_weight_plane1; + undec_weight_plane2_ref = undec_weight_plane2; + } + + unsigned int texel_count = bsd.texel_count; + vfloat4 rgba_weight_sum = max(blk.channel_weight * static_cast<float>(texel_count), 1e-17f); + vfloat4 scale_dir = normalize(blk.data_mean.swz<0, 1, 2>()); + + float scale_max = 0.0f; + float scale_min = 1e10f; + + float wmin1 = 1.0f; + float wmax1 = 0.0f; + + float wmin2 = 1.0f; + float wmax2 = 0.0f; + + float left1_sum_s = 0.0f; + float middle1_sum_s = 0.0f; + float right1_sum_s = 0.0f; + + float left2_sum_s = 0.0f; + float middle2_sum_s = 0.0f; + float right2_sum_s = 0.0f; + + vfloat4 color_vec_x = vfloat4::zero(); + vfloat4 color_vec_y = vfloat4::zero(); + + vfloat4 scale_vec = vfloat4::zero(); + + vfloat4 weight_weight_sum = vfloat4(1e-17f); + + vmask4 p2_mask = vint4::lane_id() == vint4(plane2_component); + vfloat4 color_weight = blk.channel_weight; + float ls_weight = hadd_rgb_s(color_weight); + + for (unsigned int j = 0; j < texel_count; j++) + { + vfloat4 rgba = blk.texel(j); + + float idx0 = undec_weight_plane1_ref[j]; + + float om_idx0 = 1.0f - idx0; + wmin1 = astc::min(idx0, wmin1); + wmax1 = astc::max(idx0, wmax1); + + float scale = dot3_s(scale_dir, rgba); + scale_min = astc::min(scale, scale_min); + scale_max = astc::max(scale, scale_max); + + left1_sum_s += om_idx0 * om_idx0; + middle1_sum_s += om_idx0 * idx0; + right1_sum_s += idx0 * idx0; + + float idx1 = undec_weight_plane2_ref[j]; + + float om_idx1 = 1.0f - idx1; + wmin2 = astc::min(idx1, wmin2); + wmax2 = astc::max(idx1, wmax2); + + left2_sum_s += om_idx1 * om_idx1; + middle2_sum_s += om_idx1 * idx1; + right2_sum_s += idx1 * idx1; + + vfloat4 color_idx = select(vfloat4(idx0), vfloat4(idx1), p2_mask); + + vfloat4 cwprod = rgba; + vfloat4 cwiprod = cwprod * color_idx; + + color_vec_y += cwiprod; + color_vec_x += cwprod - cwiprod; + + scale_vec += vfloat2(om_idx0, idx0) * (ls_weight * scale); + weight_weight_sum += color_idx; + } + + vfloat4 left1_sum = vfloat4(left1_sum_s) * color_weight; + vfloat4 middle1_sum = vfloat4(middle1_sum_s) * color_weight; + vfloat4 right1_sum = vfloat4(right1_sum_s) * color_weight; + vfloat4 lmrs_sum = vfloat3(left1_sum_s, middle1_sum_s, right1_sum_s) * ls_weight; + + vfloat4 left2_sum = vfloat4(left2_sum_s) * color_weight; + vfloat4 middle2_sum = vfloat4(middle2_sum_s) * color_weight; + vfloat4 right2_sum = vfloat4(right2_sum_s) * color_weight; + + color_vec_x = color_vec_x * color_weight; + color_vec_y = color_vec_y * color_weight; + + // Initialize the luminance and scale vectors with a reasonable default + float scalediv = scale_min / astc::max(scale_max, 1e-10f); + scalediv = astc::clamp1f(scalediv); + + vfloat4 sds = scale_dir * scale_max; + + rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), scalediv); + + if (wmin1 >= wmax1 * 0.999f) + { + // If all weights in the partition were equal, then just take average of all colors in + // the partition and use that as both endpoint colors + vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; + + vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); + vmask4 notnan_mask = avg == avg; + vmask4 full_mask = p1_mask & notnan_mask; + + ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask); + ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask); + + rgbs_vector = vfloat4(sds.lane<0>(), sds.lane<1>(), sds.lane<2>(), 1.0f); 
+ } + else + { + // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given + // set of texel weights and pixel colors + vfloat4 color_det1 = (left1_sum * right1_sum) - (middle1_sum * middle1_sum); + vfloat4 color_rdet1 = 1.0f / color_det1; + + float ls_det1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<2>()) - (lmrs_sum.lane<1>() * lmrs_sum.lane<1>()); + float ls_rdet1 = 1.0f / ls_det1; + + vfloat4 color_mss1 = (left1_sum * left1_sum) + + (2.0f * middle1_sum * middle1_sum) + + (right1_sum * right1_sum); + + float ls_mss1 = (lmrs_sum.lane<0>() * lmrs_sum.lane<0>()) + + (2.0f * lmrs_sum.lane<1>() * lmrs_sum.lane<1>()) + + (lmrs_sum.lane<2>() * lmrs_sum.lane<2>()); + + vfloat4 ep0 = (right1_sum * color_vec_x - middle1_sum * color_vec_y) * color_rdet1; + vfloat4 ep1 = (left1_sum * color_vec_y - middle1_sum * color_vec_x) * color_rdet1; + + float scale_ep0 = (lmrs_sum.lane<2>() * scale_vec.lane<0>() - lmrs_sum.lane<1>() * scale_vec.lane<1>()) * ls_rdet1; + float scale_ep1 = (lmrs_sum.lane<0>() * scale_vec.lane<1>() - lmrs_sum.lane<1>() * scale_vec.lane<0>()) * ls_rdet1; + + vmask4 p1_mask = vint4::lane_id() != vint4(plane2_component); + vmask4 det_mask = abs(color_det1) > (color_mss1 * 1e-4f); + vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); + vmask4 full_mask = p1_mask & det_mask & notnan_mask; + + ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask); + ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); + + if (fabsf(ls_det1) > (ls_mss1 * 1e-4f) && scale_ep0 == scale_ep0 && scale_ep1 == scale_ep1 && scale_ep0 < scale_ep1) + { + float scalediv2 = scale_ep0 / scale_ep1; + vfloat4 sdsm = scale_dir * scale_ep1; + rgbs_vector = vfloat4(sdsm.lane<0>(), sdsm.lane<1>(), sdsm.lane<2>(), scalediv2); + } + } + + if (wmin2 >= wmax2 * 0.999f) + { + // If all weights in the partition were equal, then just take average of all colors in + // the partition and use that as both endpoint colors + vfloat4 avg = (color_vec_x + color_vec_y) / rgba_weight_sum; + + vmask4 notnan_mask = avg == avg; + vmask4 full_mask = p2_mask & notnan_mask; + + ep.endpt0[0] = select(ep.endpt0[0], avg, full_mask); + ep.endpt1[0] = select(ep.endpt1[0], avg, full_mask); + } + else + { + // Otherwise, complete the analytic calculation of ideal-endpoint-values for the given + // set of texel weights and pixel colors + vfloat4 color_det2 = (left2_sum * right2_sum) - (middle2_sum * middle2_sum); + vfloat4 color_rdet2 = 1.0f / color_det2; + + vfloat4 color_mss2 = (left2_sum * left2_sum) + + (2.0f * middle2_sum * middle2_sum) + + (right2_sum * right2_sum); + + vfloat4 ep0 = (right2_sum * color_vec_x - middle2_sum * color_vec_y) * color_rdet2; + vfloat4 ep1 = (left2_sum * color_vec_y - middle2_sum * color_vec_x) * color_rdet2; + + vmask4 det_mask = abs(color_det2) > (color_mss2 * 1e-4f); + vmask4 notnan_mask = (ep0 == ep0) & (ep1 == ep1); + vmask4 full_mask = p2_mask & det_mask & notnan_mask; + + ep.endpt0[0] = select(ep.endpt0[0], ep0, full_mask); + ep.endpt1[0] = select(ep.endpt1[0], ep1, full_mask); + } + + // Calculations specific to mode #7, the HDR RGB-scale mode - skip if known LDR + if (blk.rgb_lns[0] || blk.alpha_lns[0]) + { + weight_weight_sum = weight_weight_sum * color_weight; + float psum = dot3_s(select(right1_sum, right2_sum, p2_mask), color_weight); + + vfloat4 rgbq_sum = color_vec_x + color_vec_y; + rgbq_sum.set_lane<3>(hadd_rgb_s(color_vec_y)); + + rgbo_vector = compute_rgbo_vector(rgba_weight_sum, weight_weight_sum, rgbq_sum, psum); + + // We can get a failure due to the use of a singular 
(non-invertible) matrix + // If it failed, compute rgbo_vectors[] with a different method ... + if (astc::isnan(dot_s(rgbo_vector, rgbo_vector))) + { + vfloat4 v0 = ep.endpt0[0]; + vfloat4 v1 = ep.endpt1[0]; + + float avgdif = hadd_rgb_s(v1 - v0) * (1.0f / 3.0f); + avgdif = astc::max(avgdif, 0.0f); + + vfloat4 avg = (v0 + v1) * 0.5f; + vfloat4 ep0 = avg - vfloat4(avgdif) * 0.5f; + + rgbo_vector = vfloat4(ep0.lane<0>(), ep0.lane<1>(), ep0.lane<2>(), avgdif); + } + } +} + +#endif diff --git a/thirdparty/astcenc/astcenc_image.cpp b/thirdparty/astcenc/astcenc_image.cpp new file mode 100644 index 0000000000..9c0d6727d0 --- /dev/null +++ b/thirdparty/astcenc/astcenc_image.cpp @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions for creating in-memory ASTC image structures. + */ + +#include <cassert> +#include <cstring> + +#include "astcenc_internal.h" + +/** + * @brief Loader pipeline function type for data fetch from memory. + */ +using pixel_loader = vfloat4(*)(const void*, int); + +/** + * @brief Loader pipeline function type for swizzling data in a vector. + */ +using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&); + +/** + * @brief Loader pipeline function type for converting data in a vector to LNS. + */ +using pixel_converter = vfloat4(*)(vfloat4, vmask4); + +/** + * @brief Load a 8-bit UNORM texel from a data array. + * + * @param data The data pointer. + * @param base_offset The index offset to the start of the pixel. + */ +static vfloat4 load_texel_u8( + const void* data, + int base_offset +) { + const uint8_t* data8 = static_cast<const uint8_t*>(data); + return int_to_float(vint4(data8 + base_offset)) / 255.0f; +} + +/** + * @brief Load a 16-bit fp16 texel from a data array. + * + * @param data The data pointer. + * @param base_offset The index offset to the start of the pixel. + */ +static vfloat4 load_texel_f16( + const void* data, + int base_offset +) { + const uint16_t* data16 = static_cast<const uint16_t*>(data); + int r = data16[base_offset ]; + int g = data16[base_offset + 1]; + int b = data16[base_offset + 2]; + int a = data16[base_offset + 3]; + return float16_to_float(vint4(r, g, b, a)); +} + +/** + * @brief Load a 32-bit float texel from a data array. + * + * @param data The data pointer. + * @param base_offset The index offset to the start of the pixel. + */ +static vfloat4 load_texel_f32( + const void* data, + int base_offset +) { + const float* data32 = static_cast<const float*>(data); + return vfloat4(data32 + base_offset); +} + +/** + * @brief Dummy no-op swizzle function. + * + * @param data The source RGBA vector to swizzle. + * @param swz The swizzle to use. 
+ */
+static vfloat4 swz_texel_skip(
+	vfloat4 data,
+	const astcenc_swizzle& swz
+) {
+	(void)swz;
+	return data;
+}
+
+/**
+ * @brief Swizzle a texel into a new arrangement.
+ *
+ * @param data The source RGBA vector to swizzle.
+ * @param swz  The swizzle to use.
+ */
+static vfloat4 swz_texel(
+	vfloat4 data,
+	const astcenc_swizzle& swz
+) {
+	alignas(16) float datas[6];
+
+	storea(data, datas);
+	datas[ASTCENC_SWZ_0] = 0.0f;
+	datas[ASTCENC_SWZ_1] = 1.0f;
+
+	return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]);
+}
+
+/**
+ * @brief Encode a texel that is entirely LDR linear.
+ *
+ * @param data     The RGBA data to encode.
+ * @param lns_mask The mask for the HDR channels that need LNS encoding.
+ */
+static vfloat4 encode_texel_unorm(
+	vfloat4 data,
+	vmask4 lns_mask
+) {
+	(void)lns_mask;
+	return data * 65535.0f;
+}
+
+/**
+ * @brief Encode a texel that includes at least some HDR LNS texels.
+ *
+ * @param data     The RGBA data to encode.
+ * @param lns_mask The mask for the HDR channels that need LNS encoding.
+ */
+static vfloat4 encode_texel_lns(
+	vfloat4 data,
+	vmask4 lns_mask
+) {
+	vfloat4 datav_unorm = data * 65535.0f;
+	vfloat4 datav_lns = float_to_lns(data);
+	return select(datav_unorm, datav_lns, lns_mask);
+}
+
+/* See header for documentation. */
+void load_image_block(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz
+) {
+	unsigned int xsize = img.dim_x;
+	unsigned int ysize = img.dim_y;
+	unsigned int zsize = img.dim_z;
+
+	blk.xpos = xpos;
+	blk.ypos = ypos;
+	blk.zpos = zpos;
+
+	// True if any non-identity swizzle
+	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
+	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);
+
+	int idx = 0;
+
+	vfloat4 data_min(1e38f);
+	vfloat4 data_mean(0.0f);
+	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
+	vfloat4 data_max(-1e38f);
+	vmask4 grayscalev(true);
+
+	// This works because we impose the same choice everywhere during encode
+	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
+	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
+	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ?
1 : 0; + vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns); + vmask4 lns_mask = use_lns != vint4::zero(); + + // Set up the function pointers for loading pipeline as needed + pixel_loader loader = load_texel_u8; + if (img.data_type == ASTCENC_TYPE_F16) + { + loader = load_texel_f16; + } + else if (img.data_type == ASTCENC_TYPE_F32) + { + loader = load_texel_f32; + } + + pixel_swizzler swizzler = swz_texel_skip; + if (needs_swz) + { + swizzler = swz_texel; + } + + pixel_converter converter = encode_texel_unorm; + if (any(lns_mask)) + { + converter = encode_texel_lns; + } + + for (unsigned int z = 0; z < bsd.zdim; z++) + { + unsigned int zi = astc::min(zpos + z, zsize - 1); + void* plane = img.data[zi]; + + for (unsigned int y = 0; y < bsd.ydim; y++) + { + unsigned int yi = astc::min(ypos + y, ysize - 1); + + for (unsigned int x = 0; x < bsd.xdim; x++) + { + unsigned int xi = astc::min(xpos + x, xsize - 1); + + vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi)); + datav = swizzler(datav, swz); + datav = converter(datav, lns_mask); + + // Compute block metadata + data_min = min(data_min, datav); + data_mean += datav * data_mean_scale; + data_max = max(data_max, datav); + + grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); + + blk.data_r[idx] = datav.lane<0>(); + blk.data_g[idx] = datav.lane<1>(); + blk.data_b[idx] = datav.lane<2>(); + blk.data_a[idx] = datav.lane<3>(); + + blk.rgb_lns[idx] = rgb_lns; + blk.alpha_lns[idx] = a_lns; + + idx++; + } + } + } + + // Reverse the encoding so we store origin block in the original format + vfloat4 data_enc = blk.texel(0); + vfloat4 data_enc_unorm = data_enc / 65535.0f; + vfloat4 data_enc_lns = vfloat4::zero(); + + if (rgb_lns || a_lns) + { + data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc))); + } + + blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask); + + // Store block metadata + blk.data_min = data_min; + blk.data_mean = data_mean; + blk.data_max = data_max; + blk.grayscale = all(grayscalev); +} + +/* See header for documentation. 
*/ +void load_image_block_fast_ldr( + astcenc_profile decode_mode, + const astcenc_image& img, + image_block& blk, + const block_size_descriptor& bsd, + unsigned int xpos, + unsigned int ypos, + unsigned int zpos, + const astcenc_swizzle& swz +) { + (void)swz; + (void)decode_mode; + + unsigned int xsize = img.dim_x; + unsigned int ysize = img.dim_y; + + blk.xpos = xpos; + blk.ypos = ypos; + blk.zpos = zpos; + + vfloat4 data_min(1e38f); + vfloat4 data_mean = vfloat4::zero(); + vfloat4 data_max(-1e38f); + vmask4 grayscalev(true); + int idx = 0; + + const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]); + for (unsigned int y = ypos; y < ypos + bsd.ydim; y++) + { + unsigned int yi = astc::min(y, ysize - 1); + + for (unsigned int x = xpos; x < xpos + bsd.xdim; x++) + { + unsigned int xi = astc::min(x, xsize - 1); + + vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi)); + vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f); + + // Compute block metadata + data_min = min(data_min, datav); + data_mean += datav; + data_max = max(data_max, datav); + + grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>()); + + blk.data_r[idx] = datav.lane<0>(); + blk.data_g[idx] = datav.lane<1>(); + blk.data_b[idx] = datav.lane<2>(); + blk.data_a[idx] = datav.lane<3>(); + + idx++; + } + } + + // Reverse the encoding so we store origin block in the original format + blk.origin_texel = blk.texel(0) / 65535.0f; + + // Store block metadata + blk.rgb_lns[0] = 0; + blk.alpha_lns[0] = 0; + blk.data_min = data_min; + blk.data_mean = data_mean / static_cast<float>(bsd.texel_count); + blk.data_max = data_max; + blk.grayscale = all(grayscalev); +} + +/* See header for documentation. */ +void store_image_block( + astcenc_image& img, + const image_block& blk, + const block_size_descriptor& bsd, + unsigned int xpos, + unsigned int ypos, + unsigned int zpos, + const astcenc_swizzle& swz +) { + unsigned int x_size = img.dim_x; + unsigned int x_start = xpos; + unsigned int x_end = astc::min(x_size, xpos + bsd.xdim); + unsigned int x_count = x_end - x_start; + unsigned int x_nudge = bsd.xdim - x_count; + + unsigned int y_size = img.dim_y; + unsigned int y_start = ypos; + unsigned int y_end = astc::min(y_size, ypos + bsd.ydim); + unsigned int y_count = y_end - y_start; + unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim; + + unsigned int z_size = img.dim_z; + unsigned int z_start = zpos; + unsigned int z_end = astc::min(z_size, zpos + bsd.zdim); + + // True if any non-identity swizzle + bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) || + (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A); + + // True if any swizzle uses Z reconstruct + bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) || + (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z); + + int idx = 0; + if (img.data_type == ASTCENC_TYPE_U8) + { + for (unsigned int z = z_start; z < z_end; z++) + { + // Fetch the image plane + uint8_t* data8 = static_cast<uint8_t*>(img.data[z]); + + for (unsigned int y = y_start; y < y_end; y++) + { + uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start); + + for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH) + { + unsigned int max_texels = ASTCENC_SIMD_WIDTH; + unsigned int used_texels = astc::min(x_count - x, max_texels); + + // Unaligned load as rows are not always SIMD_WIDTH long + vfloat data_r(blk.data_r + idx); + vfloat data_g(blk.data_g + idx); + vfloat data_b(blk.data_b + idx); + vfloat data_a(blk.data_a + idx); + + vint data_ri 
= float_to_int_rtn(min(data_r, 1.0f) * 255.0f); + vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f); + vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f); + vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f); + + if (needs_swz) + { + vint swizzle_table[7]; + swizzle_table[ASTCENC_SWZ_0] = vint(0); + swizzle_table[ASTCENC_SWZ_1] = vint(255); + swizzle_table[ASTCENC_SWZ_R] = data_ri; + swizzle_table[ASTCENC_SWZ_G] = data_gi; + swizzle_table[ASTCENC_SWZ_B] = data_bi; + swizzle_table[ASTCENC_SWZ_A] = data_ai; + + if (needs_z) + { + vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f); + vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f); + vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y); + data_z = max(data_z, 0.0f); + data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f); + + swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f); + } + + data_ri = swizzle_table[swz.r]; + data_gi = swizzle_table[swz.g]; + data_bi = swizzle_table[swz.b]; + data_ai = swizzle_table[swz.a]; + } + + // Errors are NaN encoded - convert to magenta error color + // Branch is OK here - it is almost never true so predicts well + vmask nan_mask = data_r != data_r; + if (any(nan_mask)) + { + data_ri = select(data_ri, vint(0xFF), nan_mask); + data_gi = select(data_gi, vint(0x00), nan_mask); + data_bi = select(data_bi, vint(0xFF), nan_mask); + data_ai = select(data_ai, vint(0xFF), nan_mask); + } + + vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai); + vmask store_mask = vint::lane_id() < vint(used_texels); + store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask); + + data8_row += ASTCENC_SIMD_WIDTH * 4; + idx += used_texels; + } + idx += x_nudge; + } + idx += y_nudge; + } + } + else if (img.data_type == ASTCENC_TYPE_F16) + { + for (unsigned int z = z_start; z < z_end; z++) + { + // Fetch the image plane + uint16_t* data16 = static_cast<uint16_t*>(img.data[z]); + + for (unsigned int y = y_start; y < y_end; y++) + { + uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start); + + for (unsigned int x = 0; x < x_count; x++) + { + vint4 color; + + // NaNs are handled inline - no need to special case + if (needs_swz) + { + float data[7]; + data[ASTCENC_SWZ_0] = 0.0f; + data[ASTCENC_SWZ_1] = 1.0f; + data[ASTCENC_SWZ_R] = blk.data_r[idx]; + data[ASTCENC_SWZ_G] = blk.data_g[idx]; + data[ASTCENC_SWZ_B] = blk.data_b[idx]; + data[ASTCENC_SWZ_A] = blk.data_a[idx]; + + if (needs_z) + { + float xN = (data[0] * 2.0f) - 1.0f; + float yN = (data[3] * 2.0f) - 1.0f; + float zN = 1.0f - xN * xN - yN * yN; + if (zN < 0.0f) + { + zN = 0.0f; + } + data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f; + } + + vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); + color = float_to_float16(colorf); + } + else + { + vfloat4 colorf = blk.texel(idx); + color = float_to_float16(colorf); + } + + // TODO: Vectorize with store N shorts? 
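+ // For now each texel is written back with four scalar 16-bit stores; the + // vint4 from float_to_float16() holds one FP16 bit pattern per lane.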
+ data16_row[0] = static_cast<uint16_t>(color.lane<0>()); + data16_row[1] = static_cast<uint16_t>(color.lane<1>()); + data16_row[2] = static_cast<uint16_t>(color.lane<2>()); + data16_row[3] = static_cast<uint16_t>(color.lane<3>()); + data16_row += 4; + idx++; + } + idx += x_nudge; + } + idx += y_nudge; + } + } + else // if (img.data_type == ASTCENC_TYPE_F32) + { + assert(img.data_type == ASTCENC_TYPE_F32); + + for (unsigned int z = z_start; z < z_end; z++) + { + // Fetch the image plane + float* data32 = static_cast<float*>(img.data[z]); + + for (unsigned int y = y_start; y < y_end; y++) + { + float* data32_row = data32 + (4 * x_size * y) + (4 * x_start); + + for (unsigned int x = 0; x < x_count; x++) + { + vfloat4 color = blk.texel(idx); + + // NaNs are handled inline - no need to special case + if (needs_swz) + { + float data[7]; + data[ASTCENC_SWZ_0] = 0.0f; + data[ASTCENC_SWZ_1] = 1.0f; + data[ASTCENC_SWZ_R] = color.lane<0>(); + data[ASTCENC_SWZ_G] = color.lane<1>(); + data[ASTCENC_SWZ_B] = color.lane<2>(); + data[ASTCENC_SWZ_A] = color.lane<3>(); + + if (needs_z) + { + float xN = (data[0] * 2.0f) - 1.0f; + float yN = (data[3] * 2.0f) - 1.0f; + float zN = 1.0f - xN * xN - yN * yN; + if (zN < 0.0f) + { + zN = 0.0f; + } + data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f; + } + + color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]); + } + + store(color, data32_row); + data32_row += 4; + idx++; + } + idx += x_nudge; + } + idx += y_nudge; + } + } +} diff --git a/thirdparty/astcenc/astcenc_integer_sequence.cpp b/thirdparty/astcenc/astcenc_integer_sequence.cpp new file mode 100644 index 0000000000..416750374d --- /dev/null +++ b/thirdparty/astcenc/astcenc_integer_sequence.cpp @@ -0,0 +1,739 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions for encoding/decoding Bounded Integer Sequence Encoding. + */ + +#include "astcenc_internal.h" + +#include <array> + +/** @brief Unpacked quint triplets <low,middle,high> for each packed value */ +// TODO: Bitpack these into a uint16_t? 
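+// For example, the packed value 9 unpacks to the quint triplet <1, 1, 0>; +// integer_of_quints[0][1][1] below maps that triplet back to 9.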
+static const uint8_t quints_of_integer[128][3] { + {0, 0, 0}, {1, 0, 0}, {2, 0, 0}, {3, 0, 0}, + {4, 0, 0}, {0, 4, 0}, {4, 4, 0}, {4, 4, 4}, + {0, 1, 0}, {1, 1, 0}, {2, 1, 0}, {3, 1, 0}, + {4, 1, 0}, {1, 4, 0}, {4, 4, 1}, {4, 4, 4}, + {0, 2, 0}, {1, 2, 0}, {2, 2, 0}, {3, 2, 0}, + {4, 2, 0}, {2, 4, 0}, {4, 4, 2}, {4, 4, 4}, + {0, 3, 0}, {1, 3, 0}, {2, 3, 0}, {3, 3, 0}, + {4, 3, 0}, {3, 4, 0}, {4, 4, 3}, {4, 4, 4}, + {0, 0, 1}, {1, 0, 1}, {2, 0, 1}, {3, 0, 1}, + {4, 0, 1}, {0, 4, 1}, {4, 0, 4}, {0, 4, 4}, + {0, 1, 1}, {1, 1, 1}, {2, 1, 1}, {3, 1, 1}, + {4, 1, 1}, {1, 4, 1}, {4, 1, 4}, {1, 4, 4}, + {0, 2, 1}, {1, 2, 1}, {2, 2, 1}, {3, 2, 1}, + {4, 2, 1}, {2, 4, 1}, {4, 2, 4}, {2, 4, 4}, + {0, 3, 1}, {1, 3, 1}, {2, 3, 1}, {3, 3, 1}, + {4, 3, 1}, {3, 4, 1}, {4, 3, 4}, {3, 4, 4}, + {0, 0, 2}, {1, 0, 2}, {2, 0, 2}, {3, 0, 2}, + {4, 0, 2}, {0, 4, 2}, {2, 0, 4}, {3, 0, 4}, + {0, 1, 2}, {1, 1, 2}, {2, 1, 2}, {3, 1, 2}, + {4, 1, 2}, {1, 4, 2}, {2, 1, 4}, {3, 1, 4}, + {0, 2, 2}, {1, 2, 2}, {2, 2, 2}, {3, 2, 2}, + {4, 2, 2}, {2, 4, 2}, {2, 2, 4}, {3, 2, 4}, + {0, 3, 2}, {1, 3, 2}, {2, 3, 2}, {3, 3, 2}, + {4, 3, 2}, {3, 4, 2}, {2, 3, 4}, {3, 3, 4}, + {0, 0, 3}, {1, 0, 3}, {2, 0, 3}, {3, 0, 3}, + {4, 0, 3}, {0, 4, 3}, {0, 0, 4}, {1, 0, 4}, + {0, 1, 3}, {1, 1, 3}, {2, 1, 3}, {3, 1, 3}, + {4, 1, 3}, {1, 4, 3}, {0, 1, 4}, {1, 1, 4}, + {0, 2, 3}, {1, 2, 3}, {2, 2, 3}, {3, 2, 3}, + {4, 2, 3}, {2, 4, 3}, {0, 2, 4}, {1, 2, 4}, + {0, 3, 3}, {1, 3, 3}, {2, 3, 3}, {3, 3, 3}, + {4, 3, 3}, {3, 4, 3}, {0, 3, 4}, {1, 3, 4} +}; + +/** @brief Packed quint values for each unpacked value, indexed [hi][mid][lo]. */ +static const uint8_t integer_of_quints[5][5][5] { + { + {0, 1, 2, 3, 4}, + {8, 9, 10, 11, 12}, + {16, 17, 18, 19, 20}, + {24, 25, 26, 27, 28}, + {5, 13, 21, 29, 6} + }, + { + {32, 33, 34, 35, 36}, + {40, 41, 42, 43, 44}, + {48, 49, 50, 51, 52}, + {56, 57, 58, 59, 60}, + {37, 45, 53, 61, 14} + }, + { + {64, 65, 66, 67, 68}, + {72, 73, 74, 75, 76}, + {80, 81, 82, 83, 84}, + {88, 89, 90, 91, 92}, + {69, 77, 85, 93, 22} + }, + { + {96, 97, 98, 99, 100}, + {104, 105, 106, 107, 108}, + {112, 113, 114, 115, 116}, + {120, 121, 122, 123, 124}, + {101, 109, 117, 125, 30} + }, + { + {102, 103, 70, 71, 38}, + {110, 111, 78, 79, 46}, + {118, 119, 86, 87, 54}, + {126, 127, 94, 95, 62}, + {39, 47, 55, 63, 31} + } +}; + +/** @brief Unpacked trit quintuplets <low,...,high> for each packed value */ +// TODO: Bitpack these into a uint16_t? 
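+// For example, the packed value 5 unpacks to the trit quintuplet <1, 1, 0, 0, 0>; +// integer_of_trits[0][0][0][1][1] below maps that quintuplet back to 5.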
+static const uint8_t trits_of_integer[256][5] { + {0, 0, 0, 0, 0}, {1, 0, 0, 0, 0}, {2, 0, 0, 0, 0}, {0, 0, 2, 0, 0}, + {0, 1, 0, 0, 0}, {1, 1, 0, 0, 0}, {2, 1, 0, 0, 0}, {1, 0, 2, 0, 0}, + {0, 2, 0, 0, 0}, {1, 2, 0, 0, 0}, {2, 2, 0, 0, 0}, {2, 0, 2, 0, 0}, + {0, 2, 2, 0, 0}, {1, 2, 2, 0, 0}, {2, 2, 2, 0, 0}, {2, 0, 2, 0, 0}, + {0, 0, 1, 0, 0}, {1, 0, 1, 0, 0}, {2, 0, 1, 0, 0}, {0, 1, 2, 0, 0}, + {0, 1, 1, 0, 0}, {1, 1, 1, 0, 0}, {2, 1, 1, 0, 0}, {1, 1, 2, 0, 0}, + {0, 2, 1, 0, 0}, {1, 2, 1, 0, 0}, {2, 2, 1, 0, 0}, {2, 1, 2, 0, 0}, + {0, 0, 0, 2, 2}, {1, 0, 0, 2, 2}, {2, 0, 0, 2, 2}, {0, 0, 2, 2, 2}, + {0, 0, 0, 1, 0}, {1, 0, 0, 1, 0}, {2, 0, 0, 1, 0}, {0, 0, 2, 1, 0}, + {0, 1, 0, 1, 0}, {1, 1, 0, 1, 0}, {2, 1, 0, 1, 0}, {1, 0, 2, 1, 0}, + {0, 2, 0, 1, 0}, {1, 2, 0, 1, 0}, {2, 2, 0, 1, 0}, {2, 0, 2, 1, 0}, + {0, 2, 2, 1, 0}, {1, 2, 2, 1, 0}, {2, 2, 2, 1, 0}, {2, 0, 2, 1, 0}, + {0, 0, 1, 1, 0}, {1, 0, 1, 1, 0}, {2, 0, 1, 1, 0}, {0, 1, 2, 1, 0}, + {0, 1, 1, 1, 0}, {1, 1, 1, 1, 0}, {2, 1, 1, 1, 0}, {1, 1, 2, 1, 0}, + {0, 2, 1, 1, 0}, {1, 2, 1, 1, 0}, {2, 2, 1, 1, 0}, {2, 1, 2, 1, 0}, + {0, 1, 0, 2, 2}, {1, 1, 0, 2, 2}, {2, 1, 0, 2, 2}, {1, 0, 2, 2, 2}, + {0, 0, 0, 2, 0}, {1, 0, 0, 2, 0}, {2, 0, 0, 2, 0}, {0, 0, 2, 2, 0}, + {0, 1, 0, 2, 0}, {1, 1, 0, 2, 0}, {2, 1, 0, 2, 0}, {1, 0, 2, 2, 0}, + {0, 2, 0, 2, 0}, {1, 2, 0, 2, 0}, {2, 2, 0, 2, 0}, {2, 0, 2, 2, 0}, + {0, 2, 2, 2, 0}, {1, 2, 2, 2, 0}, {2, 2, 2, 2, 0}, {2, 0, 2, 2, 0}, + {0, 0, 1, 2, 0}, {1, 0, 1, 2, 0}, {2, 0, 1, 2, 0}, {0, 1, 2, 2, 0}, + {0, 1, 1, 2, 0}, {1, 1, 1, 2, 0}, {2, 1, 1, 2, 0}, {1, 1, 2, 2, 0}, + {0, 2, 1, 2, 0}, {1, 2, 1, 2, 0}, {2, 2, 1, 2, 0}, {2, 1, 2, 2, 0}, + {0, 2, 0, 2, 2}, {1, 2, 0, 2, 2}, {2, 2, 0, 2, 2}, {2, 0, 2, 2, 2}, + {0, 0, 0, 0, 2}, {1, 0, 0, 0, 2}, {2, 0, 0, 0, 2}, {0, 0, 2, 0, 2}, + {0, 1, 0, 0, 2}, {1, 1, 0, 0, 2}, {2, 1, 0, 0, 2}, {1, 0, 2, 0, 2}, + {0, 2, 0, 0, 2}, {1, 2, 0, 0, 2}, {2, 2, 0, 0, 2}, {2, 0, 2, 0, 2}, + {0, 2, 2, 0, 2}, {1, 2, 2, 0, 2}, {2, 2, 2, 0, 2}, {2, 0, 2, 0, 2}, + {0, 0, 1, 0, 2}, {1, 0, 1, 0, 2}, {2, 0, 1, 0, 2}, {0, 1, 2, 0, 2}, + {0, 1, 1, 0, 2}, {1, 1, 1, 0, 2}, {2, 1, 1, 0, 2}, {1, 1, 2, 0, 2}, + {0, 2, 1, 0, 2}, {1, 2, 1, 0, 2}, {2, 2, 1, 0, 2}, {2, 1, 2, 0, 2}, + {0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 0, 2, 2, 2}, + {0, 0, 0, 0, 1}, {1, 0, 0, 0, 1}, {2, 0, 0, 0, 1}, {0, 0, 2, 0, 1}, + {0, 1, 0, 0, 1}, {1, 1, 0, 0, 1}, {2, 1, 0, 0, 1}, {1, 0, 2, 0, 1}, + {0, 2, 0, 0, 1}, {1, 2, 0, 0, 1}, {2, 2, 0, 0, 1}, {2, 0, 2, 0, 1}, + {0, 2, 2, 0, 1}, {1, 2, 2, 0, 1}, {2, 2, 2, 0, 1}, {2, 0, 2, 0, 1}, + {0, 0, 1, 0, 1}, {1, 0, 1, 0, 1}, {2, 0, 1, 0, 1}, {0, 1, 2, 0, 1}, + {0, 1, 1, 0, 1}, {1, 1, 1, 0, 1}, {2, 1, 1, 0, 1}, {1, 1, 2, 0, 1}, + {0, 2, 1, 0, 1}, {1, 2, 1, 0, 1}, {2, 2, 1, 0, 1}, {2, 1, 2, 0, 1}, + {0, 0, 1, 2, 2}, {1, 0, 1, 2, 2}, {2, 0, 1, 2, 2}, {0, 1, 2, 2, 2}, + {0, 0, 0, 1, 1}, {1, 0, 0, 1, 1}, {2, 0, 0, 1, 1}, {0, 0, 2, 1, 1}, + {0, 1, 0, 1, 1}, {1, 1, 0, 1, 1}, {2, 1, 0, 1, 1}, {1, 0, 2, 1, 1}, + {0, 2, 0, 1, 1}, {1, 2, 0, 1, 1}, {2, 2, 0, 1, 1}, {2, 0, 2, 1, 1}, + {0, 2, 2, 1, 1}, {1, 2, 2, 1, 1}, {2, 2, 2, 1, 1}, {2, 0, 2, 1, 1}, + {0, 0, 1, 1, 1}, {1, 0, 1, 1, 1}, {2, 0, 1, 1, 1}, {0, 1, 2, 1, 1}, + {0, 1, 1, 1, 1}, {1, 1, 1, 1, 1}, {2, 1, 1, 1, 1}, {1, 1, 2, 1, 1}, + {0, 2, 1, 1, 1}, {1, 2, 1, 1, 1}, {2, 2, 1, 1, 1}, {2, 1, 2, 1, 1}, + {0, 1, 1, 2, 2}, {1, 1, 1, 2, 2}, {2, 1, 1, 2, 2}, {1, 1, 2, 2, 2}, + {0, 0, 0, 2, 1}, {1, 0, 0, 2, 1}, {2, 0, 0, 2, 1}, {0, 0, 2, 2, 1}, + {0, 1, 0, 2, 1}, {1, 1, 0, 2, 1}, {2, 1, 0, 2, 1}, {1, 0, 2, 2, 1}, + {0, 
2, 0, 2, 1}, {1, 2, 0, 2, 1}, {2, 2, 0, 2, 1}, {2, 0, 2, 2, 1}, + {0, 2, 2, 2, 1}, {1, 2, 2, 2, 1}, {2, 2, 2, 2, 1}, {2, 0, 2, 2, 1}, + {0, 0, 1, 2, 1}, {1, 0, 1, 2, 1}, {2, 0, 1, 2, 1}, {0, 1, 2, 2, 1}, + {0, 1, 1, 2, 1}, {1, 1, 1, 2, 1}, {2, 1, 1, 2, 1}, {1, 1, 2, 2, 1}, + {0, 2, 1, 2, 1}, {1, 2, 1, 2, 1}, {2, 2, 1, 2, 1}, {2, 1, 2, 2, 1}, + {0, 2, 1, 2, 2}, {1, 2, 1, 2, 2}, {2, 2, 1, 2, 2}, {2, 1, 2, 2, 2}, + {0, 0, 0, 1, 2}, {1, 0, 0, 1, 2}, {2, 0, 0, 1, 2}, {0, 0, 2, 1, 2}, + {0, 1, 0, 1, 2}, {1, 1, 0, 1, 2}, {2, 1, 0, 1, 2}, {1, 0, 2, 1, 2}, + {0, 2, 0, 1, 2}, {1, 2, 0, 1, 2}, {2, 2, 0, 1, 2}, {2, 0, 2, 1, 2}, + {0, 2, 2, 1, 2}, {1, 2, 2, 1, 2}, {2, 2, 2, 1, 2}, {2, 0, 2, 1, 2}, + {0, 0, 1, 1, 2}, {1, 0, 1, 1, 2}, {2, 0, 1, 1, 2}, {0, 1, 2, 1, 2}, + {0, 1, 1, 1, 2}, {1, 1, 1, 1, 2}, {2, 1, 1, 1, 2}, {1, 1, 2, 1, 2}, + {0, 2, 1, 1, 2}, {1, 2, 1, 1, 2}, {2, 2, 1, 1, 2}, {2, 1, 2, 1, 2}, + {0, 2, 2, 2, 2}, {1, 2, 2, 2, 2}, {2, 2, 2, 2, 2}, {2, 1, 2, 2, 2} +}; + +/** @brief Packed trit values for each unpacked value, indexed [hi][][][][lo]. */ +static const uint8_t integer_of_trits[3][3][3][3][3] { + { + { + { + {0, 1, 2}, + {4, 5, 6}, + {8, 9, 10} + }, + { + {16, 17, 18}, + {20, 21, 22}, + {24, 25, 26} + }, + { + {3, 7, 15}, + {19, 23, 27}, + {12, 13, 14} + } + }, + { + { + {32, 33, 34}, + {36, 37, 38}, + {40, 41, 42} + }, + { + {48, 49, 50}, + {52, 53, 54}, + {56, 57, 58} + }, + { + {35, 39, 47}, + {51, 55, 59}, + {44, 45, 46} + } + }, + { + { + {64, 65, 66}, + {68, 69, 70}, + {72, 73, 74} + }, + { + {80, 81, 82}, + {84, 85, 86}, + {88, 89, 90} + }, + { + {67, 71, 79}, + {83, 87, 91}, + {76, 77, 78} + } + } + }, + { + { + { + {128, 129, 130}, + {132, 133, 134}, + {136, 137, 138} + }, + { + {144, 145, 146}, + {148, 149, 150}, + {152, 153, 154} + }, + { + {131, 135, 143}, + {147, 151, 155}, + {140, 141, 142} + } + }, + { + { + {160, 161, 162}, + {164, 165, 166}, + {168, 169, 170} + }, + { + {176, 177, 178}, + {180, 181, 182}, + {184, 185, 186} + }, + { + {163, 167, 175}, + {179, 183, 187}, + {172, 173, 174} + } + }, + { + { + {192, 193, 194}, + {196, 197, 198}, + {200, 201, 202} + }, + { + {208, 209, 210}, + {212, 213, 214}, + {216, 217, 218} + }, + { + {195, 199, 207}, + {211, 215, 219}, + {204, 205, 206} + } + } + }, + { + { + { + {96, 97, 98}, + {100, 101, 102}, + {104, 105, 106} + }, + { + {112, 113, 114}, + {116, 117, 118}, + {120, 121, 122} + }, + { + {99, 103, 111}, + {115, 119, 123}, + {108, 109, 110} + } + }, + { + { + {224, 225, 226}, + {228, 229, 230}, + {232, 233, 234} + }, + { + {240, 241, 242}, + {244, 245, 246}, + {248, 249, 250} + }, + { + {227, 231, 239}, + {243, 247, 251}, + {236, 237, 238} + } + }, + { + { + {28, 29, 30}, + {60, 61, 62}, + {92, 93, 94} + }, + { + {156, 157, 158}, + {188, 189, 190}, + {220, 221, 222} + }, + { + {31, 63, 127}, + {159, 191, 255}, + {252, 253, 254} + } + } + } +}; + +/** + * @brief The number of bits, trits, and quints needed for a quant level. + */ +struct btq_count +{ + /** @brief The number of bits. */ + uint8_t bits:6; + + /** @brief The number of trits. */ + uint8_t trits:1; + + /** @brief The number of quints. */ + uint8_t quints:1; +}; + +/** + * @brief The table of bits, trits, and quints needed for a quant encode. 
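+ * + * For example, QUANT_12 uses two bits plus one trit per value (3 * 2^2 = 12 + * levels), and QUANT_20 uses two bits plus one quint (5 * 2^2 = 20 levels).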
+ */ +static const std::array<btq_count, 21> btq_counts {{ + { 1, 0, 0 }, // QUANT_2 + { 0, 1, 0 }, // QUANT_3 + { 2, 0, 0 }, // QUANT_4 + { 0, 0, 1 }, // QUANT_5 + { 1, 1, 0 }, // QUANT_6 + { 3, 0, 0 }, // QUANT_8 + { 1, 0, 1 }, // QUANT_10 + { 2, 1, 0 }, // QUANT_12 + { 4, 0, 0 }, // QUANT_16 + { 2, 0, 1 }, // QUANT_20 + { 3, 1, 0 }, // QUANT_24 + { 5, 0, 0 }, // QUANT_32 + { 3, 0, 1 }, // QUANT_40 + { 4, 1, 0 }, // QUANT_48 + { 6, 0, 0 }, // QUANT_64 + { 4, 0, 1 }, // QUANT_80 + { 5, 1, 0 }, // QUANT_96 + { 7, 0, 0 }, // QUANT_128 + { 5, 0, 1 }, // QUANT_160 + { 6, 1, 0 }, // QUANT_192 + { 8, 0, 0 } // QUANT_256 +}}; + +/** + * @brief The sequence scale, round, and divisors needed to compute sizing. + * + * The length of a quantized sequence in bits is: + * (scale * <sequence_len> + round) / divisor + */ +struct ise_size +{ + /** @brief The scaling parameter. */ + uint8_t scale:6; + + /** @brief The divisor parameter. */ + uint8_t divisor:2; +}; + +/** + * @brief The table of scale, round, and divisors needed for quant sizing. + */ +static const std::array<ise_size, 21> ise_sizes {{ + { 1, 0 }, // QUANT_2 + { 8, 2 }, // QUANT_3 + { 2, 0 }, // QUANT_4 + { 7, 1 }, // QUANT_5 + { 13, 2 }, // QUANT_6 + { 3, 0 }, // QUANT_8 + { 10, 1 }, // QUANT_10 + { 18, 2 }, // QUANT_12 + { 4, 0 }, // QUANT_16 + { 13, 1 }, // QUANT_20 + { 23, 2 }, // QUANT_24 + { 5, 0 }, // QUANT_32 + { 16, 1 }, // QUANT_40 + { 28, 2 }, // QUANT_48 + { 6, 0 }, // QUANT_64 + { 19, 1 }, // QUANT_80 + { 33, 2 }, // QUANT_96 + { 7, 0 }, // QUANT_128 + { 22, 1 }, // QUANT_160 + { 38, 2 }, // QUANT_192 + { 8, 0 } // QUANT_256 +}}; + +/* See header for documentation. */ +unsigned int get_ise_sequence_bitcount( + unsigned int character_count, + quant_method quant_level +) { + // Cope with out-of-bounds values - input might be invalid + if (static_cast<size_t>(quant_level) >= ise_sizes.size()) + { + // Arbitrary large number that's more than an ASTC block can hold + return 1024; + } + + auto& entry = ise_sizes[quant_level]; + unsigned int divisor = (entry.divisor << 1) + 1; + return (entry.scale * character_count + divisor - 1) / divisor; +} + +/** + * @brief Write up to 8 bits at an arbitrary bit offset. + * + * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may + * span two separate bytes in memory. + * + * @param value The value to write. + * @param bitcount The number of bits to write, starting from LSB. + * @param bitoffset The bit offset to store at, between 0 and 7. + * @param[in,out] ptr The data pointer to write to. + */ +static inline void write_bits( + unsigned int value, + unsigned int bitcount, + unsigned int bitoffset, + uint8_t ptr[2] +) { + unsigned int mask = (1 << bitcount) - 1; + value &= mask; + ptr += bitoffset >> 3; + bitoffset &= 7; + value <<= bitoffset; + mask <<= bitoffset; + mask = ~mask; + + ptr[0] &= mask; + ptr[0] |= value; + ptr[1] &= mask >> 8; + ptr[1] |= value >> 8; +} + +/** + * @brief Read up to 8 bits at an arbitrary bit offset. + * + * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may + * span two separate bytes in memory. + * + * @param bitcount The number of bits to read. + * @param bitoffset The bit offset to read from, between 0 and 7. + * @param ptr The data pointer to read from. + * + * @return The read value.
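+ * + * A small worked example (the byte values are illustrative only): + * @code + *     uint8_t data[2] { 0xB4, 0x01 }; + *     // Reads bits 6..7 of data[0] and bits 0..2 of data[1], LSB first, + *     // returning 0b00110 == 6 + *     unsigned int v = read_bits(5, 6, data); + * @endcode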
+ */ +static inline unsigned int read_bits( + unsigned int bitcount, + unsigned int bitoffset, + const uint8_t* ptr +) { + unsigned int mask = (1 << bitcount) - 1; + ptr += bitoffset >> 3; + bitoffset &= 7; + unsigned int value = ptr[0] | (ptr[1] << 8); + value >>= bitoffset; + value &= mask; + return value; +} + +/* See header for documentation. */ +void encode_ise( + quant_method quant_level, + unsigned int character_count, + const uint8_t* input_data, + uint8_t* output_data, + unsigned int bit_offset +) { + promise(character_count > 0); + + unsigned int bits = btq_counts[quant_level].bits; + unsigned int trits = btq_counts[quant_level].trits; + unsigned int quints = btq_counts[quant_level].quints; + unsigned int mask = (1 << bits) - 1; + + // Write out trits and bits + if (trits) + { + unsigned int i = 0; + unsigned int full_trit_blocks = character_count / 5; + + for (unsigned int j = 0; j < full_trit_blocks; j++) + { + unsigned int i4 = input_data[i + 4] >> bits; + unsigned int i3 = input_data[i + 3] >> bits; + unsigned int i2 = input_data[i + 2] >> bits; + unsigned int i1 = input_data[i + 1] >> bits; + unsigned int i0 = input_data[i + 0] >> bits; + + uint8_t T = integer_of_trits[i4][i3][i2][i1][i0]; + + // The max size of a trit bit count is 6, so we can always safely + // pack a single MX value with the following 1 or 2 T bits. + uint8_t pack; + + // Element 0 + T0 + T1 + pack = (input_data[i++] & mask) | (((T >> 0) & 0x3) << bits); + write_bits(pack, bits + 2, bit_offset, output_data); + bit_offset += bits + 2; + + // Element 1 + T2 + T3 + pack = (input_data[i++] & mask) | (((T >> 2) & 0x3) << bits); + write_bits(pack, bits + 2, bit_offset, output_data); + bit_offset += bits + 2; + + // Element 2 + T4 + pack = (input_data[i++] & mask) | (((T >> 4) & 0x1) << bits); + write_bits(pack, bits + 1, bit_offset, output_data); + bit_offset += bits + 1; + + // Element 3 + T5 + T6 + pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits); + write_bits(pack, bits + 2, bit_offset, output_data); + bit_offset += bits + 2; + + // Element 4 + T7 + pack = (input_data[i++] & mask) | (((T >> 7) & 0x1) << bits); + write_bits(pack, bits + 1, bit_offset, output_data); + bit_offset += bits + 1; + } + + // Loop tail for a partial block + if (i != character_count) + { + // i4 cannot be present - we know the block is partial + // i0 must be present - we know the block isn't empty + unsigned int i4 = 0; + unsigned int i3 = i + 3 >= character_count ? 0 : input_data[i + 3] >> bits; + unsigned int i2 = i + 2 >= character_count ? 0 : input_data[i + 2] >> bits; + unsigned int i1 = i + 1 >= character_count ? 
0 : input_data[i + 1] >> bits; + unsigned int i0 = input_data[i + 0] >> bits; + + uint8_t T = integer_of_trits[i4][i3][i2][i1][i0]; + + for (unsigned int j = 0; i < character_count; i++, j++) + { + // Truncated table as this iteration is always partial + static const uint8_t tbits[4] { 2, 2, 1, 2 }; + static const uint8_t tshift[4] { 0, 2, 4, 5 }; + + uint8_t pack = (input_data[i] & mask) | + (((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits); + + write_bits(pack, bits + tbits[j], bit_offset, output_data); + bit_offset += bits + tbits[j]; + } + } + } + // Write out quints and bits + else if (quints) + { + unsigned int i = 0; + unsigned int full_quint_blocks = character_count / 3; + + for (unsigned int j = 0; j < full_quint_blocks; j++) + { + unsigned int i2 = input_data[i + 2] >> bits; + unsigned int i1 = input_data[i + 1] >> bits; + unsigned int i0 = input_data[i + 0] >> bits; + + uint8_t T = integer_of_quints[i2][i1][i0]; + + // The max size of a quint bit count is 5, so we can always safely + // pack a single M value with the following 2 or 3 T bits. + uint8_t pack; + + // Element 0 + pack = (input_data[i++] & mask) | (((T >> 0) & 0x7) << bits); + write_bits(pack, bits + 3, bit_offset, output_data); + bit_offset += bits + 3; + + // Element 1 + pack = (input_data[i++] & mask) | (((T >> 3) & 0x3) << bits); + write_bits(pack, bits + 2, bit_offset, output_data); + bit_offset += bits + 2; + + // Element 2 + pack = (input_data[i++] & mask) | (((T >> 5) & 0x3) << bits); + write_bits(pack, bits + 2, bit_offset, output_data); + bit_offset += bits + 2; + } + + // Loop tail for a partial block + if (i != character_count) + { + // i2 cannot be present - we know the block is partial + // i0 must be present - we know the block isn't empty + unsigned int i2 = 0; + unsigned int i1 = i + 1 >= character_count ? 0 : input_data[i + 1] >> bits; + unsigned int i0 = input_data[i + 0] >> bits; + + uint8_t T = integer_of_quints[i2][i1][i0]; + + for (unsigned int j = 0; i < character_count; i++, j++) + { + // Truncated table as this iteration is always partial + static const uint8_t tbits[2] { 3, 2 }; + static const uint8_t tshift[2] { 0, 3 }; + + uint8_t pack = (input_data[i] & mask) | + (((T >> tshift[j]) & ((1 << tbits[j]) - 1)) << bits); + + write_bits(pack, bits + tbits[j], bit_offset, output_data); + bit_offset += bits + tbits[j]; + } + } + } + // Write out just bits + else + { + for (unsigned int i = 0; i < character_count; i++) + { + write_bits(input_data[i], bits, bit_offset, output_data); + bit_offset += bits; + } + } +} + +/* See header for documentation. */ +void decode_ise( + quant_method quant_level, + unsigned int character_count, + const uint8_t* input_data, + uint8_t* output_data, + unsigned int bit_offset +) { + promise(character_count > 0); + + // Note: due to how the trit/quint-block unpacking is done in this function, we may write more + // temporary results than the number of outputs. The maximum actual number of results is 64, + // but we keep 4 additional entries of padding. + uint8_t results[68]; + uint8_t tq_blocks[22] { 0 }; // Trit-blocks or quint-blocks, must be zeroed + + unsigned int bits = btq_counts[quant_level].bits; + unsigned int trits = btq_counts[quant_level].trits; + unsigned int quints = btq_counts[quant_level].quints; + + unsigned int lcounter = 0; + unsigned int hcounter = 0; + + // Collect bits for each element, as well as bits for any trit-blocks and quint-blocks.
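+ // As an illustrative note, for a trit-encoded sequence this mirrors the + // packing order used by encode_ise() above: element 0 bits then T[1:0], + // element 1 bits then T[3:2], element 2 bits then T[4], element 3 bits + // then T[6:5], and element 4 bits then T[7].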
+ for (unsigned int i = 0; i < character_count; i++) + { + results[i] = static_cast<uint8_t>(read_bits(bits, bit_offset, input_data)); + bit_offset += bits; + + if (trits) + { + static const uint8_t bits_to_read[5] { 2, 2, 1, 2, 1 }; + static const uint8_t block_shift[5] { 0, 2, 4, 5, 7 }; + static const uint8_t next_lcounter[5] { 1, 2, 3, 4, 0 }; + static const uint8_t hcounter_incr[5] { 0, 0, 0, 0, 1 }; + unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data); + bit_offset += bits_to_read[lcounter]; + tq_blocks[hcounter] |= tdata << block_shift[lcounter]; + hcounter += hcounter_incr[lcounter]; + lcounter = next_lcounter[lcounter]; + } + + if (quints) + { + static const uint8_t bits_to_read[3] { 3, 2, 2 }; + static const uint8_t block_shift[3] { 0, 3, 5 }; + static const uint8_t next_lcounter[3] { 1, 2, 0 }; + static const uint8_t hcounter_incr[3] { 0, 0, 1 }; + unsigned int tdata = read_bits(bits_to_read[lcounter], bit_offset, input_data); + bit_offset += bits_to_read[lcounter]; + tq_blocks[hcounter] |= tdata << block_shift[lcounter]; + hcounter += hcounter_incr[lcounter]; + lcounter = next_lcounter[lcounter]; + } + } + + // Unpack trit-blocks or quint-blocks as needed + if (trits) + { + unsigned int trit_blocks = (character_count + 4) / 5; + promise(trit_blocks > 0); + for (unsigned int i = 0; i < trit_blocks; i++) + { + const uint8_t *tritptr = trits_of_integer[tq_blocks[i]]; + results[5 * i ] |= tritptr[0] << bits; + results[5 * i + 1] |= tritptr[1] << bits; + results[5 * i + 2] |= tritptr[2] << bits; + results[5 * i + 3] |= tritptr[3] << bits; + results[5 * i + 4] |= tritptr[4] << bits; + } + } + + if (quints) + { + unsigned int quint_blocks = (character_count + 2) / 3; + promise(quint_blocks > 0); + for (unsigned int i = 0; i < quint_blocks; i++) + { + const uint8_t *quintptr = quints_of_integer[tq_blocks[i]]; + results[3 * i ] |= quintptr[0] << bits; + results[3 * i + 1] |= quintptr[1] << bits; + results[3 * i + 2] |= quintptr[2] << bits; + } + } + + for (unsigned int i = 0; i < character_count; i++) + { + output_data[i] = results[i]; + } +} diff --git a/thirdparty/astcenc/astcenc_internal.h b/thirdparty/astcenc/astcenc_internal.h new file mode 100644 index 0000000000..0aa8fa0f81 --- /dev/null +++ b/thirdparty/astcenc/astcenc_internal.h @@ -0,0 +1,2196 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions and data declarations. 
+ */ + +#ifndef ASTCENC_INTERNAL_INCLUDED +#define ASTCENC_INTERNAL_INCLUDED + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#if defined(ASTCENC_DIAGNOSTICS) + #include <cstdio> +#endif +#include <cstdlib> + +#include "astcenc.h" +#include "astcenc_mathlib.h" +#include "astcenc_vecmathlib.h" + +/** + * @brief Make a promise to the compiler's optimizer. + * + * A promise is an expression that the optimizer can assume is true, to help it generate + * faster code. Common use cases for this are to promise that a for loop will iterate more than + * once, or that the loop iteration count is a multiple of a vector length, which avoids pre-loop + * checks and can avoid loop tails if loops are unrolled by the auto-vectorizer. + */ +#if defined(NDEBUG) + #if !defined(__clang__) && defined(_MSC_VER) + #define promise(cond) __assume(cond) + #elif defined(__clang__) + #if __has_builtin(__builtin_assume) + #define promise(cond) __builtin_assume(cond) + #elif __has_builtin(__builtin_unreachable) + #define promise(cond) if (!(cond)) { __builtin_unreachable(); } + #else + #define promise(cond) + #endif + #else // Assume GCC + #define promise(cond) if (!(cond)) { __builtin_unreachable(); } + #endif +#else + #define promise(cond) assert(cond) +#endif + +/* ============================================================================ + Constants +============================================================================ */ +#if !defined(ASTCENC_BLOCK_MAX_TEXELS) + #define ASTCENC_BLOCK_MAX_TEXELS 216 // A 3D 6x6x6 block +#endif + +/** @brief The maximum number of texels a block can support (6x6x6 block). */ +static constexpr unsigned int BLOCK_MAX_TEXELS { ASTCENC_BLOCK_MAX_TEXELS }; + +/** @brief The maximum number of components a block can support. */ +static constexpr unsigned int BLOCK_MAX_COMPONENTS { 4 }; + +/** @brief The maximum number of partitions a block can support. */ +static constexpr unsigned int BLOCK_MAX_PARTITIONS { 4 }; + +/** @brief The number of partitionings, per partition count, supported by the ASTC format. */ +static constexpr unsigned int BLOCK_MAX_PARTITIONINGS { 1024 }; + +/** @brief The maximum number of texels used during partition selection for texel clustering. */ +static constexpr uint8_t BLOCK_MAX_KMEANS_TEXELS { 64 }; + +/** @brief The maximum number of weights a block can support. */ +static constexpr unsigned int BLOCK_MAX_WEIGHTS { 64 }; + +/** @brief The maximum number of weights a block can support per plane in 2 plane mode. */ +static constexpr unsigned int BLOCK_MAX_WEIGHTS_2PLANE { BLOCK_MAX_WEIGHTS / 2 }; + +/** @brief The minimum number of weight bits a candidate encoding must encode. */ +static constexpr unsigned int BLOCK_MIN_WEIGHT_BITS { 24 }; + +/** @brief The maximum number of weight bits a candidate encoding can encode. */ +static constexpr unsigned int BLOCK_MAX_WEIGHT_BITS { 96 }; + +/** @brief The index indicating a bad (unused) block mode in the remap array. */ +static constexpr uint16_t BLOCK_BAD_BLOCK_MODE { 0xFFFFu }; + +/** @brief The index indicating a bad (unused) partitioning in the remap array. */ +static constexpr uint16_t BLOCK_BAD_PARTITIONING { 0xFFFFu }; + +/** @brief The number of partition index bits supported by the ASTC format. */ +static constexpr unsigned int PARTITION_INDEX_BITS { 10 }; + +/** @brief The offset of the plane 2 weights in shared weight arrays. */ +static constexpr unsigned int WEIGHTS_PLANE2_OFFSET { BLOCK_MAX_WEIGHTS_2PLANE }; + +/** @brief The sum of quantized weights for one texel.
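+ * + * The integer bilinear contributions of the stored weights affecting any one + * texel (see @c decimation_info) are normalized so they sum to this value.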
*/ +static constexpr float WEIGHTS_TEXEL_SUM { 16.0f }; + +/** @brief The number of block modes supported by the ASTC format. */ +static constexpr unsigned int WEIGHTS_MAX_BLOCK_MODES { 2048 }; + +/** @brief The number of weight grid decimation modes supported by the ASTC format. */ +static constexpr unsigned int WEIGHTS_MAX_DECIMATION_MODES { 87 }; + +/** @brief The high default error used to initialize error trackers. */ +static constexpr float ERROR_CALC_DEFAULT { 1e30f }; + +/** + * @brief The minimum texel count for a block to use the one partition fast path. + * + * This setting skips 4x4 and 5x4 block sizes. + */ +static constexpr unsigned int TUNE_MIN_TEXELS_MODE0_FASTPATH { 24 }; + +/** + * @brief The maximum number of candidate encodings tested for each encoding mode. + * + * This can be dynamically reduced by the compression quality preset. + */ +static constexpr unsigned int TUNE_MAX_TRIAL_CANDIDATES { 8 }; + +/** + * @brief The maximum number of candidate partitionings tested for each encoding mode. + * + * This can be dynamically reduced by the compression quality preset. + */ +static constexpr unsigned int TUNE_MAX_PARTITIONING_CANDIDATES { 32 }; + +/** + * @brief The maximum quant level using full angular endpoint search method. + * + * The angular endpoint search is used to find the min/max weight that should + * be used for a given quantization level. It is effective but expensive, so + * we only use it where it has the most value - low quant levels with wide + * spacing. It is used below TUNE_MAX_ANGULAR_QUANT (inclusive). Above this we + * assume the min weight is 0.0f, and the max weight is 1.0f. + * + * Note the angular algorithm is vectorized, and using QUANT_12 exactly fills + * one 8-wide vector. Decreasing by one doesn't buy much performance, and + * increasing by one is disproportionately expensive. + */ +static constexpr unsigned int TUNE_MAX_ANGULAR_QUANT { 7 }; /* QUANT_12 */ + +static_assert((BLOCK_MAX_TEXELS % ASTCENC_SIMD_WIDTH) == 0, + "BLOCK_MAX_TEXELS must be multiple of ASTCENC_SIMD_WIDTH"); + +static_assert(BLOCK_MAX_TEXELS <= 216, + "BLOCK_MAX_TEXELS must not be greater than 216"); + +static_assert((BLOCK_MAX_WEIGHTS % ASTCENC_SIMD_WIDTH) == 0, + "BLOCK_MAX_WEIGHTS must be multiple of ASTCENC_SIMD_WIDTH"); + +static_assert((WEIGHTS_MAX_BLOCK_MODES % ASTCENC_SIMD_WIDTH) == 0, + "WEIGHTS_MAX_BLOCK_MODES must be multiple of ASTCENC_SIMD_WIDTH"); + + +/* ============================================================================ + Commonly used data structures +============================================================================ */ + +/** + * @brief The ASTC endpoint formats. + * + * Note, the values here are used directly in the encoding in the format so do not rearrange. + */ +enum endpoint_formats +{ + FMT_LUMINANCE = 0, + FMT_LUMINANCE_DELTA = 1, + FMT_HDR_LUMINANCE_LARGE_RANGE = 2, + FMT_HDR_LUMINANCE_SMALL_RANGE = 3, + FMT_LUMINANCE_ALPHA = 4, + FMT_LUMINANCE_ALPHA_DELTA = 5, + FMT_RGB_SCALE = 6, + FMT_HDR_RGB_SCALE = 7, + FMT_RGB = 8, + FMT_RGB_DELTA = 9, + FMT_RGB_SCALE_ALPHA = 10, + FMT_HDR_RGB = 11, + FMT_RGBA = 12, + FMT_RGBA_DELTA = 13, + FMT_HDR_RGB_LDR_ALPHA = 14, + FMT_HDR_RGBA = 15 +}; + +/** + * @brief The ASTC quantization methods. + * + * Note, the values here are used directly in the encoding in the format so do not rearrange. 
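+ * + * For example, QUANT_6 encodes six weight levels per value, stored as one bit + * plus a trit shared across a block of five values (2 * 3 = 6).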
+ */ +enum quant_method +{ + QUANT_2 = 0, + QUANT_3 = 1, + QUANT_4 = 2, + QUANT_5 = 3, + QUANT_6 = 4, + QUANT_8 = 5, + QUANT_10 = 6, + QUANT_12 = 7, + QUANT_16 = 8, + QUANT_20 = 9, + QUANT_24 = 10, + QUANT_32 = 11, + QUANT_40 = 12, + QUANT_48 = 13, + QUANT_64 = 14, + QUANT_80 = 15, + QUANT_96 = 16, + QUANT_128 = 17, + QUANT_160 = 18, + QUANT_192 = 19, + QUANT_256 = 20 +}; + +/** + * @brief The number of levels used by an ASTC quantization method. + * + * @param method The quantization method + * + * @return The number of levels used by @c method. + */ +static inline unsigned int get_quant_level(quant_method method) +{ + switch (method) + { + case QUANT_2: return 2; + case QUANT_3: return 3; + case QUANT_4: return 4; + case QUANT_5: return 5; + case QUANT_6: return 6; + case QUANT_8: return 8; + case QUANT_10: return 10; + case QUANT_12: return 12; + case QUANT_16: return 16; + case QUANT_20: return 20; + case QUANT_24: return 24; + case QUANT_32: return 32; + case QUANT_40: return 40; + case QUANT_48: return 48; + case QUANT_64: return 64; + case QUANT_80: return 80; + case QUANT_96: return 96; + case QUANT_128: return 128; + case QUANT_160: return 160; + case QUANT_192: return 192; + case QUANT_256: return 256; + } + + // Unreachable - the enum is fully described + return 0; +} + +/** + * @brief Computed metrics about a partition in a block. + */ +struct partition_metrics +{ + /** @brief The error-weighted average color in the partition. */ + vfloat4 avg; + + /** @brief The dominant error-weighted direction in the partition. */ + vfloat4 dir; +}; + +/** + * @brief Computed lines for a three component analysis. + */ +struct partition_lines3 +{ + /** @brief Line for uncorrelated chroma. */ + line3 uncor_line; + + /** @brief Line for correlated chroma, passing through the origin. */ + line3 samec_line; + + /** @brief Post-processed line for uncorrelated chroma. */ + processed_line3 uncor_pline; + + /** @brief Post-processed line for correlated chroma, passing through the origin. */ + processed_line3 samec_pline; + + /** @brief The length of the line for uncorrelated chroma. */ + float uncor_line_len; + + /** @brief The length of the line for correlated chroma. */ + float samec_line_len; +}; + +/** + * @brief The partition information for a single partition. + * + * ASTC has a total of 1024 candidate partitions for each of 2/3/4 partition counts, although this + * 1024 includes seeds that generate duplicates of other seeds and seeds that generate completely + * empty partitions. These are both valid encodings, but astcenc will skip both during compression + * as they are not useful. + */ +struct partition_info +{ + /** @brief The number of partitions in this partitioning. */ + uint16_t partition_count; + + /** @brief The index (seed) of this partitioning. */ + uint16_t partition_index; + + /** + * @brief The number of texels in each partition. + * + * Note that some seeds assign zero texels to a partition; these are valid encodings, but are + * skipped by this compressor as there is no point spending bits encoding an unused color endpoint. + */ + uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS]; + + /** @brief The partition of each texel in the block. */ + uint8_t partition_of_texel[BLOCK_MAX_TEXELS]; + + /** @brief The list of texels in each partition. */ + uint8_t texels_of_partition[BLOCK_MAX_PARTITIONS][BLOCK_MAX_TEXELS]; +}; + +/** + * @brief The weight grid information for a single decimation pattern.
+ * + * ASTC can store one weight per texel, but is also capable of storing lower resolution weight grids + * that are interpolated during decompression to assign a weight to a texel. Storing fewer weights + * can free up a substantial amount of bits that we can then spend on more useful things, such as + * more accurate endpoints and weights, or additional partitions. + * + * This data structure is used to store information about a single weight grid decimation pattern, + * for a single block size. + */ +struct decimation_info +{ + /** @brief The total number of texels in the block. */ + uint8_t texel_count; + + /** @brief The maximum number of stored weights that contribute to each texel, between 1 and 4. */ + uint8_t max_texel_weight_count; + + /** @brief The total number of weights stored. */ + uint8_t weight_count; + + /** @brief The number of stored weights in the X dimension. */ + uint8_t weight_x; + + /** @brief The number of stored weights in the Y dimension. */ + uint8_t weight_y; + + /** @brief The number of stored weights in the Z dimension. */ + uint8_t weight_z; + + /** + * @brief The number of weights that contribute to each texel. + * Value is between 1 and 4. + */ + uint8_t texel_weight_count[BLOCK_MAX_TEXELS]; + + /** + * @brief The weight index of the N weights that are interpolated for each texel. + * Stored transposed to improve vectorization. + */ + uint8_t texel_weights_tr[4][BLOCK_MAX_TEXELS]; + + /** + * @brief The bilinear contribution of the N weights that are interpolated for each texel. + * Value is between 0 and 16, stored transposed to improve vectorization. + */ + uint8_t texel_weight_contribs_int_tr[4][BLOCK_MAX_TEXELS]; + + /** + * @brief The bilinear contribution of the N weights that are interpolated for each texel. + * Value is between 0 and 1, stored transposed to improve vectorization. + */ + alignas(ASTCENC_VECALIGN) float texel_weight_contribs_float_tr[4][BLOCK_MAX_TEXELS]; + + /** @brief The number of texels that each stored weight contributes to. */ + uint8_t weight_texel_count[BLOCK_MAX_WEIGHTS]; + + /** + * @brief The list of texels that use a specific weight index. + * Stored transposed to improve vectorization. + */ + uint8_t weight_texels_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; + + /** + * @brief The bilinear contribution to the N texels that use each weight. + * Value is between 0 and 1, stored transposed to improve vectorization. + */ + alignas(ASTCENC_VECALIGN) float weights_texel_contribs_tr[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; + + /** + * @brief The bilinear contribution to the Nth texel that uses each weight. + * Value is between 0 and 1, stored transposed to improve vectorization. + */ + float texel_contrib_for_weight[BLOCK_MAX_TEXELS][BLOCK_MAX_WEIGHTS]; +}; + +/** + * @brief Metadata for single block mode for a specific block size. + */ +struct block_mode +{ + /** @brief The block mode index in the ASTC encoded form. */ + uint16_t mode_index; + + /** @brief The decimation mode index in the compressor reindexed list. */ + uint8_t decimation_mode; + + /** @brief The weight quantization used by this block mode. */ + uint8_t quant_mode; + + /** @brief The number of weight bits used by this block mode. */ + uint8_t weight_bits; + + /** @brief Is a dual weight plane used by this block mode? */ + uint8_t is_dual_plane : 1; + + /** + * @brief Get the weight quantization used by this block mode. + * + * @return The quantization level.
+ */ + inline quant_method get_weight_quant_mode() const + { + return static_cast<quant_method>(this->quant_mode); + } +}; + +/** + * @brief Metadata for single decimation mode for a specific block size. + */ +struct decimation_mode +{ + /** @brief The max weight precision for 1 plane, or -1 if not supported. */ + int8_t maxprec_1plane; + + /** @brief The max weight precision for 2 planes, or -1 if not supported. */ + int8_t maxprec_2planes; + + /** + * @brief Bitvector indicating weight quant modes used by active 1 plane block modes. + * + * Bit 0 = QUANT_2, Bit 1 = QUANT_3, etc. + */ + uint16_t refprec_1_plane; + + /** + * @brief Bitvector indicating weight quant methods used by active 2 plane block modes. + * + * Bit 0 = QUANT_2, Bit 1 = QUANT_3, etc. + */ + uint16_t refprec_2_planes; + + /** + * @brief Set a 1 plane weight quant as active. + * + * @param weight_quant The quant method to set. + */ + void set_ref_1_plane(quant_method weight_quant) + { + refprec_1_plane |= (1 << weight_quant); + } + + /** + * @brief Test if this mode is active below a given 1 plane weight quant (inclusive). + * + * @param max_weight_quant The max quant method to test. + */ + bool is_ref_1_plane(quant_method max_weight_quant) const + { + uint16_t mask = static_cast<uint16_t>((1 << (max_weight_quant + 1)) - 1); + return (refprec_1_plane & mask) != 0; + } + + /** + * @brief Set a 2 plane weight quant as active. + * + * @param weight_quant The quant method to set. + */ + void set_ref_2_plane(quant_method weight_quant) + { + refprec_2_planes |= static_cast<uint16_t>(1 << weight_quant); + } + + /** + * @brief Test if this mode is active below a given 2 plane weight quant (inclusive). + * + * @param max_weight_quant The max quant method to test. + */ + bool is_ref_2_plane(quant_method max_weight_quant) const + { + uint16_t mask = static_cast<uint16_t>((1 << (max_weight_quant + 1)) - 1); + return (refprec_2_planes & mask) != 0; + } +}; + +/** + * @brief Data tables for a single block size. + * + * The decimation tables store the information to apply weight grid dimension reductions. We only + * store the decimation modes that are actually needed by the current context; many of the possible + * modes will be unused (too many weights for the current block size or disabled by heuristics). The + * actual number of modes stored is given by the @c decimation_mode_count_* fields, and the + * @c decimation_modes and @c decimation_tables arrays store the active modes contiguously at the + * start of the array. These entries are not stored in any particular order. + * + * The block mode tables store the unpacked block mode settings. Block modes are stored in the + * compressed block as an 11 bit field, but for any given block size and set of compressor + * heuristics, only a subset of the block modes will be used. The actual number of block modes + * stored is indicated by the @c block_mode_count_* fields, and the @c block_modes array stores the + * active modes contiguously at the start of the array. These entries are stored in incrementing + * "packed" value order, which doesn't mean much once unpacked. To allow decompressors to reference + * the packed data efficiently the @c block_mode_packed_index array stores the mapping between + * physical ID and the actual remapped array index. + */ +struct block_size_descriptor +{ + /** @brief The block X dimension, in texels. */ + uint8_t xdim; + + /** @brief The block Y dimension, in texels. */ + uint8_t ydim; + + /** @brief The block Z dimension, in texels.
*/ + uint8_t zdim; + + /** @brief The block total texel count. */ + uint8_t texel_count; + + /** + * @brief The number of stored decimation modes which are "always" modes. + * + * Always modes are stored at the start of the decimation_modes list. + */ + unsigned int decimation_mode_count_always; + + /** @brief The number of stored decimation modes for selected encodings. */ + unsigned int decimation_mode_count_selected; + + /** @brief The number of stored decimation modes for any encoding. */ + unsigned int decimation_mode_count_all; + + /** + * @brief The number of stored block modes which are "always" modes. + * + * Always modes are stored at the start of the block_modes list. + */ + unsigned int block_mode_count_1plane_always; + + /** @brief The number of stored block modes for active 1 plane encodings. */ + unsigned int block_mode_count_1plane_selected; + + /** @brief The number of stored block modes for active 1 and 2 plane encodings. */ + unsigned int block_mode_count_1plane_2plane_selected; + + /** @brief The number of stored block modes for any encoding. */ + unsigned int block_mode_count_all; + + /** @brief The number of selected partitionings for 1/2/3/4 partitionings. */ + unsigned int partitioning_count_selected[BLOCK_MAX_PARTITIONS]; + + /** @brief The number of partitionings for 1/2/3/4 partitionings. */ + unsigned int partitioning_count_all[BLOCK_MAX_PARTITIONS]; + + /** @brief The active decimation modes, stored in low indices. */ + decimation_mode decimation_modes[WEIGHTS_MAX_DECIMATION_MODES]; + + /** @brief The active decimation tables, stored in low indices. */ + alignas(ASTCENC_VECALIGN) decimation_info decimation_tables[WEIGHTS_MAX_DECIMATION_MODES]; + + /** @brief The packed block mode array index, or @c BLOCK_BAD_BLOCK_MODE if not active. */ + uint16_t block_mode_packed_index[WEIGHTS_MAX_BLOCK_MODES]; + + /** @brief The active block modes, stored in low indices. */ + block_mode block_modes[WEIGHTS_MAX_BLOCK_MODES]; + + /** @brief The active partition tables, stored in low indices per-count. */ + partition_info partitionings[(3 * BLOCK_MAX_PARTITIONINGS) + 1]; + + /** + * @brief The packed partition table array index, or @c BLOCK_BAD_PARTITIONING if not active. + * + * Indexed by partition_count - 2, containing 2, 3 and 4 partitions. + */ + uint16_t partitioning_packed_index[3][BLOCK_MAX_PARTITIONINGS]; + + /** @brief The active texels for k-means partition selection. */ + uint8_t kmeans_texels[BLOCK_MAX_KMEANS_TEXELS]; + + /** + * @brief The canonical 2-partition coverage pattern used during block partition search. + * + * Indexed by remapped index, not physical index. + */ + uint64_t coverage_bitmaps_2[BLOCK_MAX_PARTITIONINGS][2]; + + /** + * @brief The canonical 3-partition coverage pattern used during block partition search. + * + * Indexed by remapped index, not physical index. + */ + uint64_t coverage_bitmaps_3[BLOCK_MAX_PARTITIONINGS][3]; + + /** + * @brief The canonical 4-partition coverage pattern used during block partition search. + * + * Indexed by remapped index, not physical index. + */ + uint64_t coverage_bitmaps_4[BLOCK_MAX_PARTITIONINGS][4]; + + /** + * @brief Get the block mode structure for index @c block_mode. + * + * This function can only return block modes that are enabled by the current compressor config. + * Decompression from an arbitrary source should not use this without first checking that the + * packed block mode index is not @c BLOCK_BAD_BLOCK_MODE. + * + * @param block_mode The packed block mode index. 
+ * + * @return The block mode structure. + */ + const block_mode& get_block_mode(unsigned int block_mode) const + { + unsigned int packed_index = this->block_mode_packed_index[block_mode]; + assert(packed_index != BLOCK_BAD_BLOCK_MODE && packed_index < this->block_mode_count_all); + return this->block_modes[packed_index]; + } + + /** + * @brief Get the decimation mode structure for index @c decimation_mode. + * + * This function can only return decimation modes that are enabled by the current compressor + * config. The mode array is stored packed, but this is only ever indexed by the packed index + * stored in the @c block_mode and never exists in an unpacked form. + * + * @param decimation_mode The packed decimation mode index. + * + * @return The decimation mode structure. + */ + const decimation_mode& get_decimation_mode(unsigned int decimation_mode) const + { + return this->decimation_modes[decimation_mode]; + } + + /** + * @brief Get the decimation info structure for index @c decimation_mode. + * + * This function can only return decimation modes that are enabled by the current compressor + * config. The mode array is stored packed, but this is only ever indexed by the packed index + * stored in the @c block_mode and never exists in an unpacked form. + * + * @param decimation_mode The packed decimation mode index. + * + * @return The decimation info structure. + */ + const decimation_info& get_decimation_info(unsigned int decimation_mode) const + { + return this->decimation_tables[decimation_mode]; + } + + /** + * @brief Get the partition info table for a given partition count. + * + * @param partition_count The number of partitions we want the table for. + * + * @return The pointer to the table of 1024 entries (for 2/3/4 parts) or 1 entry (for 1 part). + */ + const partition_info* get_partition_table(unsigned int partition_count) const + { + if (partition_count == 1) + { + partition_count = 5; + } + unsigned int index = (partition_count - 2) * BLOCK_MAX_PARTITIONINGS; + return this->partitionings + index; + } + + /** + * @brief Get the partition info structure for a given partition count and seed. + * + * @param partition_count The number of partitions we want the info for. + * @param index The partition seed (between 0 and 1023). + * + * @return The partition info structure. + */ + const partition_info& get_partition_info(unsigned int partition_count, unsigned int index) const + { + unsigned int packed_index = 0; + if (partition_count >= 2) + { + packed_index = this->partitioning_packed_index[partition_count - 2][index]; + } + + assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count_all[partition_count - 1]); + auto& result = get_partition_table(partition_count)[packed_index]; + assert(index == result.partition_index); + return result; + } + + /** + * @brief Get the partition info structure for a given partition count and seed. + * + * @param partition_count The number of partitions we want the info for. + * @param packed_index The raw array offset. + * + * @return The partition info structure. + */ + const partition_info& get_raw_partition_info(unsigned int partition_count, unsigned int packed_index) const + { + assert(packed_index != BLOCK_BAD_PARTITIONING && packed_index < this->partitioning_count_all[partition_count - 1]); + auto& result = get_partition_table(partition_count)[packed_index]; + return result; + } +}; + +/** + * @brief The image data for a single block. 
+ * + * The @c data_[rgba] fields store the image data in an encoded SoA float form designed for easy + * vectorization. Input data is converted to float and stored as values between 0 and 65535. LDR + * data is stored as direct UNORM data; HDR data is stored as LNS data. + * + * The @c rgb_lns and @c alpha_lns fields that assign a per-texel use of HDR are only used during + * decompression. The current compressor will always use HDR endpoint formats when in HDR mode. + */ +struct image_block +{ + /** @brief The input (compress) or output (decompress) data for the red color component. */ + alignas(ASTCENC_VECALIGN) float data_r[BLOCK_MAX_TEXELS]; + + /** @brief The input (compress) or output (decompress) data for the green color component. */ + alignas(ASTCENC_VECALIGN) float data_g[BLOCK_MAX_TEXELS]; + + /** @brief The input (compress) or output (decompress) data for the blue color component. */ + alignas(ASTCENC_VECALIGN) float data_b[BLOCK_MAX_TEXELS]; + + /** @brief The input (compress) or output (decompress) data for the alpha color component. */ + alignas(ASTCENC_VECALIGN) float data_a[BLOCK_MAX_TEXELS]; + + /** @brief The number of texels in the block. */ + uint8_t texel_count; + + /** @brief The original data for texel 0 for constant color block encoding. */ + vfloat4 origin_texel; + + /** @brief The min component value of all texels in the block. */ + vfloat4 data_min; + + /** @brief The mean component value of all texels in the block. */ + vfloat4 data_mean; + + /** @brief The max component value of all texels in the block. */ + vfloat4 data_max; + + /** @brief The relative error significance of the color channels. */ + vfloat4 channel_weight; + + /** @brief Is this a grayscale block where R == G == B for all texels? */ + bool grayscale; + + /** @brief Set to 1 if a texel is using HDR RGB endpoints (decompression only). */ + uint8_t rgb_lns[BLOCK_MAX_TEXELS]; + + /** @brief Set to 1 if a texel is using HDR alpha endpoints (decompression only). */ + uint8_t alpha_lns[BLOCK_MAX_TEXELS]; + + /** @brief The X position of this block in the input or output image. */ + unsigned int xpos; + + /** @brief The Y position of this block in the input or output image. */ + unsigned int ypos; + + /** @brief The Z position of this block in the input or output image. */ + unsigned int zpos; + + /** + * @brief Get an RGBA texel value from the data. + * + * @param index The texel index. + * + * @return The texel in RGBA component ordering. + */ + inline vfloat4 texel(unsigned int index) const + { + return vfloat4(data_r[index], + data_g[index], + data_b[index], + data_a[index]); + } + + /** + * @brief Get an RGB texel value from the data. + * + * @param index The texel index. + * + * @return The texel in RGB0 component ordering. + */ + inline vfloat4 texel3(unsigned int index) const + { + return vfloat3(data_r[index], + data_g[index], + data_b[index]); + } + + /** + * @brief Get the default alpha value for endpoints that don't store it. + * + * The default depends on whether the alpha endpoint is LDR or HDR. + * + * @return The alpha value in the scaled range used by the compressor. + */ + inline float get_default_alpha() const + { + return this->alpha_lns[0] ? static_cast<float>(0x7800) : static_cast<float>(0xFFFF); + } + + /** + * @brief Test if a single color channel is constant across the block. + * + * Constant color channels are easier to compress as interpolating between two identical colors + * always returns the same value, irrespective of the weight used.
They therefore can be ignored + * for the purposes of weight selection and use of a second weight plane. + * + * @return @c true if the channel is constant across the block, @c false otherwise. + */ + inline bool is_constant_channel(int channel) const + { + vmask4 lane_mask = vint4::lane_id() == vint4(channel); + vmask4 color_mask = this->data_min == this->data_max; + return any(lane_mask & color_mask); + } + + /** + * @brief Test if this block is a luminance block with constant 1.0 alpha. + * + * @return @c true if the block is a luminance block, @c false otherwise. + */ + inline bool is_luminance() const + { + float default_alpha = this->get_default_alpha(); + bool alpha1 = (this->data_min.lane<3>() == default_alpha) && + (this->data_max.lane<3>() == default_alpha); + return this->grayscale && alpha1; + } + + /** + * @brief Test if this block is a luminance block with variable alpha. + * + * @return @c true if the block is a luminance + alpha block, @c false otherwise. + */ + inline bool is_luminancealpha() const + { + float default_alpha = this->get_default_alpha(); + bool alpha1 = (this->data_min.lane<3>() == default_alpha) && + (this->data_max.lane<3>() == default_alpha); + return this->grayscale && !alpha1; + } +}; + +/** + * @brief Data structure storing the color endpoints for a block. + */ +struct endpoints +{ + /** @brief The number of partition endpoints stored. */ + unsigned int partition_count; + + /** @brief The colors for endpoint 0. */ + vfloat4 endpt0[BLOCK_MAX_PARTITIONS]; + + /** @brief The colors for endpoint 1. */ + vfloat4 endpt1[BLOCK_MAX_PARTITIONS]; +}; + +/** + * @brief Data structure storing the color endpoints and weights. + */ +struct endpoints_and_weights +{ + /** @brief True if all active values in weight_error_scale are the same. */ + bool is_constant_weight_error_scale; + + /** @brief The color endpoints. */ + endpoints ep; + + /** @brief The ideal weight for each texel; may be undecimated or decimated. */ + alignas(ASTCENC_VECALIGN) float weights[BLOCK_MAX_TEXELS]; + + /** @brief The ideal weight error scaling for each texel; may be undecimated or decimated. */ + alignas(ASTCENC_VECALIGN) float weight_error_scale[BLOCK_MAX_TEXELS]; +}; + +/** + * @brief Utility storing estimated errors from choosing particular endpoint encodings. + */ +struct encoding_choice_errors +{ + /** @brief Error of using LDR RGB-scale instead of complete endpoints. */ + float rgb_scale_error; + + /** @brief Error of using HDR RGB-scale instead of complete endpoints. */ + float rgb_luma_error; + + /** @brief Error of using luminance instead of RGB. */ + float luminance_error; + + /** @brief Error of discarding alpha and using a constant 1.0 alpha. */ + float alpha_drop_error; + + /** @brief Can we use delta offset encoding? */ + bool can_offset_encode; + + /** @brief Can we use blue contraction encoding? */ + bool can_blue_contract; +}; + +/** + * @brief Preallocated working buffers, allocated per thread during context creation. + */ +struct alignas(ASTCENC_VECALIGN) compression_working_buffers +{ + /** @brief Ideal endpoints and weights for plane 1. */ + endpoints_and_weights ei1; + + /** @brief Ideal endpoints and weights for plane 2. */ + endpoints_and_weights ei2; + + /** + * @brief Decimated ideal weight values in the ~0-1 range. + * + * Note that values can be slightly below zero or higher than one due to + * endpoint extents being inside the ideal color representation. + * + * For two planes, second plane starts at @c WEIGHTS_PLANE2_OFFSET offsets.
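+ * + * For example, the plane 2 copy of weight @c i for a given decimation mode + * is stored at offset @c WEIGHTS_PLANE2_OFFSET + i within that mode's slice.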
+
+/**
+ * @brief Preallocated working buffers, allocated per thread during context creation.
+ */
+struct alignas(ASTCENC_VECALIGN) compression_working_buffers
+{
+	/** @brief Ideal endpoints and weights for plane 1. */
+	endpoints_and_weights ei1;
+
+	/** @brief Ideal endpoints and weights for plane 2. */
+	endpoints_and_weights ei2;
+
+	/**
+	 * @brief Decimated ideal weight values in the ~0-1 range.
+	 *
+	 * Note that values can be slightly below zero or higher than one due to
+	 * endpoint extents being inside the ideal color representation.
+	 *
+	 * For two planes, the second plane starts at the @c WEIGHTS_PLANE2_OFFSET offset.
+	 */
+	alignas(ASTCENC_VECALIGN) float dec_weights_ideal[WEIGHTS_MAX_DECIMATION_MODES * BLOCK_MAX_WEIGHTS];
+
+	/**
+	 * @brief Decimated quantized weight values in the unquantized 0-64 range.
+	 *
+	 * For two planes, the second plane starts at the @c WEIGHTS_PLANE2_OFFSET offset.
+	 */
+	uint8_t dec_weights_uquant[WEIGHTS_MAX_BLOCK_MODES * BLOCK_MAX_WEIGHTS];
+
+	/** @brief Error of the best encoding combination for each block mode. */
+	alignas(ASTCENC_VECALIGN) float errors_of_best_combination[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best color quant for each block mode. */
+	uint8_t best_quant_levels[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best color quant for each block mode if modes are the same and we have spare bits. */
+	uint8_t best_quant_levels_mod[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The best endpoint format for each partition. */
+	uint8_t best_ep_formats[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS];
+
+	/** @brief The total bit storage needed for quantized weights for each block mode. */
+	int8_t qwt_bitcounts[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The cumulative error for quantized weights for each block mode. */
+	float qwt_errors[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 1 for each block mode. */
+	float weight_low_value1[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The high weight value in plane 1 for each block mode. */
+	float weight_high_value1[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 1 for each quant level and decimation mode. */
+	float weight_low_values1[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
+
+	/** @brief The high weight value in plane 1 for each quant level and decimation mode. */
+	float weight_high_values1[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
+
+	/** @brief The low weight value in plane 2 for each block mode. */
+	float weight_low_value2[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The high weight value in plane 2 for each block mode. */
+	float weight_high_value2[WEIGHTS_MAX_BLOCK_MODES];
+
+	/** @brief The low weight value in plane 2 for each quant level and decimation mode. */
+	float weight_low_values2[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
+
+	/** @brief The high weight value in plane 2 for each quant level and decimation mode. */
+	float weight_high_values2[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1];
+};
+
+struct dt_init_working_buffers
+{
+	uint8_t weight_count_of_texel[BLOCK_MAX_TEXELS];
+	uint8_t grid_weights_of_texel[BLOCK_MAX_TEXELS][4];
+	uint8_t weights_of_texel[BLOCK_MAX_TEXELS][4];
+
+	uint8_t texel_count_of_weight[BLOCK_MAX_WEIGHTS];
+	uint8_t texels_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
+	uint8_t texel_weights_of_weight[BLOCK_MAX_WEIGHTS][BLOCK_MAX_TEXELS];
+};
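+
+// Illustrative sketch (hypothetical helper, not part of the upstream header):
+// dual-plane weights share the flat buffers above, with plane 2 starting at
+// WEIGHTS_PLANE2_OFFSET as the member comments describe. Assuming the offset
+// is applied within the active decimation mode's slot, fetching the pair of
+// ideal weights for one weight index could look like this.
+static inline void fetch_ideal_weight_pair(
+	const compression_working_buffers& tmpbuf,
+	unsigned int mode_offset,   // start of the active decimation mode's weights
+	unsigned int weight_index,  // weight index within the grid
+	float& plane1_weight,
+	float& plane2_weight)
+{
+	plane1_weight = tmpbuf.dec_weights_ideal[mode_offset + weight_index];
+	plane2_weight = tmpbuf.dec_weights_ideal[mode_offset + WEIGHTS_PLANE2_OFFSET + weight_index];
+}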
+
+/**
+ * @brief Weight quantization transfer table.
+ *
+ * ASTC can store texel weights at many quantization levels, so for performance we store essential
+ * information about each level as a precomputed data structure. Unquantized weights are integers
+ * or floats in the range [0, 64].
+ *
+ * This structure provides a table used to estimate the closest quantized weight for a given
+ * floating-point weight. For each quantized weight it stores the corresponding unquantized value,
+ * as well as the previous and next quantized values.
+ */
+struct quant_and_transfer_table
+{
+	/** @brief The unscrambled unquantized value. */
+	int8_t quant_to_unquant[32];
+
+	/** @brief The scrambling order: scrambled_quant = map[unscrambled_quant]. */
+	int8_t scramble_map[32];
+
+	/** @brief The unscrambling order: unscrambled_unquant = map[scrambled_quant]. */
+	int8_t unscramble_and_unquant_map[32];
+
+	/**
+	 * @brief A table of previous-and-next weights, indexed by the current unquantized value.
+	 *
+	 *  * bits 7:0 = previous-index, unquantized
+	 *  * bits 15:8 = next-index, unquantized
+	 */
+	uint16_t prev_next_values[65];
+};
+
+/** @brief The precomputed quant and transfer table. */
+extern const quant_and_transfer_table quant_and_xfer_tables[12];
+
+/** @brief The block is an error block, and will return error color or NaN. */
+static constexpr uint8_t SYM_BTYPE_ERROR { 0 };
+
+/** @brief The block is a constant color block using FP16 colors. */
+static constexpr uint8_t SYM_BTYPE_CONST_F16 { 1 };
+
+/** @brief The block is a constant color block using UNORM16 colors. */
+static constexpr uint8_t SYM_BTYPE_CONST_U16 { 2 };
+
+/** @brief The block is a normal non-constant color block. */
+static constexpr uint8_t SYM_BTYPE_NONCONST { 3 };
+
+/**
+ * @brief A symbolic representation of a compressed block.
+ *
+ * The symbolic representation stores the unpacked content of a single
+ * @c physical_compressed_block, in a form which is much easier to access for
+ * the rest of the compressor code.
+ */
+struct symbolic_compressed_block
+{
+	/** @brief The block type, one of the @c SYM_BTYPE_* constants. */
+	uint8_t block_type;
+
+	/** @brief The number of partitions; valid for @c NONCONST blocks. */
+	uint8_t partition_count;
+
+	/** @brief Non-zero if the color formats matched; valid for @c NONCONST blocks. */
+	uint8_t color_formats_matched;
+
+	/** @brief The plane 2 color component, or -1 if single plane; valid for @c NONCONST blocks. */
+	int8_t plane2_component;
+
+	/** @brief The block mode; valid for @c NONCONST blocks. */
+	uint16_t block_mode;
+
+	/** @brief The partition index; valid for @c NONCONST blocks if 2 or more partitions. */
+	uint16_t partition_index;
+
+	/** @brief The endpoint color formats for each partition; valid for @c NONCONST blocks. */
+	uint8_t color_formats[BLOCK_MAX_PARTITIONS];
+
+	/** @brief The endpoint color quant mode; valid for @c NONCONST blocks. */
+	quant_method quant_mode;
+
+	/** @brief The error of the current encoding; valid for @c NONCONST blocks. */
+	float errorval;
+
+	// We can't have both of these at the same time
+	union {
+		/** @brief The constant color; valid for @c CONST blocks. */
+		int constant_color[BLOCK_MAX_COMPONENTS];
+
+		/** @brief The quantized endpoint color pairs; valid for @c NONCONST blocks. */
+		uint8_t color_values[BLOCK_MAX_PARTITIONS][8];
+	};
+
+	/** @brief The quantized and decimated weights.
+	 *
+	 * Weights are stored in the 0-64 unpacked range allowing them to be used
+	 * directly in encoding passes without per-use unpacking. Packing happens
+	 * when converting to/from the physical bitstream encoding.
+	 *
+	 * If dual plane, the second plane starts at @c weights[WEIGHTS_PLANE2_OFFSET].
+	 */
+	uint8_t weights[BLOCK_MAX_WEIGHTS];
+
+	/**
+	 * @brief Get the color quantization used by this block.
+	 *
+	 * @return The quantization level.
+	 */
+	inline quant_method get_color_quant_mode() const
+	{
+		return this->quant_mode;
+	}
+};
+
+/**
+ * @brief A physical representation of a compressed block.
+ *
+ * The physical representation stores the raw bytes of the format in memory.
+ */
+struct physical_compressed_block
+{
+	/** @brief The ASTC encoded data for a single block.
*/ + uint8_t data[16]; +}; + + +/** + * @brief Parameter structure for @c compute_pixel_region_variance(). + * + * This function takes a structure to avoid spilling arguments to the stack on every function + * invocation, as there are a lot of parameters. + */ +struct pixel_region_args +{ + /** @brief The image to analyze. */ + const astcenc_image* img; + + /** @brief The component swizzle pattern. */ + astcenc_swizzle swz; + + /** @brief Should the algorithm bother with Z axis processing? */ + bool have_z; + + /** @brief The kernel radius for alpha processing. */ + unsigned int alpha_kernel_radius; + + /** @brief The X dimension of the working data to process. */ + unsigned int size_x; + + /** @brief The Y dimension of the working data to process. */ + unsigned int size_y; + + /** @brief The Z dimension of the working data to process. */ + unsigned int size_z; + + /** @brief The X position of first src and dst data in the data set. */ + unsigned int offset_x; + + /** @brief The Y position of first src and dst data in the data set. */ + unsigned int offset_y; + + /** @brief The Z position of first src and dst data in the data set. */ + unsigned int offset_z; + + /** @brief The working memory buffer. */ + vfloat4 *work_memory; +}; + +/** + * @brief Parameter structure for @c compute_averages_proc(). + */ +struct avg_args +{ + /** @brief The arguments for the nested variance computation. */ + pixel_region_args arg; + + /** @brief The image X dimensions. */ + unsigned int img_size_x; + + /** @brief The image Y dimensions. */ + unsigned int img_size_y; + + /** @brief The image Z dimensions. */ + unsigned int img_size_z; + + /** @brief The maximum working block dimensions in X and Y dimensions. */ + unsigned int blk_size_xy; + + /** @brief The maximum working block dimensions in Z dimensions. */ + unsigned int blk_size_z; + + /** @brief The working block memory size. */ + unsigned int work_memory_size; +}; + +#if defined(ASTCENC_DIAGNOSTICS) +/* See astcenc_diagnostic_trace header for details. */ +class TraceLog; +#endif + +/** + * @brief The astcenc compression context. + */ +struct astcenc_contexti +{ + /** @brief The configuration this context was created with. */ + astcenc_config config; + + /** @brief The thread count supported by this context. */ + unsigned int thread_count; + + /** @brief The block size descriptor this context was created with. */ + block_size_descriptor* bsd; + + /* + * Fields below here are not needed in a decompress-only build, but some remain as they are + * small and it avoids littering the code with #ifdefs. The most significant contributors to + * large structure size are omitted. + */ + + /** @brief The input image alpha channel averages table, may be @c nullptr if not needed. */ + float* input_alpha_averages; + + /** @brief The scratch working buffers, one per thread (see @c thread_count). */ + compression_working_buffers* working_buffers; + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + /** @brief The pixel region and variance worker arguments. */ + avg_args avg_preprocess_args; +#endif + +#if defined(ASTCENC_DIAGNOSTICS) + /** + * @brief The diagnostic trace logger. + * + * Note that this is a singleton, so can only be used in single threaded mode. It only exists + * here so we have a reference to close the file at the end of the capture. + */ + TraceLog* trace_log; +#endif +}; + +/* ============================================================================ + Functionality for managing block sizes and partition tables. 
+============================================================================ */
+
+/**
+ * @brief Populate the block size descriptor for the target block size.
+ *
+ * This will also initialize the partition table metadata, which is stored as part of the BSD
+ * structure.
+ *
+ * @param x_texels The number of texels in the block X dimension.
+ * @param y_texels The number of texels in the block Y dimension.
+ * @param z_texels The number of texels in the block Z dimension.
+ * @param can_omit_modes Can we discard modes and partitionings that astcenc won't use?
+ * @param partition_count_cutoff The partition count cutoff to use, if we can omit partitionings.
+ * @param mode_cutoff The block mode percentile cutoff [0-1].
+ * @param[out] bsd The descriptor to initialize.
+ */
+void init_block_size_descriptor(
+	unsigned int x_texels,
+	unsigned int y_texels,
+	unsigned int z_texels,
+	bool can_omit_modes,
+	unsigned int partition_count_cutoff,
+	float mode_cutoff,
+	block_size_descriptor& bsd);
+
+/**
+ * @brief Populate the partition tables for the target block size.
+ *
+ * Note the @c bsd descriptor must be initialized by calling @c init_block_size_descriptor() before
+ * calling this function.
+ *
+ * @param[out] bsd The block size information structure to populate.
+ * @param can_omit_partitionings True if we can drop partitionings that astcenc won't use.
+ * @param partition_count_cutoff The partition count cutoff to use, if we can omit partitionings.
+ */
+void init_partition_tables(
+	block_size_descriptor& bsd,
+	bool can_omit_partitionings,
+	unsigned int partition_count_cutoff);
+
+/**
+ * @brief Get the percentile table for 2D block modes.
+ *
+ * This is an empirically determined prioritization of which block modes to use in the search in
+ * terms of their centile (lower centiles = more useful).
+ *
+ * Returns a dynamically allocated array; caller must free with delete[].
+ *
+ * @param xdim The block x size.
+ * @param ydim The block y size.
+ *
+ * @return The unpacked table.
+ */
+const float* get_2d_percentile_table(
+	unsigned int xdim,
+	unsigned int ydim);
+
+/**
+ * @brief Query if a 2D block size is legal.
+ *
+ * @return True if legal, false otherwise.
+ */
+bool is_legal_2d_block_size(
+	unsigned int xdim,
+	unsigned int ydim);
+
+/**
+ * @brief Query if a 3D block size is legal.
+ *
+ * @return True if legal, false otherwise.
+ */
+bool is_legal_3d_block_size(
+	unsigned int xdim,
+	unsigned int ydim,
+	unsigned int zdim);
+
+/* ============================================================================
+   Functionality for managing BISE quantization and unquantization.
+============================================================================ */
+
+/**
+ * @brief The precomputed table for quantizing color values.
+ *
+ * Converts an unquant value in the 0-255 range into a quant value in the 0-255 range.
+ * No BISE scrambling is applied at this stage.
+ *
+ * Indexed by [quant_mode - 4][data_value].
+ */
+extern const uint8_t color_unquant_to_uquant_tables[17][256];
+
+/**
+ * @brief The precomputed table for packing quantized color values.
+ *
+ * Converts a quant value in the 0-255 range into a packed quant value in the 0-N range,
+ * with BISE scrambling applied.
+ *
+ * Indexed by [quant_mode - 4][data_value].
+ */
+extern const uint8_t color_uquant_to_scrambled_pquant_tables[17][256];
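+
+// Illustrative sketch (not part of the upstream header): shows the intended
+// indexing of the two tables above when quantizing one color value. Names are
+// hypothetical; per the comments above, quant mode 4 owns the first table row,
+// hence the "- 4" row offset.
+static inline uint8_t quantize_and_pack_color(uint8_t value, int quant_mode)
+{
+	// Snap the 0-255 value to the nearest representable level (still 0-255).
+	uint8_t uquant = color_unquant_to_uquant_tables[quant_mode - 4][value];
+
+	// Convert to the scrambled 0-N index actually stored in the bitstream.
+	return color_uquant_to_scrambled_pquant_tables[quant_mode - 4][uquant];
+}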
+
+/**
+ * @brief The precomputed table for unpacking color values.
+ *
+ * Converts a quant value in the 0-N range into an unpacked value in the 0-255 range,
+ * with BISE unscrambling applied.
+ *
+ * Indexed by [quant_mode - 4][data_value].
+ */
+extern const uint8_t* color_scrambled_pquant_to_uquant_tables[17];
+
+/**
+ * @brief The precomputed quant mode storage table.
+ *
+ * Indexing by [integer_count/2][bits] gives us the quantization level for a given integer count and
+ * number of compressed storage bits. Returns -1 for cases where the requested integer count cannot
+ * ever fit in the supplied storage size.
+ */
+extern const int8_t quant_mode_table[10][128];
+
+/**
+ * @brief Encode a packed string using BISE.
+ *
+ * Note that BISE can return strings that are not a whole number of bytes in length, and ASTC can
+ * start storing strings in a block at arbitrary bit offsets in the encoded data.
+ *
+ * @param quant_level The BISE alphabet size.
+ * @param character_count The number of characters in the string.
+ * @param input_data The unpacked string, one byte per character.
+ * @param[in,out] output_data The output packed string.
+ * @param bit_offset The starting offset in the output storage.
+ */
+void encode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset);
+
+/**
+ * @brief Decode a packed string using BISE.
+ *
+ * Note that BISE input strings are not a whole number of bytes in length, and ASTC can start
+ * strings at arbitrary bit offsets in the encoded data.
+ *
+ * @param quant_level The BISE alphabet size.
+ * @param character_count The number of characters in the string.
+ * @param input_data The packed string.
+ * @param[in,out] output_data The output storage, one byte per character.
+ * @param bit_offset The starting offset in the input storage.
+ */
+void decode_ise(
+	quant_method quant_level,
+	unsigned int character_count,
+	const uint8_t* input_data,
+	uint8_t* output_data,
+	unsigned int bit_offset);
+
+/**
+ * @brief Return the number of bits needed to encode an ISE sequence.
+ *
+ * This implementation assumes that the @c quant level is untrusted, given it may come from random
+ * data being decompressed, so we return an arbitrary unencodable size if that is the case.
+ *
+ * @param character_count The number of items in the sequence.
+ * @param quant_level The desired quantization level.
+ *
+ * @return The number of bits needed to encode the BISE string.
+ */
+unsigned int get_ise_sequence_bitcount(
+	unsigned int character_count,
+	quant_method quant_level);
+
+/* ============================================================================
+   Functionality for managing color partitioning.
+============================================================================ */
+
+/**
+ * @brief Compute averages and dominant directions for each partition in a 2 component texture.
+ *
+ * @param pi The partition info for the current trial.
+ * @param blk The image block color data to be compressed.
+ * @param component1 The first component included in the analysis.
+ * @param component2 The second component included in the analysis.
+ * @param[out] pm The output partition metrics.
+ *                - Only pi.partition_count array entries actually get initialized.
+ *                - Direction vectors @c pm.dir are not normalized.
+ */
+void compute_avgs_and_dirs_2_comp(
+	const partition_info& pi,
+	const image_block& blk,
+	unsigned int component1,
+	unsigned int component2,
+	partition_metrics pm[BLOCK_MAX_PARTITIONS]);
+
+/**
+ * @brief Compute averages and dominant directions for each partition in a 3 component texture.
+ *
+ * @param pi The partition info for the current trial.
+ * @param blk The image block color data to be compressed. + * @param omitted_component The component excluded from the analysis. + * @param[out] pm The output partition metrics. + * - Only pi.partition_count array entries actually get initialized. + * - Direction vectors @c pm.dir are not normalized. + */ +void compute_avgs_and_dirs_3_comp( + const partition_info& pi, + const image_block& blk, + unsigned int omitted_component, + partition_metrics pm[BLOCK_MAX_PARTITIONS]); + +/** + * @brief Compute averages and dominant directions for each partition in a 3 component texture. + * + * This is a specialization of @c compute_avgs_and_dirs_3_comp where the omitted component is + * always alpha, a common case during partition search. + * + * @param pi The partition info for the current trial. + * @param blk The image block color data to be compressed. + * @param[out] pm The output partition metrics. + * - Only pi.partition_count array entries actually get initialized. + * - Direction vectors @c pm.dir are not normalized. + */ +void compute_avgs_and_dirs_3_comp_rgb( + const partition_info& pi, + const image_block& blk, + partition_metrics pm[BLOCK_MAX_PARTITIONS]); + +/** + * @brief Compute averages and dominant directions for each partition in a 4 component texture. + * + * @param pi The partition info for the current trial. + * @param blk The image block color data to be compressed. + * @param[out] pm The output partition metrics. + * - Only pi.partition_count array entries actually get initialized. + * - Direction vectors @c pm.dir are not normalized. + */ +void compute_avgs_and_dirs_4_comp( + const partition_info& pi, + const image_block& blk, + partition_metrics pm[BLOCK_MAX_PARTITIONS]); + +/** + * @brief Compute the RGB error for uncorrelated and same chroma projections. + * + * The output of compute averages and dirs is post processed to define two lines, both of which go + * through the mean-color-value. One line has a direction defined by the dominant direction; this + * is used to assess the error from using an uncorrelated color representation. The other line goes + * through (0,0,0) and is used to assess the error from using an RGBS color representation. + * + * This function computes the squared error when using these two representations. + * + * @param pi The partition info for the current trial. + * @param blk The image block color data to be compressed. + * @param[in,out] plines Processed line inputs, and line length outputs. + * @param[out] uncor_error The cumulative error for using the uncorrelated line. + * @param[out] samec_error The cumulative error for using the same chroma line. + */ +void compute_error_squared_rgb( + const partition_info& pi, + const image_block& blk, + partition_lines3 plines[BLOCK_MAX_PARTITIONS], + float& uncor_error, + float& samec_error); + +/** + * @brief Compute the RGBA error for uncorrelated and same chroma projections. + * + * The output of compute averages and dirs is post processed to define two lines, both of which go + * through the mean-color-value. One line has a direction defined by the dominant direction; this + * is used to assess the error from using an uncorrelated color representation. The other line goes + * through (0,0,0,1) and is used to assess the error from using an RGBS color representation. + * + * This function computes the squared error when using these two representations. + * + * @param pi The partition info for the current trial. + * @param blk The image block color data to be compressed. 
+ * @param uncor_plines Processed uncorrelated partition lines for each partition.
+ * @param samec_plines Processed same chroma partition lines for each partition.
+ * @param[out] uncor_lengths The length of each component's deviation from the line.
+ * @param[out] samec_lengths The length of each component's deviation from the line.
+ * @param[out] uncor_error The cumulative error for using the uncorrelated line.
+ * @param[out] samec_error The cumulative error for using the same chroma line.
+ */
+void compute_error_squared_rgba(
+	const partition_info& pi,
+	const image_block& blk,
+	const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS],
+	const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS],
+	float uncor_lengths[BLOCK_MAX_PARTITIONS],
+	float samec_lengths[BLOCK_MAX_PARTITIONS],
+	float& uncor_error,
+	float& samec_error);
+
+/**
+ * @brief Find the best set of partitions to trial for a given block.
+ *
+ * On return the @c best_partitions list will contain up to @c requested_candidates partition
+ * candidates, covering both the assumption that the data has uncorrelated chroma and the
+ * assumption that it has correlated chroma. The best candidate is returned first in the list.
+ *
+ * @param bsd The block size information.
+ * @param blk The image block color data to compress.
+ * @param partition_count The number of partitions in the block.
+ * @param partition_search_limit The number of candidate partition encodings to trial.
+ * @param[out] best_partitions The best partition candidates.
+ * @param requested_candidates The number of requested partitionings. May return fewer if
+ *        candidates are not available.
+ *
+ * @return The actual number of candidates returned.
+ */
+unsigned int find_best_partition_candidates(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int partition_count,
+	unsigned int partition_search_limit,
+	unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
+	unsigned int requested_candidates);
+
+/* ============================================================================
+   Functionality for managing images and image related data.
+============================================================================ */
+
+/**
+ * @brief Setup computation of regional averages in an image.
+ *
+ * This must be done by only a single thread per image, before any thread calls
+ * @c compute_averages().
+ *
+ * Results are written back into @c img->input_alpha_averages.
+ *
+ * @param img The input image data, also holds output data.
+ * @param alpha_kernel_radius The kernel radius (in pixels) for alpha mods.
+ * @param swz Input data component swizzle.
+ * @param[out] ag The average variance arguments to init.
+ *
+ * @return The number of tasks in the processing stage.
+ */
+unsigned int init_compute_averages(
+	const astcenc_image& img,
+	unsigned int alpha_kernel_radius,
+	const astcenc_swizzle& swz,
+	avg_args& ag);
+
+/**
+ * @brief Compute averages for a pixel region.
+ *
+ * The routine computes both in a single pass, using a summed-area table to decouple the running
+ * time from the averaging/variance kernel size.
+ *
+ * @param[out] ctx The compressor context storing the output data.
+ * @param arg The input parameter structure.
+ */
+void compute_pixel_region_variance(
+	astcenc_contexti& ctx,
+	const pixel_region_args& arg);
+
+/**
+ * @brief Load a single image block from the input image.
+ *
+ * @param decode_mode The compression color profile.
+ * @param img The input image data.
+ * @param[out] blk The image block to populate.
+ * @param bsd The block size information.
+ * @param xpos The block X coordinate in the input image.
+ * @param ypos The block Y coordinate in the input image.
+ * @param zpos The block Z coordinate in the input image.
+ * @param swz The swizzle to apply on load.
+ */
+void load_image_block(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz);
+
+/**
+ * @brief Load a single image block from the input image.
+ *
+ * This specialized variant can be used only if the block is 2D LDR U8 data,
+ * with no swizzle.
+ *
+ * @param decode_mode The compression color profile.
+ * @param img The input image data.
+ * @param[out] blk The image block to populate.
+ * @param bsd The block size information.
+ * @param xpos The block X coordinate in the input image.
+ * @param ypos The block Y coordinate in the input image.
+ * @param zpos The block Z coordinate in the input image.
+ * @param swz The swizzle to apply on load.
+ */
+void load_image_block_fast_ldr(
+	astcenc_profile decode_mode,
+	const astcenc_image& img,
+	image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz);
+
+/**
+ * @brief Store a single image block to the output image.
+ *
+ * @param[out] img The output image data.
+ * @param blk The image block to export.
+ * @param bsd The block size information.
+ * @param xpos The block X coordinate in the output image.
+ * @param ypos The block Y coordinate in the output image.
+ * @param zpos The block Z coordinate in the output image.
+ * @param swz The swizzle to apply on store.
+ */
+void store_image_block(
+	astcenc_image& img,
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	unsigned int xpos,
+	unsigned int ypos,
+	unsigned int zpos,
+	const astcenc_swizzle& swz);
+
+/* ============================================================================
+   Functionality for computing endpoint colors and weights for a block.
+============================================================================ */
+
+/**
+ * @brief Compute ideal endpoint colors and weights for 1 plane of weights.
+ *
+ * The ideal endpoints define a color line for the partition. For each texel the ideal weight
+ * defines an exact position on the partition color line. We can then use these to assess the error
+ * introduced by removing and quantizing the weight grid.
+ *
+ * @param blk The image block color data to compress.
+ * @param pi The partition info for the current trial.
+ * @param[out] ei The endpoint and weight values.
+ */
+void compute_ideal_colors_and_weights_1plane(
+	const image_block& blk,
+	const partition_info& pi,
+	endpoints_and_weights& ei);
+
+/**
+ * @brief Compute ideal endpoint colors and weights for 2 planes of weights.
+ *
+ * The ideal endpoints define a color line for the partition. For each texel the ideal weight
+ * defines an exact position on the partition color line. We can then use these to assess the error
+ * introduced by removing and quantizing the weight grid.
+ *
+ * @param bsd The block size information.
+ * @param blk The image block color data to compress.
+ * @param plane2_component The component assigned to plane 2.
+ * @param[out] ei1 The endpoint and weight values for plane 1.
+ * @param[out] ei2 The endpoint and weight values for plane 2.
+ */
+void compute_ideal_colors_and_weights_2planes(
+	const block_size_descriptor& bsd,
+	const image_block& blk,
+	unsigned int plane2_component,
+	endpoints_and_weights& ei1,
+	endpoints_and_weights& ei2);
+
+/**
+ * @brief Compute the optimal unquantized weights for a decimation table.
+ *
+ * After computing ideal weights for a complete weight grid, we want to compute the ideal weights
+ * for the case where weights exist only for some texels. We do this with a steepest-descent grid
+ * solver which works as follows:
+ *
+ * First, for each actual weight, perform a weighted averaging of the texels affected by the weight.
+ * Then, set step size to <some initial value> and attempt one step towards the original ideal
+ * weight if it helps to reduce error.
+ *
+ * @param ei The non-decimated endpoints and weights.
+ * @param di The selected weight decimation.
+ * @param[out] dec_weight_ideal_value The ideal values for the decimated weight set.
+ */
+void compute_ideal_weights_for_decimation(
+	const endpoints_and_weights& ei,
+	const decimation_info& di,
+	float* dec_weight_ideal_value);
+
+/**
+ * @brief Compute the optimal quantized weights for a decimation table.
+ *
+ * We test the two closest weight indices in the allowed quantization range and keep the weight
+ * that is the closest match.
+ *
+ * @param di The selected weight decimation.
+ * @param low_bound The lowest weight allowed.
+ * @param high_bound The highest weight allowed.
+ * @param dec_weight_ideal_value The ideal weight set.
+ * @param[out] dec_weight_quant_uvalue The output quantized weight as a float.
+ * @param[out] dec_weight_uquant The output quantized weight as encoded int.
+ * @param quant_level The desired weight quant level.
+ */
+void compute_quantized_weights_for_decimation(
+	const decimation_info& di,
+	float low_bound,
+	float high_bound,
+	const float* dec_weight_ideal_value,
+	float* dec_weight_quant_uvalue,
+	uint8_t* dec_weight_uquant,
+	quant_method quant_level);
+
+/**
+ * @brief Compute the error of a decimated weight set for 1 plane.
+ *
+ * After computing ideal weights for the case with one weight per texel, we want to compute the
+ * error for decimated weight grids where weights are stored at a lower resolution. This function
+ * computes the error of the reduced grid, compared to the full grid.
+ *
+ * @param eai The ideal weights for the full grid.
+ * @param di The selected weight decimation.
+ * @param dec_weight_quant_uvalue The quantized weights for the decimated grid.
+ *
+ * @return The accumulated error.
+ */
+float compute_error_of_weight_set_1plane(
+	const endpoints_and_weights& eai,
+	const decimation_info& di,
+	const float* dec_weight_quant_uvalue);
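+
+// Illustrative sketch (not part of the upstream header): the error metric the
+// compute_error_of_weight_set_* functions describe is conceptually a
+// scale-weighted squared difference between the ideal per-texel weights and
+// the weights reconstructed from the decimated grid. All names below are
+// hypothetical, and the arrays are plain inputs rather than the real
+// decimation_info structure.
+static inline float weight_set_error_sketch(
+	const float* ideal_weights,  // ideal weight per texel
+	const float* reconstructed,  // weight per texel after decimation
+	const float* error_scale,    // per-texel error significance
+	unsigned int texel_count)
+{
+	float error_sum = 0.0f;
+	for (unsigned int i = 0; i < texel_count; i++)
+	{
+		float diff = ideal_weights[i] - reconstructed[i];
+		error_sum += diff * diff * error_scale[i];
+	}
+	return error_sum;
+}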
+
+/**
+ * @brief Compute the error of a decimated weight set for 2 planes.
+ *
+ * After computing ideal weights for the case with one weight per texel, we want to compute the
+ * error for decimated weight grids where weights are stored at a lower resolution. This function
+ * computes the error of the reduced grid, compared to the full grid.
+ *
+ * @param eai1 The ideal weights for the full grid and plane 1.
+ * @param eai2 The ideal weights for the full grid and plane 2.
+ * @param di The selected weight decimation.
+ * @param dec_weight_quant_uvalue_plane1 The quantized weights for the decimated grid plane 1.
+ * @param dec_weight_quant_uvalue_plane2 The quantized weights for the decimated grid plane 2.
+ *
+ * @return The accumulated error.
+ */
+float compute_error_of_weight_set_2planes(
+	const endpoints_and_weights& eai1,
+	const endpoints_and_weights& eai2,
+	const decimation_info& di,
+	const float* dec_weight_quant_uvalue_plane1,
+	const float* dec_weight_quant_uvalue_plane2);
+
+/**
+ * @brief Pack a single pair of color endpoints as effectively as possible.
+ *
+ * The user requests a base color endpoint mode in @c format, but the quantizer may choose a
+ * delta-based representation. It will report back the format variant it actually used.
+ *
+ * @param color0 The input unquantized color0 endpoint for absolute endpoint pairs.
+ * @param color1 The input unquantized color1 endpoint for absolute endpoint pairs.
+ * @param rgbs_color The input unquantized RGBS variant endpoint for same chroma endpoints.
+ * @param rgbo_color The input unquantized RGBO variant endpoint for HDR endpoints.
+ * @param format The desired base format.
+ * @param[out] output The output storage for the quantized colors.
+ * @param quant_level The quantization level requested.
+ *
+ * @return The actual endpoint mode used.
+ */
+uint8_t pack_color_endpoints(
+	vfloat4 color0,
+	vfloat4 color1,
+	vfloat4 rgbs_color,
+	vfloat4 rgbo_color,
+	int format,
+	uint8_t* output,
+	quant_method quant_level);
+
+/**
+ * @brief Unpack a single pair of encoded endpoints.
+ *
+ * Endpoints must be unscrambled and converted into the 0-255 range before calling this function.
+ *
+ * @param decode_mode The decode mode (LDR, HDR).
+ * @param format The color endpoint mode used.
+ * @param input The raw array of encoded input integers. The length of this array
+ *        depends on @c format; it can be safely assumed to be large enough.
+ * @param[out] rgb_hdr Is the endpoint using HDR for the RGB channels?
+ * @param[out] alpha_hdr Is the endpoint using HDR for the A channel?
+ * @param[out] output0 The output color for endpoint 0.
+ * @param[out] output1 The output color for endpoint 1.
+ */
+void unpack_color_endpoints(
+	astcenc_profile decode_mode,
+	int format,
+	const uint8_t* input,
+	bool& rgb_hdr,
+	bool& alpha_hdr,
+	vint4& output0,
+	vint4& output1);
+
+/**
+ * @brief Unpack a set of quantized and decimated weights.
+ *
+ * TODO: Can we skip this for non-decimated weights now that the @c scb is
+ * already storing unquantized weights?
+ *
+ * @param bsd The block size information.
+ * @param scb The symbolic compressed encoding.
+ * @param di The weight grid decimation table.
+ * @param is_dual_plane @c true if this is a dual plane block, @c false otherwise.
+ * @param[out] weights_plane1 The output array for storing the plane 1 weights.
+ * @param[out] weights_plane2 The output array for storing the plane 2 weights.
+ */
+void unpack_weights(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const decimation_info& di,
+	bool is_dual_plane,
+	int weights_plane1[BLOCK_MAX_TEXELS],
+	int weights_plane2[BLOCK_MAX_TEXELS]);
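+
+// Illustrative sketch (not part of the upstream header): shows the call shape
+// for pack_color_endpoints() as declared above. The endpoint colors, the base
+// format value, and the quant level are hypothetical placeholders; the rgbs
+// and rgbo inputs are only consumed when the packer elects a same-chroma or
+// HDR variant of the requested base format.
+static inline uint8_t pack_endpoints_sketch(uint8_t packed[8])
+{
+	vfloat4 color0(0.0f, 0.0f, 0.0f, 65535.0f);              // dark endpoint
+	vfloat4 color1(65535.0f, 65535.0f, 65535.0f, 65535.0f);  // bright endpoint
+	vfloat4 rgbs(65535.0f, 65535.0f, 65535.0f, 1.0f);        // same-chroma candidate
+	vfloat4 rgbo(65535.0f, 65535.0f, 65535.0f, 0.0f);        // HDR candidate
+
+	int base_format = 8; // placeholder base color endpoint mode
+
+	// The return value is the format variant actually chosen by the packer.
+	return pack_color_endpoints(color0, color1, rgbs, rgbo, base_format,
+	                            packed, QUANT_256);
+}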
+
+/**
+ * @brief Identify, for each mode, which set of color endpoint formats produces the best result.
+ *
+ * Returns the @c tune_candidate_limit best looking modes, along with the ideal color encoding
+ * combination for each. The modified quantization level can be used when all formats are the same,
+ * as this frees up two additional bits of storage.
+ *
+ * @param pi The partition info for the current trial.
+ * @param blk The image block color data to compress.
+ * @param ep The ideal endpoints.
+ * @param qwt_bitcounts Bit counts for different quantization methods.
+ * @param qwt_errors Errors for different quantization methods.
+ * @param tune_candidate_limit The max number of candidates to return, may be less.
+ * @param start_block_mode The first block mode to inspect.
+ * @param end_block_mode The last block mode to inspect.
+ * @param[out] partition_format_specifiers The best formats per partition.
+ * @param[out] block_mode The best packed block mode indexes.
+ * @param[out] quant_level The best color quant level.
+ * @param[out] quant_level_mod The best color quant level if endpoints are the same.
+ * @param[out] tmpbuf Preallocated scratch buffers for the compressor.
+ *
+ * @return The actual number of candidate matches returned.
+ */
+unsigned int compute_ideal_endpoint_formats(
+	const partition_info& pi,
+	const image_block& blk,
+	const endpoints& ep,
+	const int8_t* qwt_bitcounts,
+	const float* qwt_errors,
+	unsigned int tune_candidate_limit,
+	unsigned int start_block_mode,
+	unsigned int end_block_mode,
+	uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
+	int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
+	quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
+	quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
+	compression_working_buffers& tmpbuf);
+
+/**
+ * @brief For a given 1 plane weight set recompute the endpoint colors.
+ *
+ * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
+ * recompute the ideal colors for a specific weight set.
+ *
+ * @param blk The image block color data to compress.
+ * @param pi The partition info for the current trial.
+ * @param di The weight grid decimation table.
+ * @param dec_weights_uquant The quantized weight set.
+ * @param[in,out] ep The color endpoints (modified in place).
+ * @param[out] rgbs_vectors The RGB+scale vectors for LDR blocks.
+ * @param[out] rgbo_vectors The RGB+offset vectors for HDR blocks.
+ */
+void recompute_ideal_colors_1plane(
+	const image_block& blk,
+	const partition_info& pi,
+	const decimation_info& di,
+	const uint8_t* dec_weights_uquant,
+	endpoints& ep,
+	vfloat4 rgbs_vectors[BLOCK_MAX_PARTITIONS],
+	vfloat4 rgbo_vectors[BLOCK_MAX_PARTITIONS]);
+
+/**
+ * @brief For a given 2 plane weight set recompute the endpoint colors.
+ *
+ * As we quantize and decimate weights the optimal endpoint colors may change slightly, so we must
+ * recompute the ideal colors for a specific weight set.
+ *
+ * @param blk The image block color data to compress.
+ * @param bsd The block size descriptor.
+ * @param di The weight grid decimation table.
+ * @param dec_weights_uquant_plane1 The quantized weight set for plane 1.
+ * @param dec_weights_uquant_plane2 The quantized weight set for plane 2.
+ * @param[in,out] ep The color endpoints (modified in place).
+ * @param[out] rgbs_vector The RGB+scale color for LDR blocks.
+ * @param[out] rgbo_vector The RGB+offset color for HDR blocks.
+ * @param plane2_component The component assigned to plane 2.
+ */
+void recompute_ideal_colors_2planes(
+	const image_block& blk,
+	const block_size_descriptor& bsd,
+	const decimation_info& di,
+	const uint8_t* dec_weights_uquant_plane1,
+	const uint8_t* dec_weights_uquant_plane2,
+	endpoints& ep,
+	vfloat4& rgbs_vector,
+	vfloat4& rgbo_vector,
+	int plane2_component);
+
+/**
+ * @brief Expand the angular tables needed for the alternative to PCA that we use.
+ */
+void prepare_angular_tables();
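+
+#include <mutex>
+
+// Illustrative sketch (not part of the upstream header): since
+// prepare_angular_tables() expands shared lookup tables, a host would
+// typically run it exactly once before any compression work starts. Guarding
+// it with std::call_once is one way to do that from multi-threaded init
+// paths; the library itself presumably performs this during context creation.
+static void ensure_angular_tables_ready()
+{
+	static std::once_flag init_flag;
+	std::call_once(init_flag, prepare_angular_tables);
+}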
+
+/**
+ * @brief Compute the angular endpoints for one plane for each block mode.
+ *
+ * @param only_always Only consider block modes that are always enabled.
+ * @param bsd The block size descriptor for the current trial.
+ * @param dec_weight_ideal_value The ideal decimated unquantized weight values.
+ * @param max_weight_quant The maximum block mode weight quantization allowed.
+ * @param[out] tmpbuf Preallocated scratch buffers for the compressor.
+ */
+void compute_angular_endpoints_1plane(
+	bool only_always,
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf);
+
+/**
+ * @brief Compute the angular endpoints for two planes for each block mode.
+ *
+ * @param bsd The block size descriptor for the current trial.
+ * @param dec_weight_ideal_value The ideal decimated unquantized weight values.
+ * @param max_weight_quant The maximum block mode weight quantization allowed.
+ * @param[out] tmpbuf Preallocated scratch buffers for the compressor.
+ */
+void compute_angular_endpoints_2planes(
+	const block_size_descriptor& bsd,
+	const float* dec_weight_ideal_value,
+	unsigned int max_weight_quant,
+	compression_working_buffers& tmpbuf);
+
+/* ============================================================================
+   Functionality for high level compression and decompression access.
+============================================================================ */
+
+/**
+ * @brief Compress an image block into a physical block.
+ *
+ * @param ctx The compressor context and configuration.
+ * @param blk The image block color data to compress.
+ * @param[out] pcb The physical compressed block output.
+ * @param[out] tmpbuf Preallocated scratch buffers for the compressor.
+ */
+void compress_block(
+	const astcenc_contexti& ctx,
+	const image_block& blk,
+	physical_compressed_block& pcb,
+	compression_working_buffers& tmpbuf);
+
+/**
+ * @brief Decompress a symbolic block into an image block.
+ *
+ * @param decode_mode The decode mode (LDR, HDR, etc).
+ * @param bsd The block size information.
+ * @param xpos The X coordinate of the block in the overall image.
+ * @param ypos The Y coordinate of the block in the overall image.
+ * @param zpos The Z coordinate of the block in the overall image.
+ * @param scb The symbolic compressed encoding to decompress.
+ * @param[out] blk The decompressed image block color data.
+ */
+void decompress_symbolic_block(
+	astcenc_profile decode_mode,
+	const block_size_descriptor& bsd,
+	int xpos,
+	int ypos,
+	int zpos,
+	const symbolic_compressed_block& scb,
+	image_block& blk);
+
+/**
+ * @brief Compute the error between a symbolic block and the original input data.
+ *
+ * This function is specialized for 2 plane and 1 partition search.
+ *
+ * In RGBM mode this will reject blocks that attempt to encode a zero M value.
+ *
+ * @param config The compressor config.
+ * @param bsd The block size information.
+ * @param scb The symbolic compressed encoding.
+ * @param blk The original image block color data.
+ *
+ * @return Returns the computed error, or a negative value if the encoding
+ *         should be rejected for any reason.
+ */
+float compute_symbolic_block_difference_2plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk);
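+
+// Illustrative sketch (not part of the upstream header): the difference
+// functions above return a negative error to signal that a candidate must be
+// rejected, so a trial loop keeps the best non-negative error seen so far.
+// The helper name is hypothetical.
+static inline bool accept_candidate(float trial_error, float& best_error)
+{
+	// Negative errors flag rejected encodings (e.g. a zero M value in RGBM).
+	if (trial_error < 0.0f || trial_error >= best_error)
+	{
+		return false;
+	}
+
+	best_error = trial_error;
+	return true;
+}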
+
+/**
+ * @brief Compute the error between a symbolic block and the original input data.
+ *
+ * This function is specialized for 1 plane and N partition search.
+ *
+ * In RGBM mode this will reject blocks that attempt to encode a zero M value.
+ *
+ * @param config The compressor config.
+ * @param bsd The block size information.
+ * @param scb The symbolic compressed encoding.
+ * @param blk The original image block color data.
+ *
+ * @return Returns the computed error, or a negative value if the encoding
+ *         should be rejected for any reason.
+ */
+float compute_symbolic_block_difference_1plane(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk);
+
+/**
+ * @brief Compute the error between a symbolic block and the original input data.
+ *
+ * This function is specialized for 1 plane and 1 partition search.
+ *
+ * In RGBM mode this will reject blocks that attempt to encode a zero M value.
+ *
+ * @param config The compressor config.
+ * @param bsd The block size information.
+ * @param scb The symbolic compressed encoding.
+ * @param blk The original image block color data.
+ *
+ * @return Returns the computed error, or a negative value if the encoding
+ *         should be rejected for any reason.
+ */
+float compute_symbolic_block_difference_1plane_1partition(
+	const astcenc_config& config,
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	const image_block& blk);
+
+/**
+ * @brief Convert a symbolic representation into a binary physical encoding.
+ *
+ * It is assumed that the symbolic encoding is valid and encodable, or
+ * previously flagged as an error block if an error color is to be encoded.
+ *
+ * @param bsd The block size information.
+ * @param scb The symbolic representation.
+ * @param[out] pcb The binary encoded data.
+ */
+void symbolic_to_physical(
+	const block_size_descriptor& bsd,
+	const symbolic_compressed_block& scb,
+	physical_compressed_block& pcb);
+
+/**
+ * @brief Convert a binary physical encoding into a symbolic representation.
+ *
+ * This function can cope with arbitrary input data; output blocks will be
+ * flagged as an error block if the encoding is invalid.
+ *
+ * @param bsd The block size information.
+ * @param pcb The binary encoded data.
+ * @param[out] scb The output symbolic representation.
+ */
+void physical_to_symbolic(
+	const block_size_descriptor& bsd,
+	const physical_compressed_block& pcb,
+	symbolic_compressed_block& scb);
+
+/* ============================================================================
+   Platform-specific functions.
+============================================================================ */
+
+/**
+ * @brief Run-time detection if the host CPU supports the POPCNT extension.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+bool cpu_supports_popcnt();
+
+/**
+ * @brief Run-time detection if the host CPU supports the F16C extension.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+bool cpu_supports_f16c();
+
+/**
+ * @brief Run-time detection if the host CPU supports the SSE 4.1 extension.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+bool cpu_supports_sse41();
+
+/**
+ * @brief Run-time detection if the host CPU supports the AVX 2 extension.
+ *
+ * @return @c true if supported, @c false if not.
+ */
+bool cpu_supports_avx2();
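+
+// Illustrative sketch (not part of the upstream header): the detection
+// helpers above support run-time dispatch, e.g. picking a wider SIMD path
+// only when the host CPU actually offers it. The path names here are
+// hypothetical.
+enum class simd_path { scalar, sse41, avx2 };
+
+static inline simd_path select_simd_path()
+{
+	if (cpu_supports_avx2())
+	{
+		return simd_path::avx2;
+	}
+
+	if (cpu_supports_sse41())
+	{
+		return simd_path::sse41;
+	}
+
+	return simd_path::scalar;
+}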
+
+/**
+ * @brief Allocate an aligned memory buffer.
+ *
+ * Allocated memory must be freed by @c aligned_free().
+ *
+ * @param size The desired buffer size.
+ * @param align The desired buffer alignment; must be 2^N.
+ *
+ * @return The memory buffer pointer or nullptr on allocation failure.
+ */
+template<typename T>
+T* aligned_malloc(size_t size, size_t align)
+{
+	void* ptr;
+	int error = 0;
+
+#if defined(_WIN32)
+	ptr = _aligned_malloc(size, align);
+#else
+	error = posix_memalign(&ptr, align, size);
+#endif
+
+	if (error || (!ptr))
+	{
+		return nullptr;
+	}
+
+	return static_cast<T*>(ptr);
+}
+
+/**
+ * @brief Free an aligned memory buffer.
+ *
+ * @param ptr The buffer to free.
+ */
+template<typename T>
+void aligned_free(T* ptr)
+{
+#if defined(_WIN32)
+	_aligned_free(reinterpret_cast<void*>(ptr));
+#else
+	free(reinterpret_cast<void*>(ptr));
+#endif
+}
+
+#endif
diff --git a/thirdparty/astcenc/astcenc_internal_entry.h b/thirdparty/astcenc/astcenc_internal_entry.h
new file mode 100644
index 0000000000..4e8794547a
--- /dev/null
+++ b/thirdparty/astcenc/astcenc_internal_entry.h
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Functions and data declarations for the outer context.
+ *
+ * The outer context includes thread-pool management, which is slower to
+ * compile due to increased use of C++ stdlib. The inner context used in the
+ * majority of the codec library does not include this.
+ */
+
+#ifndef ASTCENC_INTERNAL_ENTRY_INCLUDED
+#define ASTCENC_INTERNAL_ENTRY_INCLUDED
+
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+
+#include "astcenc_internal.h"
+
+/* ============================================================================
+   Parallel execution control
+============================================================================ */
+
+/**
+ * @brief A simple counter-based manager for parallel task execution.
+ *
+ * The task processing execution consists of:
+ *
+ *     * A single-threaded init stage.
+ *     * A multi-threaded processing stage.
+ *     * A condition variable so threads can wait for processing completion.
+ *
+ * The init stage will be executed by the first thread to arrive in the critical section; there is
+ * no main thread in the thread pool.
+ *
+ * The processing stage uses dynamic dispatch to assign task tickets to threads on an on-demand
+ * basis. Threads may therefore each execute different numbers of tasks, depending on their
+ * processing complexity. The task queue and the task tickets are just counters; the caller must map
+ * these integers to an actual processing partition in a specific problem domain.
+ *
+ * The exit wait condition is needed to ensure processing has finished before a worker thread can
+ * progress to the next stage of the pipeline. Specifically a worker may exit the processing stage
+ * because there are no new tasks to assign to it while other worker threads are still processing.
+ * Calling @c wait() will ensure that all other workers have finished before the thread can proceed.
+ *
+ * The basic usage model:
+ *
+ *     // --------- From single-threaded code ---------
+ *
+ *     // Reset the tracker state
+ *     manager->reset()
+ *
+ *     // --------- From multi-threaded code ---------
+ *
+ *     // Run the stage init; only first thread actually runs the lambda
+ *     manager->init(<lambda>)
+ *
+ *     do
+ *     {
+ *         // Request a task assignment
+ *         uint task_count;
+ *         uint base_index = manager->get_task_assignment(<granule>, task_count);
+ *
+ *         // Process any tasks we were given (task_count <= granule size)
+ *         if (task_count)
+ *         {
+ *             // Run the user task processing code for N tasks here
+ *             ...
+ *
+ *             // Flag these tasks as complete
+ *             manager->complete_task_assignment(task_count);
+ *         }
+ *     } while (task_count);
+ *
+ *     // Wait for all threads to complete tasks before progressing
+ *     manager->wait()
+ *
+ *     // Run the stage term; only first thread actually runs the lambda
+ *     manager->term(<lambda>)
+ */
+class ParallelManager
+{
+private:
+	/** @brief Lock used for critical section and condition synchronization. */
+	std::mutex m_lock;
+
+	/** @brief True if the stage init() step has been executed. */
+	bool m_init_done;
+
+	/** @brief True if the stage term() step has been executed. */
+	bool m_term_done;
+
+	/** @brief Condition variable for tracking stage processing completion. */
+	std::condition_variable m_complete;
+
+	/** @brief Number of tasks started, but not necessarily finished. */
+	std::atomic<unsigned int> m_start_count;
+
+	/** @brief Number of tasks finished. */
+	unsigned int m_done_count;
+
+	/** @brief Number of tasks that need to be processed. */
+	unsigned int m_task_count;
+
+public:
+	/** @brief Create a new ParallelManager. */
+	ParallelManager()
+	{
+		reset();
+	}
+
+	/**
+	 * @brief Reset the tracker for a new processing batch.
+	 *
+	 * This must be called from single-threaded code before starting the multi-threaded processing
+	 * operations.
+	 */
+	void reset()
+	{
+		m_init_done = false;
+		m_term_done = false;
+		m_start_count = 0;
+		m_done_count = 0;
+		m_task_count = 0;
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage init step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * initialization. Other threads will block and wait for it to complete.
+	 *
+	 * @param init_func Callable which executes the stage initialization. It must return the
+	 *                  total number of tasks in the stage.
+	 */
+	void init(std::function<unsigned int(void)> init_func)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_init_done)
+		{
+			m_task_count = init_func();
+			m_init_done = true;
+		}
+	}
+
+	/**
+	 * @brief Trigger the pipeline stage init step.
+	 *
+	 * This can be called from multi-threaded code. The first thread to hit this will process the
+	 * initialization. Other threads will block and wait for it to complete.
+	 *
+	 * @param task_count Total number of tasks needing processing.
+	 */
+	void init(unsigned int task_count)
+	{
+		std::lock_guard<std::mutex> lck(m_lock);
+		if (!m_init_done)
+		{
+			m_task_count = task_count;
+			m_init_done = true;
+		}
+	}
+
+	/**
+	 * @brief Request a task assignment.
+	 *
+	 * Assign up to @c granule tasks to the caller for processing.
+	 *
+	 * @param granule Maximum number of tasks that can be assigned.
+	 * @param[out] count Actual number of tasks assigned, or zero if no tasks were assigned.
+	 *
+	 * @return Task index of the first assigned task; assigned tasks increment from this.
+ */ + unsigned int get_task_assignment(unsigned int granule, unsigned int& count) + { + unsigned int base = m_start_count.fetch_add(granule, std::memory_order_relaxed); + if (base >= m_task_count) + { + count = 0; + return 0; + } + + count = astc::min(m_task_count - base, granule); + return base; + } + + /** + * @brief Complete a task assignment. + * + * Mark @c count tasks as complete. This will notify all threads blocked on @c wait() if this + * completes the processing of the stage. + * + * @param count The number of completed tasks. + */ + void complete_task_assignment(unsigned int count) + { + // Note: m_done_count cannot use an atomic without the mutex; this has a race between the + // update here and the wait() for other threads + std::unique_lock<std::mutex> lck(m_lock); + this->m_done_count += count; + if (m_done_count == m_task_count) + { + lck.unlock(); + m_complete.notify_all(); + } + } + + /** + * @brief Wait for stage processing to complete. + */ + void wait() + { + std::unique_lock<std::mutex> lck(m_lock); + m_complete.wait(lck, [this]{ return m_done_count == m_task_count; }); + } + + /** + * @brief Trigger the pipeline stage term step. + * + * This can be called from multi-threaded code. The first thread to hit this will process the + * work pool termination. Caller must have called @c wait() prior to calling this function to + * ensure that processing is complete. + * + * @param term_func Callable which executes the stage termination. + */ + void term(std::function<void(void)> term_func) + { + std::lock_guard<std::mutex> lck(m_lock); + if (!m_term_done) + { + term_func(); + m_term_done = true; + } + } +}; + +/** + * @brief The astcenc compression context. + */ +struct astcenc_context +{ + /** @brief The context internal state. */ + astcenc_contexti context; + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + /** @brief The parallel manager for averages computation. */ + ParallelManager manage_avg; + + /** @brief The parallel manager for compression. */ + ParallelManager manage_compress; +#endif + + /** @brief The parallel manager for decompression. */ + ParallelManager manage_decompress; +}; + +#endif diff --git a/thirdparty/astcenc/astcenc_mathlib.cpp b/thirdparty/astcenc/astcenc_mathlib.cpp new file mode 100644 index 0000000000..f276ac7e3d --- /dev/null +++ b/thirdparty/astcenc/astcenc_mathlib.cpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#include "astcenc_mathlib.h" + +/** + * @brief 64-bit rotate left. + * + * @param val The value to rotate. + * @param count The rotation, in bits. + */ +static inline uint64_t rotl(uint64_t val, int count) +{ + return (val << count) | (val >> (64 - count)); +} + +/* See header for documentation. 
*/ +void astc::rand_init(uint64_t state[2]) +{ + state[0] = 0xfaf9e171cea1ec6bULL; + state[1] = 0xf1b318cc06af5d71ULL; +} + +/* See header for documentation. */ +uint64_t astc::rand(uint64_t state[2]) +{ + uint64_t s0 = state[0]; + uint64_t s1 = state[1]; + uint64_t res = s0 + s1; + s1 ^= s0; + state[0] = rotl(s0, 24) ^ s1 ^ (s1 << 16); + state[1] = rotl(s1, 37); + return res; +} diff --git a/thirdparty/astcenc/astcenc_mathlib.h b/thirdparty/astcenc/astcenc_mathlib.h new file mode 100644 index 0000000000..0540c4fedd --- /dev/null +++ b/thirdparty/astcenc/astcenc_mathlib.h @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/* + * This module implements a variety of mathematical data types and library + * functions used by the codec. + */ + +#ifndef ASTC_MATHLIB_H_INCLUDED +#define ASTC_MATHLIB_H_INCLUDED + +#include <cassert> +#include <cstdint> +#include <cmath> + +#ifndef ASTCENC_POPCNT + #if defined(__POPCNT__) + #define ASTCENC_POPCNT 1 + #else + #define ASTCENC_POPCNT 0 + #endif +#endif + +#ifndef ASTCENC_F16C + #if defined(__F16C__) + #define ASTCENC_F16C 1 + #else + #define ASTCENC_F16C 0 + #endif +#endif + +#ifndef ASTCENC_SSE + #if defined(__SSE4_2__) + #define ASTCENC_SSE 42 + #elif defined(__SSE4_1__) + #define ASTCENC_SSE 41 + #elif defined(__SSE2__) + #define ASTCENC_SSE 20 + #else + #define ASTCENC_SSE 0 + #endif +#endif + +#ifndef ASTCENC_AVX + #if defined(__AVX2__) + #define ASTCENC_AVX 2 + #elif defined(__AVX__) + #define ASTCENC_AVX 1 + #else + #define ASTCENC_AVX 0 + #endif +#endif + +#ifndef ASTCENC_NEON + #if defined(__aarch64__) + #define ASTCENC_NEON 1 + #else + #define ASTCENC_NEON 0 + #endif +#endif + +#if ASTCENC_AVX + #define ASTCENC_VECALIGN 32 +#else + #define ASTCENC_VECALIGN 16 +#endif + +#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 || ASTCENC_POPCNT != 0 + #include <immintrin.h> +#endif + +/* ============================================================================ + Fast math library; note that many of the higher-order functions in this set + use approximations which are less accurate, but faster, than <cmath> standard + library equivalents. + + Note: Many of these are not necessarily faster than simple C versions when + used on a single scalar value, but are included for testing purposes as most + have an option based on SSE intrinsics and therefore provide an obvious route + to future vectorization. +============================================================================ */ + +// Union for manipulation of float bit patterns +typedef union +{ + uint32_t u; + int32_t s; + float f; +} if32; + +// These are namespaced to avoid colliding with C standard library functions. 
+namespace astc
+{
+
+static const float PI = 3.14159265358979323846f;
+static const float PI_OVER_TWO = 1.57079632679489661923f;
+
+/**
+ * @brief SP float absolute value.
+ *
+ * @param v The value to make absolute.
+ *
+ * @return The absolute value.
+ */
+static inline float fabs(float v)
+{
+	return std::fabs(v);
+}
+
+/**
+ * @brief Test if a float value is a NaN.
+ *
+ * @param v The value to test.
+ *
+ * @return Zero if not a NaN, non-zero otherwise.
+ */
+static inline bool isnan(float v)
+{
+	return v != v;
+}
+
+/**
+ * @brief Return the minimum of two values.
+ *
+ * For floats, NaNs are turned into @c q.
+ *
+ * @param p The first value to compare.
+ * @param q The second value to compare.
+ *
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q)
+{
+	return p < q ? p : q;
+}
+
+/**
+ * @brief Return the minimum of three values.
+ *
+ * For floats, NaNs are turned into @c r.
+ *
+ * @param p The first value to compare.
+ * @param q The second value to compare.
+ * @param r The third value to compare.
+ *
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q, T r)
+{
+	return min(min(p, q), r);
+}
+
+/**
+ * @brief Return the minimum of four values.
+ *
+ * For floats, NaNs are turned into @c s.
+ *
+ * @param p The first value to compare.
+ * @param q The second value to compare.
+ * @param r The third value to compare.
+ * @param s The fourth value to compare.
+ *
+ * @return The smallest value.
+ */
+template<typename T>
+static inline T min(T p, T q, T r, T s)
+{
+	return min(min(p, q), min(r, s));
+}
+
+/**
+ * @brief Return the maximum of two values.
+ *
+ * For floats, NaNs are turned into @c q.
+ *
+ * @param p The first value to compare.
+ * @param q The second value to compare.
+ *
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q)
+{
+	return p > q ? p : q;
+}
+
+/**
+ * @brief Return the maximum of three values.
+ *
+ * For floats, NaNs are turned into @c r.
+ *
+ * @param p The first value to compare.
+ * @param q The second value to compare.
+ * @param r The third value to compare.
+ *
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q, T r)
+{
+	return max(max(p, q), r);
+}
+
+/**
+ * @brief Return the maximum of four values.
+ *
+ * For floats, NaNs are turned into @c s.
+ *
+ * @param p The first value to compare.
+ * @param q The second value to compare.
+ * @param r The third value to compare.
+ * @param s The fourth value to compare.
+ *
+ * @return The largest value.
+ */
+template<typename T>
+static inline T max(T p, T q, T r, T s)
+{
+	return max(max(p, q), max(r, s));
+}
+
+/**
+ * @brief Clamp a value between @c mn and @c mx.
+ *
+ * For floats, NaNs are turned into @c mn.
+ *
+ * @param v The value to clamp.
+ * @param mn The min value (inclusive).
+ * @param mx The max value (inclusive).
+ *
+ * @return The clamped value.
+ */
+template<typename T>
+inline T clamp(T v, T mn, T mx)
+{
+	// Do not reorder; correct NaN handling relies on the fact that comparison
+	// with NaN returns false and will fall-through to the "min" value.
+	if (v > mx) return mx;
+	if (v > mn) return v;
+	return mn;
+}
+
+/**
+ * @brief Clamp a float value between 0.0f and 1.0f.
+ *
+ * NaNs are turned into 0.0f.
+ *
+ * @param v The value to clamp.
+ *
+ * @return The clamped value.
+ */
+static inline float clamp1f(float v)
+{
+	return astc::clamp(v, 0.0f, 1.0f);
+}
+
+/**
+ * @brief Clamp a float value between 0.0f and 255.0f.
+ *
+ * NaNs are turned into 0.0f.
+ *
+ * @param v The value to clamp.
+ *
+ * @return The clamped value.
+ */
+static inline float clamp255f(float v)
+{
+	return astc::clamp(v, 0.0f, 255.0f);
+}
+
+/**
+ * @brief SP float round-down.
+ *
+ * @param v The value to round.
+ *
+ * @return The rounded value.
+ */
+static inline float flt_rd(float v)
+{
+	return std::floor(v);
+}
+
+/**
+ * @brief SP float round-to-nearest and convert to integer.
+ *
+ * @param v The value to round.
+ *
+ * @return The rounded value.
+ */
+static inline int flt2int_rtn(float v)
+{
+	return static_cast<int>(v + 0.5f);
+}
+
+/**
+ * @brief SP float round down and convert to integer.
+ *
+ * @param v The value to round.
+ *
+ * @return The rounded value.
+ */
+static inline int flt2int_rd(float v)
+{
+	return static_cast<int>(v);
+}
+
+/**
+ * @brief SP float bit-interpreted as an integer.
+ *
+ * @param v The value to bitcast.
+ *
+ * @return The converted value.
+ */
+static inline int float_as_int(float v)
+{
+	union { int a; float b; } u;
+	u.b = v;
+	return u.a;
+}
+
+/**
+ * @brief Integer bit-interpreted as an SP float.
+ *
+ * @param v The value to bitcast.
+ *
+ * @return The converted value.
+ */
+static inline float int_as_float(int v)
+{
+	union { int a; float b; } u;
+	u.a = v;
+	return u.b;
+}
+
+/**
+ * @brief Fast approximation of 1.0 / sqrt(val).
+ *
+ * @param v The input value.
+ *
+ * @return The approximated result.
+ */
+static inline float rsqrt(float v)
+{
+	return 1.0f / std::sqrt(v);
+}
+
+/**
+ * @brief Fast approximation of sqrt(val).
+ *
+ * @param v The input value.
+ *
+ * @return The approximated result.
+ */
+static inline float sqrt(float v)
+{
+	return std::sqrt(v);
+}
+
+/**
+ * @brief Extract mantissa and exponent of a float value.
+ *
+ * @param v The input value.
+ * @param[out] expo The output exponent.
+ *
+ * @return The mantissa.
+ */
+static inline float frexp(float v, int* expo)
+{
+	if32 p;
+	p.f = v;
+	*expo = ((p.u >> 23) & 0xFF) - 126;
+	p.u = (p.u & 0x807fffff) | 0x3f000000;
+	return p.f;
+}
+
+/**
+ * @brief Initialize the seed structure for a random number generator.
+ *
+ * Important note: For the purposes of ASTC we want sets of random numbers to
+ * use in the codec, but we want the same seed value across instances and
+ * threads to ensure that image output is stable across compressor runs and
+ * across platforms. Every PRNG created by this call will therefore return the
+ * same sequence of values ...
+ *
+ * @param state The state structure to initialize.
+ */
+void rand_init(uint64_t state[2]);
+
+/**
+ * @brief Return the next random number from the generator.
+ *
+ * This RNG is an implementation of the "xoroshiro128+ 1.0" PRNG, based on the
+ * public-domain implementation given by David Blackman & Sebastiano Vigna at
+ * http://vigna.di.unimi.it/xorshift/xoroshiro128plus.c
+ *
+ * @param state The state structure to use/update.
+ */
+uint64_t rand(uint64_t state[2]);
+
+}
+
+/* ============================================================================
+  Softfloat library with fp32 and fp16 conversion functionality.
+============================================================================ */
+#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
+	/* soft-float fp32 <-> fp16 conversions, used when there is no hardware support */
+	uint16_t float_to_sf16(float val);
+	float sf16_to_float(uint16_t val);
+#endif
+
+/*********************************
+  Vector library
+*********************************/
+#include "astcenc_vecmathlib.h"
+
+/*********************************
+  Declaration of line types
+*********************************/
+// parametric line, 2D: The line is given by line = a + b * t.
+
+struct line2
+{
+	vfloat4 a;
+	vfloat4 b;
+};
+
+// parametric line, 3D
+struct line3
+{
+	vfloat4 a;
+	vfloat4 b;
+};
+
+struct line4
+{
+	vfloat4 a;
+	vfloat4 b;
+};
+
+
+struct processed_line2
+{
+	vfloat4 amod;
+	vfloat4 bs;
+};
+
+struct processed_line3
+{
+	vfloat4 amod;
+	vfloat4 bs;
+};
+
+struct processed_line4
+{
+	vfloat4 amod;
+	vfloat4 bs;
+};
+
+#endif
diff --git a/thirdparty/astcenc/astcenc_mathlib_softfloat.cpp b/thirdparty/astcenc/astcenc_mathlib_softfloat.cpp
new file mode 100644
index 0000000000..42db764549
--- /dev/null
+++ b/thirdparty/astcenc/astcenc_mathlib_softfloat.cpp
@@ -0,0 +1,411 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2021 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief Soft-float library for IEEE-754.
+ */
+#if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0)
+
+#include "astcenc_mathlib.h"
+
+/* sized soft-float types. These are mapped to the sized integer
+   types of C99, instead of C's floating-point types; this is because
+   the library needs to maintain exact, bit-level control on all
+   operations on these data types. */
+typedef uint16_t sf16;
+typedef uint32_t sf32;
+
+/******************************************
+  helper functions and their lookup tables
+ ******************************************/
+/* count leading zeros functions. Only used when the input is nonzero. */
+
+#if defined(__GNUC__) && (defined(__i386) || defined(__amd64))
+#elif defined(__arm__) && defined(__ARMCC_VERSION)
+#elif defined(__arm__) && defined(__GNUC__)
+#else
+	/* table used for the slow default versions.
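+	   clz_table[i] holds the number of leading zero bits in the byte value i,
+	   so clz_table[0] == 8 and clz_table[255] == 0.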
*/ + static const uint8_t clz_table[256] = + { + 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; +#endif + +/* + 32-bit count-leading-zeros function: use the Assembly instruction whenever possible. */ +static uint32_t clz32(uint32_t inp) +{ + #if defined(__GNUC__) && (defined(__i386) || defined(__amd64)) + uint32_t bsr; + __asm__("bsrl %1, %0": "=r"(bsr):"r"(inp | 1)); + return 31 - bsr; + #else + #if defined(__arm__) && defined(__ARMCC_VERSION) + return __clz(inp); /* armcc builtin */ + #else + #if defined(__arm__) && defined(__GNUC__) + uint32_t lz; + __asm__("clz %0, %1": "=r"(lz):"r"(inp)); + return lz; + #else + /* slow default version */ + uint32_t summa = 24; + if (inp >= UINT32_C(0x10000)) + { + inp >>= 16; + summa -= 16; + } + if (inp >= UINT32_C(0x100)) + { + inp >>= 8; + summa -= 8; + } + return summa + clz_table[inp]; + #endif + #endif + #endif +} + +/* the five rounding modes that IEEE-754r defines */ +typedef enum +{ + SF_UP = 0, /* round towards positive infinity */ + SF_DOWN = 1, /* round towards negative infinity */ + SF_TOZERO = 2, /* round towards zero */ + SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */ + SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */ +} roundmode; + + +static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt) +{ + uint32_t vl1 = UINT32_C(1) << shamt; + uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */ + uint32_t msk = (inp | UINT32_C(1)) & vl1; /* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */ + msk--; /* negative if even, nonnegative if odd. */ + inp2 -= (msk >> 31); /* subtract epsilon before shift if even. */ + inp2 >>= shamt; + return inp2; +} + +static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt) +{ + uint32_t vl1 = (UINT32_C(1) << shamt) >> 1; + inp += vl1; + inp >>= shamt; + return inp; +} + +static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt) +{ + uint32_t vl1 = UINT32_C(1) << shamt; + inp += vl1; + inp--; + inp >>= shamt; + return inp; +} + +/* convert from FP16 to FP32. */ +static sf32 sf16_to_sf32(sf16 inp) +{ + uint32_t inpx = inp; + + /* + This table contains, for every FP16 sign/exponent value combination, + the difference between the input FP16 value and the value obtained + by shifting the correct FP32 result right by 13 bits. + This table allows us to handle every case except denormals and NaN + with just 1 table lookup, 2 shifts and 1 add. 
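+
+		Entries with the MSB set (the WITH_MSB values below) flag the
+		sign/exponent groups that need extra handling: a zero exponent
+		(zero and denormal inputs) and the all-ones exponent (infinity
+		and NaN inputs).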
+ */ + + #define WITH_MSB(a) (UINT32_C(a) | (1u << 31)) + static const uint32_t tbl[64] = + { + WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, + 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, + 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, + 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000), + WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, + 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, + 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, + 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000) + }; + + uint32_t res = tbl[inpx >> 10]; + res += inpx; + + /* Normal cases: MSB of 'res' not set. */ + if ((res & WITH_MSB(0)) == 0) + { + return res << 13; + } + + /* Infinity and Zero: 10 LSB of 'res' not set. */ + if ((res & 0x3FF) == 0) + { + return res << 13; + } + + /* NaN: the exponent field of 'inp' is non-zero. */ + if ((inpx & 0x7C00) != 0) + { + /* All NaNs are quietened. */ + return (res << 13) | 0x400000; + } + + /* Denormal cases */ + uint32_t sign = (inpx & 0x8000) << 16; + uint32_t mskval = inpx & 0x7FFF; + uint32_t leadingzeroes = clz32(mskval); + mskval <<= leadingzeroes; + return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign; +} + +/* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */ +static sf16 sf32_to_sf16(sf32 inp, roundmode rmode) +{ + /* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */ + static const uint8_t tab[512] { + 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50, + + 5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, 45, 
45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45,
+		45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55,
+	};
+
+	/* many of the cases below use a case-dependent magic constant. So we look up a magic
+	   constant before actually performing the switch. This table allows us to group cases,
+	   thereby minimizing code size. */
+	static const uint32_t tabx[60] {
+		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000),
+		UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000),
+		UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000),
+		UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00),
+		UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00),
+		UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000),
+		UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000)
+	};
+
+	uint32_t p;
+	uint32_t idx = rmode + tab[inp >> 23];
+	uint32_t vlx = tabx[idx];
+	switch (idx)
+	{
+	/*
+		Positive number which may be Infinity or NaN.
+		We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa.
+		(If we don't do this quieting, then a NaN that is distinguished only by having
+		its low-order bits set would be turned into an INF.)
+	*/
+	case 50:
+	case 51:
+	case 52:
+	case 53:
+	case 54:
+	case 55:
+	case 56:
+	case 57:
+	case 58:
+	case 59:
+		/*
+			the input value is 0x7F800000 or 0xFF800000 if it is INF.
+			By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero.
+			For NaNs, however, this operation will keep bit 23 with the value 1.
+			We can then extract bit 23, and logical-OR bit 9 of the result with this
+			bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit
+			of the mantissa is set.)
+		*/
+		p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */
+		return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14));
+	/*
+		positive, exponent = 0, round-mode == UP; need to check whether number actually is 0.
+		If it is, then return 0, else return 1 (the smallest representable nonzero number)
+	*/
+	case 0:
+		/*
+			-inp will set the MSB if the input number is nonzero.
+			Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise.
+		*/
+		return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31);
+
+	/*
+		negative, exponent = 0, round-mode == DOWN; need to check whether number is
+		actually 0. If it is, return 0x8000 ( FP16 -0.0 )
+		Else return the smallest negative number ( 0x8001 ) */
+	case 6:
+		/*
+			in this case 'vlx' is 0x80000000. By subtracting the input value from it,
+			we obtain a value that is 0 if the input value is in fact zero and has
+			the MSB set if it isn't.
We then right-shift the value by 31 places to + get a value that is 0 if the input is -0.0 and 1 otherwise. + */ + return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000)); + + /* + for all other cases involving underflow/overflow, we don't need to + do actual tests; we just return 'vlx'. + */ + case 1: + case 2: + case 3: + case 4: + case 5: + case 7: + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + case 16: + case 17: + case 18: + case 19: + case 40: + case 41: + case 42: + case 43: + case 44: + case 45: + case 46: + case 47: + case 48: + case 49: + return static_cast<sf16>(vlx); + + /* + for normal numbers, 'vlx' is the difference between the FP32 value of a number and the + FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is + baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away + from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero. + for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even + except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */ + + /* normal number, all rounding modes except round-to-nearest-even: */ + case 30: + case 31: + case 32: + case 34: + case 35: + case 36: + case 37: + case 39: + return static_cast<sf16>((inp + vlx) >> 13); + + /* normal number, round-to-nearest-even. */ + case 33: + case 38: + p = inp + vlx; + p += (inp >> 13) & 1; + return static_cast<sf16>(p >> 13); + + /* + the various denormal cases. These are not expected to be common, so their performance is a bit + less important. For each of these cases, we need to extract an exponent and a mantissa + (including the implicit '1'!), and then right-shift the mantissa by a shift-amount that + depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the + sign of the resulting denormal number. + */ + case 21: + case 22: + case 25: + case 27: + /* denormal, round towards zero. */ + p = 126 - ((inp >> 23) & 0xFF); + return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx); + case 20: + case 26: + /* denormal, round away from zero. */ + p = 126 - ((inp >> 23) & 0xFF); + return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); + case 24: + case 29: + /* denormal, round to nearest-away */ + p = 126 - ((inp >> 23) & 0xFF); + return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); + case 23: + case 28: + /* denormal, round to nearest-even. 
*/ + p = 126 - ((inp >> 23) & 0xFF); + return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); + } + + return 0; +} + +/* convert from soft-float to native-float */ +float sf16_to_float(uint16_t p) +{ + if32 i; + i.u = sf16_to_sf32(p); + return i.f; +} + +/* convert from native-float to soft-float */ +uint16_t float_to_sf16(float p) +{ + if32 i; + i.f = p; + return sf32_to_sf16(i.u, SF_NEARESTEVEN); +} + +#endif diff --git a/thirdparty/astcenc/astcenc_partition_tables.cpp b/thirdparty/astcenc/astcenc_partition_tables.cpp new file mode 100644 index 0000000000..cad42384d7 --- /dev/null +++ b/thirdparty/astcenc/astcenc_partition_tables.cpp @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions for generating partition tables on demand. + */ + +#include "astcenc_internal.h" + +/** @brief The number of 64-bit words needed to represent a canonical partition bit pattern. */ +#define BIT_PATTERN_WORDS (((ASTCENC_BLOCK_MAX_TEXELS * 2) + 63) / 64) + +/** + * @brief Generate a canonical representation of a partition pattern. + * + * The returned value stores two bits per texel, for up to 6x6x6 texels, where the two bits store + * the remapped texel index. Remapping ensures that we only match on the partition pattern, + * independent of the partition order generated by the hash. + * + * @param texel_count The number of texels in the block. + * @param partition_of_texel The partition assignments, in hash order. + * @param[out] bit_pattern The output bit pattern representation. + */ +static void generate_canonical_partitioning( + unsigned int texel_count, + const uint8_t* partition_of_texel, + uint64_t bit_pattern[BIT_PATTERN_WORDS] +) { + // Clear the pattern + for (unsigned int i = 0; i < BIT_PATTERN_WORDS; i++) + { + bit_pattern[i] = 0; + } + + // Store a mapping to reorder the raw partitions so that the partitions are ordered such + // that the lowest texel index in partition N is smaller than the lowest texel index in + // partition N + 1. + int mapped_index[BLOCK_MAX_PARTITIONS]; + int map_weight_count = 0; + + for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++) + { + mapped_index[i] = -1; + } + + for (unsigned int i = 0; i < texel_count; i++) + { + int index = partition_of_texel[i]; + if (mapped_index[index] < 0) + { + mapped_index[index] = map_weight_count++; + } + + uint64_t xlat_index = mapped_index[index]; + bit_pattern[i >> 5] |= xlat_index << (2 * (i & 0x1F)); + } +} + +/** + * @brief Compare two canonical patterns to see if they are the same. + * + * @param part1 The first canonical bit pattern to check. + * @param part2 The second canonical bit pattern to check. + * + * @return @c true if the patterns are the same, @c false otherwise. 
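+ *
+ * @note The comparison is unrolled one 64-bit word at a time; only the words
+ * needed for the configured @c ASTCENC_BLOCK_MAX_TEXELS are compiled in.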
+ */ +static bool compare_canonical_partitionings( + const uint64_t part1[BIT_PATTERN_WORDS], + const uint64_t part2[BIT_PATTERN_WORDS] +) { + return (part1[0] == part2[0]) +#if BIT_PATTERN_WORDS > 1 + && (part1[1] == part2[1]) +#endif +#if BIT_PATTERN_WORDS > 2 + && (part1[2] == part2[2]) +#endif +#if BIT_PATTERN_WORDS > 3 + && (part1[3] == part2[3]) +#endif +#if BIT_PATTERN_WORDS > 4 + && (part1[4] == part2[4]) +#endif +#if BIT_PATTERN_WORDS > 5 + && (part1[5] == part2[5]) +#endif +#if BIT_PATTERN_WORDS > 6 + && (part1[6] == part2[6]) +#endif + ; +} + +/** + * @brief Hash function used for procedural partition assignment. + * + * @param inp The hash seed. + * + * @return The hashed value. + */ +static uint32_t hash52( + uint32_t inp +) { + inp ^= inp >> 15; + + // (2^4 + 1) * (2^7 + 1) * (2^17 - 1) + inp *= 0xEEDE0891; + inp ^= inp >> 5; + inp += inp << 16; + inp ^= inp >> 7; + inp ^= inp >> 3; + inp ^= inp << 6; + inp ^= inp >> 17; + return inp; +} + +/** + * @brief Select texel assignment for a single coordinate. + * + * @param seed The seed - the partition index from the block. + * @param x The texel X coordinate in the block. + * @param y The texel Y coordinate in the block. + * @param z The texel Z coordinate in the block. + * @param partition_count The total partition count of this encoding. + * @param small_block @c true if the block has fewer than 32 texels. + * + * @return The assigned partition index for this texel. + */ +static uint8_t select_partition( + int seed, + int x, + int y, + int z, + int partition_count, + bool small_block +) { + // For small blocks bias the coordinates to get better distribution + if (small_block) + { + x <<= 1; + y <<= 1; + z <<= 1; + } + + seed += (partition_count - 1) * 1024; + + uint32_t rnum = hash52(seed); + + uint8_t seed1 = rnum & 0xF; + uint8_t seed2 = (rnum >> 4) & 0xF; + uint8_t seed3 = (rnum >> 8) & 0xF; + uint8_t seed4 = (rnum >> 12) & 0xF; + uint8_t seed5 = (rnum >> 16) & 0xF; + uint8_t seed6 = (rnum >> 20) & 0xF; + uint8_t seed7 = (rnum >> 24) & 0xF; + uint8_t seed8 = (rnum >> 28) & 0xF; + uint8_t seed9 = (rnum >> 18) & 0xF; + uint8_t seed10 = (rnum >> 22) & 0xF; + uint8_t seed11 = (rnum >> 26) & 0xF; + uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF; + + // Squaring all the seeds in order to bias their distribution towards lower values. + seed1 *= seed1; + seed2 *= seed2; + seed3 *= seed3; + seed4 *= seed4; + seed5 *= seed5; + seed6 *= seed6; + seed7 *= seed7; + seed8 *= seed8; + seed9 *= seed9; + seed10 *= seed10; + seed11 *= seed11; + seed12 *= seed12; + + int sh1, sh2; + if (seed & 1) + { + sh1 = (seed & 2 ? 4 : 5); + sh2 = (partition_count == 3 ? 6 : 5); + } + else + { + sh1 = (partition_count == 3 ? 6 : 5); + sh2 = (seed & 2 ? 4 : 5); + } + + int sh3 = (seed & 0x10) ? sh1 : sh2; + + seed1 >>= sh1; + seed2 >>= sh2; + seed3 >>= sh1; + seed4 >>= sh2; + seed5 >>= sh1; + seed6 >>= sh2; + seed7 >>= sh1; + seed8 >>= sh2; + + seed9 >>= sh3; + seed10 >>= sh3; + seed11 >>= sh3; + seed12 >>= sh3; + + int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); + int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); + int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); + int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + + // Apply the saw + a &= 0x3F; + b &= 0x3F; + c &= 0x3F; + d &= 0x3F; + + // Remove some of the components if we are to output < 4 partitions. 
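+	// Zeroing a component means the matching partition can never win the
+	// comparison chain below; e.g. for partition_count == 2 only a and b
+	// stay live, so every texel is assigned to partition 0 or 1.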
+ if (partition_count <= 3) + { + d = 0; + } + + if (partition_count <= 2) + { + c = 0; + } + + if (partition_count <= 1) + { + b = 0; + } + + uint8_t partition; + if (a >= b && a >= c && a >= d) + { + partition = 0; + } + else if (b >= c && b >= d) + { + partition = 1; + } + else if (c >= d) + { + partition = 2; + } + else + { + partition = 3; + } + + return partition; +} + +/** + * @brief Generate a single partition info structure. + * + * @param[out] bsd The block size information. + * @param partition_count The partition count of this partitioning. + * @param partition_index The partition index / seed of this partitioning. + * @param partition_remap_index The remapped partition index of this partitioning. + * @param[out] pi The partition info structure to populate. + * + * @return True if this is a useful partition index, False if we can skip it. + */ +static bool generate_one_partition_info_entry( + block_size_descriptor& bsd, + unsigned int partition_count, + unsigned int partition_index, + unsigned int partition_remap_index, + partition_info& pi +) { + int texels_per_block = bsd.texel_count; + bool small_block = texels_per_block < 32; + + uint8_t *partition_of_texel = pi.partition_of_texel; + + // Assign texels to partitions + int texel_idx = 0; + int counts[BLOCK_MAX_PARTITIONS] { 0 }; + for (unsigned int z = 0; z < bsd.zdim; z++) + { + for (unsigned int y = 0; y < bsd.ydim; y++) + { + for (unsigned int x = 0; x < bsd.xdim; x++) + { + uint8_t part = select_partition(partition_index, x, y, z, partition_count, small_block); + pi.texels_of_partition[part][counts[part]++] = static_cast<uint8_t>(texel_idx++); + *partition_of_texel++ = part; + } + } + } + + // Fill loop tail so we can overfetch later + for (unsigned int i = 0; i < partition_count; i++) + { + int ptex_count = counts[i]; + int ptex_count_simd = round_up_to_simd_multiple_vla(ptex_count); + for (int j = ptex_count; j < ptex_count_simd; j++) + { + pi.texels_of_partition[i][j] = pi.texels_of_partition[i][ptex_count - 1]; + } + } + + // Populate the actual procedural partition count + if (counts[0] == 0) + { + pi.partition_count = 0; + } + else if (counts[1] == 0) + { + pi.partition_count = 1; + } + else if (counts[2] == 0) + { + pi.partition_count = 2; + } + else if (counts[3] == 0) + { + pi.partition_count = 3; + } + else + { + pi.partition_count = 4; + } + + // Populate the partition index + pi.partition_index = static_cast<uint16_t>(partition_index); + + // Populate the coverage bitmaps for 2/3/4 partitions + uint64_t* bitmaps { nullptr }; + if (partition_count == 2) + { + bitmaps = bsd.coverage_bitmaps_2[partition_remap_index]; + } + else if (partition_count == 3) + { + bitmaps = bsd.coverage_bitmaps_3[partition_remap_index]; + } + else if (partition_count == 4) + { + bitmaps = bsd.coverage_bitmaps_4[partition_remap_index]; + } + + for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONS; i++) + { + pi.partition_texel_count[i] = static_cast<uint8_t>(counts[i]); + } + + // Valid partitionings have texels in all of the requested partitions + bool valid = pi.partition_count == partition_count; + + if (bitmaps) + { + // Populate the partition coverage bitmap + for (unsigned int i = 0; i < partition_count; i++) + { + bitmaps[i] = 0ULL; + } + + unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS); + for (unsigned int i = 0; i < texels_to_process; i++) + { + unsigned int idx = bsd.kmeans_texels[i]; + bitmaps[pi.partition_of_texel[idx]] |= 1ULL << i; + } + } + + return valid; +} + +static void 
build_partition_table_for_one_partition_count( + block_size_descriptor& bsd, + bool can_omit_partitionings, + unsigned int partition_count_cutoff, + unsigned int partition_count, + partition_info* ptab, + uint64_t* canonical_patterns +) { + unsigned int next_index = 0; + bsd.partitioning_count_selected[partition_count - 1] = 0; + bsd.partitioning_count_all[partition_count - 1] = 0; + + // Skip tables larger than config max partition count if we can omit modes + if (can_omit_partitionings && (partition_count > partition_count_cutoff)) + { + return; + } + + // Iterate through twice + // - Pass 0: Keep selected partitionings + // - Pass 1: Keep non-selected partitionings (skip if in omit mode) + unsigned int max_iter = can_omit_partitionings ? 1 : 2; + + // Tracker for things we built in the first iteration + uint8_t build[BLOCK_MAX_PARTITIONINGS] { 0 }; + for (unsigned int x = 0; x < max_iter; x++) + { + for (unsigned int i = 0; i < BLOCK_MAX_PARTITIONINGS; i++) + { + // Don't include things we built in the first pass + if ((x == 1) && build[i]) + { + continue; + } + + bool keep_useful = generate_one_partition_info_entry(bsd, partition_count, i, next_index, ptab[next_index]); + if ((x == 0) && !keep_useful) + { + continue; + } + + generate_canonical_partitioning(bsd.texel_count, ptab[next_index].partition_of_texel, canonical_patterns + next_index * BIT_PATTERN_WORDS); + bool keep_canonical = true; + for (unsigned int j = 0; j < next_index; j++) + { + bool match = compare_canonical_partitionings(canonical_patterns + next_index * BIT_PATTERN_WORDS, canonical_patterns + j * BIT_PATTERN_WORDS); + if (match) + { + keep_canonical = false; + break; + } + } + + if (keep_useful && keep_canonical) + { + if (x == 0) + { + bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index); + bsd.partitioning_count_selected[partition_count - 1]++; + bsd.partitioning_count_all[partition_count - 1]++; + build[i] = 1; + next_index++; + } + } + else + { + if (x == 1) + { + bsd.partitioning_packed_index[partition_count - 2][i] = static_cast<uint16_t>(next_index); + bsd.partitioning_count_all[partition_count - 1]++; + next_index++; + } + } + } + } +} + +/* See header for documentation. 
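+
+   The partitioning array in the block size descriptor stores the 2, 3, and
+   4 partition tables (BLOCK_MAX_PARTITIONINGS entries each), followed by a
+   single entry for the trivial 1 partition case.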
*/ +void init_partition_tables( + block_size_descriptor& bsd, + bool can_omit_partitionings, + unsigned int partition_count_cutoff +) { + partition_info* par_tab2 = bsd.partitionings; + partition_info* par_tab3 = par_tab2 + BLOCK_MAX_PARTITIONINGS; + partition_info* par_tab4 = par_tab3 + BLOCK_MAX_PARTITIONINGS; + partition_info* par_tab1 = par_tab4 + BLOCK_MAX_PARTITIONINGS; + + generate_one_partition_info_entry(bsd, 1, 0, 0, *par_tab1); + bsd.partitioning_count_selected[0] = 1; + bsd.partitioning_count_all[0] = 1; + + uint64_t* canonical_patterns = new uint64_t[BLOCK_MAX_PARTITIONINGS * BIT_PATTERN_WORDS]; + + build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 2, par_tab2, canonical_patterns); + build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 3, par_tab3, canonical_patterns); + build_partition_table_for_one_partition_count(bsd, can_omit_partitionings, partition_count_cutoff, 4, par_tab4, canonical_patterns); + + delete[] canonical_patterns; +} diff --git a/thirdparty/astcenc/astcenc_percentile_tables.cpp b/thirdparty/astcenc/astcenc_percentile_tables.cpp new file mode 100644 index 0000000000..448ddcc968 --- /dev/null +++ b/thirdparty/astcenc/astcenc_percentile_tables.cpp @@ -0,0 +1,1251 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Percentile data tables for different block encodings. + * + * To reduce binary size the tables are stored using a packed differential encoding. + */ + +#include "astcenc_internal.h" + +#if !defined(ASTCENC_DECOMPRESS_ONLY) +/** + * @brief Structure containing packed percentile metadata. + * + * Note that percentile tables do not exist for 3D textures, so no zdim is stored. + */ +struct packed_percentile_table +{ + /** The block X dimension. */ + uint8_t xdim; + + /** The block Y dimension. */ + uint8_t ydim; + + /** The number of packed items in the 1 and 2 plane data. */ + uint16_t item_count[2]; + + /** The accumulator divisor for 1 and 2 plane data. */ + uint16_t difscales[2]; + + /** The initial accumulator values for 1 and 2 plane data. */ + uint16_t initial_percs[2]; + + /** The packed data for the 1 and 2 plane data. 
*/ + const uint16_t *items[2]; +}; + +#if ASTCENC_BLOCK_MAX_TEXELS >= (4 * 4) +static const uint16_t percentile_arr_4x4_0[61] { + 0x0242, 0x7243, 0x6A51, 0x6A52, 0x5A41, 0x4A53, 0x8851, 0x3842, + 0x3852, 0x3853, 0x3043, 0xFA33, 0x1BDF, 0x2022, 0x1032, 0x29CE, + 0x21DE, 0x2823, 0x0813, 0x0A13, 0x0A31, 0x0A23, 0x09CF, 0x0833, + 0x0A32, 0x01DF, 0x0BDD, 0x0BCF, 0x0221, 0x095F, 0x0A01, 0x0BDE, + 0x0BCD, 0x0A22, 0x09AF, 0x0B5F, 0x0B4D, 0x0BCE, 0x0BBF, 0x0A11, + 0x01BF, 0x0202, 0x0B5D, 0x1203, 0x034E, 0x0B8E, 0x035E, 0x0212, + 0x032E, 0x0B4F, 0x03AF, 0x03AD, 0x03BD, 0x0BBE, 0x03AE, 0x039F, + 0x039E, 0x033E, 0x033F, 0x038F, 0x032F +}; + +static const uint16_t percentile_arr_4x4_1[84] { + 0x0452, 0xFFAE, 0x2433, 0x1DDF, 0x17CD, 0x1E21, 0x1C43, 0x1442, + 0x3FBE, 0x1FDD, 0x0E31, 0x0F4F, 0x1423, 0x0FBD, 0x1451, 0x0E03, + 0x05CF, 0x0C32, 0x0DDE, 0x27AD, 0x274E, 0x0E02, 0x0F5E, 0x07AF, + 0x0F5F, 0x0DCE, 0x0C41, 0x0422, 0x0613, 0x0E12, 0x0611, 0x0F3F, + 0x0601, 0x0DBF, 0x05DD, 0x075D, 0x0C02, 0x054E, 0x0431, 0x0413, + 0x079F, 0x05BE, 0x0F4D, 0x0403, 0x05AF, 0x055F, 0x05AE, 0x054F, + 0x0421, 0x05BD, 0x0DCD, 0x0411, 0x0412, 0x055E, 0x055D, 0x073D, + 0x058E, 0x072F, 0x072D, 0x079D, 0x0D2E, 0x0453, 0x078D, 0x053E, + 0x053F, 0x059E, 0x052F, 0x058F, 0x072E, 0x078F, 0x059F, 0x078E, + 0x071F, 0x073E, 0x051F, 0x070D, 0x079E, 0x070E, 0x071D, 0x0622, + 0x070F, 0x071E, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_4x4 { + 4, 4, + { 61, 84 }, + { 184, 141 }, + { 0, 53 }, + { percentile_arr_4x4_0, percentile_arr_4x4_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 4) +static const uint16_t percentile_arr_5x4_0[91] { + 0x02C1, 0xFAD1, 0xE8D3, 0xDAC2, 0xA8D2, 0x70D1, 0x50C2, 0x80C3, + 0xD2C3, 0x4AA2, 0x2AD2, 0x2242, 0x2251, 0x42A3, 0x1A43, 0x4A52, + 0x32B3, 0x2A41, 0x1042, 0x1851, 0x5892, 0x10A2, 0x2253, 0x10B2, + 0x10B3, 0x13DF, 0x3083, 0x08B1, 0x1043, 0x12B1, 0x0AB2, 0x1A93, + 0x1852, 0x1A33, 0x09CE, 0x08A3, 0x1022, 0x1283, 0x0853, 0x1AA1, + 0x1093, 0x11DE, 0x135F, 0x1832, 0x195F, 0x0A81, 0x11CF, 0x0A31, + 0x09DF, 0x0B4D, 0x09AF, 0x03CF, 0x0813, 0x03DD, 0x0A92, 0x0A82, + 0x03CD, 0x0023, 0x0BDE, 0x0BBF, 0x1232, 0x0221, 0x0291, 0x0A23, + 0x0833, 0x035D, 0x0BCE, 0x01BF, 0x0222, 0x134E, 0x0213, 0x0A01, + 0x0B4F, 0x0B5E, 0x038E, 0x032E, 0x03AF, 0x0A11, 0x03AD, 0x0203, + 0x0202, 0x0BBD, 0x033E, 0x03AE, 0x03BE, 0x0212, 0x033F, 0x039E, + 0x039F, 0x032F, 0x038F +}; + +static const uint16_t percentile_arr_5x4_1[104] { + 0x0433, 0xB621, 0x5452, 0x4443, 0x7FAE, 0xFCA3, 0x7CC2, 0x24B2, + 0x45DF, 0x44B3, 0x7631, 0x27CD, 0x1CD1, 0x1E03, 0x4FBE, 0x774F, + 0x1C42, 0x7691, 0x24A2, 0x2681, 0x3C23, 0x3C93, 0x0FBD, 0x1C32, + 0x1E82, 0x1E12, 0x0F4E, 0x1602, 0x0FAD, 0x0C51, 0x1FDD, 0x0E13, + 0x0DCF, 0x175E, 0x0C22, 0x175F, 0x15DE, 0x0CB1, 0x17AF, 0x1CC1, + 0x1F3F, 0x1483, 0x0441, 0x0C91, 0x04D2, 0x0DCE, 0x154E, 0x079F, + 0x0CA1, 0x0F5D, 0x0431, 0x15DD, 0x05BF, 0x0C92, 0x0611, 0x0C82, + 0x0402, 0x074D, 0x0DBD, 0x055E, 0x05BE, 0x0DCD, 0x0421, 0x05AF, + 0x0403, 0x0D4F, 0x055F, 0x05AE, 0x0413, 0x0E01, 0x055D, 0x073D, + 0x0C12, 0x0692, 0x0411, 0x072D, 0x078D, 0x079D, 0x058E, 0x0D2E, + 0x0453, 0x072F, 0x059E, 0x052F, 0x071F, 0x053F, 0x053E, 0x078F, + 0x058F, 0x051F, 0x0F2E, 0x059F, 0x078E, 0x073E, 0x071D, 0x070D, + 0x070E, 0x079E, 0x0622, 0x0683, 0x070F, 0x071E, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_5x4 { + 5, 4, + { 91, 104 }, + { 322, 464 }, + { 0, 202 }, + { percentile_arr_5x4_0, percentile_arr_5x4_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 5) +static const uint16_t 
percentile_arr_5x5_0[129] { + 0x00F3, 0xF8F2, 0x70E3, 0x62E1, 0x60E1, 0x4AC1, 0x3261, 0x38D3, + 0x3271, 0x5AF1, 0x5873, 0x2AD1, 0x28E2, 0x28F1, 0x2262, 0x9AC2, + 0x18D2, 0x1072, 0x1071, 0x22A2, 0x2062, 0x1A51, 0x10C2, 0x0892, + 0x08D1, 0x1AA3, 0x23EE, 0x08C3, 0x0BEF, 0x2242, 0x0863, 0x0AB3, + 0x0BFF, 0x0A93, 0x08A2, 0x0A41, 0x1083, 0x0842, 0x10B3, 0x21EE, + 0x10B2, 0x00B1, 0x1263, 0x12C3, 0x0A83, 0x0851, 0x11FE, 0x0253, + 0x09FD, 0x0A72, 0x09FF, 0x1AB2, 0x0BDF, 0x0A33, 0x0243, 0x0B7F, + 0x0AB1, 0x12D2, 0x0252, 0x096F, 0x00A3, 0x0893, 0x0822, 0x0843, + 0x097E, 0x097F, 0x01EF, 0x09CE, 0x03FE, 0x0A81, 0x036F, 0x0052, + 0x13FD, 0x0AA1, 0x1853, 0x036D, 0x0A92, 0x0832, 0x01DE, 0x0A82, + 0x0BED, 0x0231, 0x0BBF, 0x03DD, 0x0B6E, 0x01AF, 0x0813, 0x0023, + 0x0A91, 0x015F, 0x037E, 0x01CF, 0x0232, 0x0BCD, 0x0221, 0x0BDE, + 0x0213, 0x035F, 0x0B7D, 0x0223, 0x01BF, 0x0BCF, 0x01DF, 0x0033, + 0x0222, 0x03CE, 0x0A01, 0x03AF, 0x034D, 0x0B8E, 0x032E, 0x0203, + 0x0211, 0x0202, 0x0B5D, 0x03AD, 0x034E, 0x03AE, 0x034F, 0x033F, + 0x039F, 0x03BD, 0x03BE, 0x035E, 0x0212, 0x033E, 0x039E, 0x032F, + 0x038F +}; + +static const uint16_t percentile_arr_5x5_1[126] { + 0x0443, 0x6452, 0xFE21, 0x27AE, 0x2433, 0x1FCD, 0x25DF, 0x6CC2, + 0x2C62, 0x1F4F, 0x4C42, 0x1FBE, 0x0DEF, 0x34A3, 0x0E03, 0x54B2, + 0x1F7D, 0x17DD, 0x0DFF, 0x0CD1, 0x0E31, 0x0C71, 0x1CF1, 0x15FE, + 0x1691, 0x1681, 0x24B3, 0x174E, 0x0F6E, 0x0493, 0x175E, 0x1C51, + 0x17BD, 0x076D, 0x2CA2, 0x05EE, 0x1472, 0x2423, 0x0DCF, 0x0432, + 0x15DE, 0x0612, 0x0CD2, 0x0682, 0x0F5F, 0x07AD, 0x0602, 0x0CE1, + 0x0C91, 0x0FAF, 0x073F, 0x0E13, 0x0D7F, 0x0DCE, 0x0422, 0x0D7D, + 0x0441, 0x05FD, 0x0CB1, 0x0C83, 0x04C1, 0x0461, 0x0F9F, 0x0DDD, + 0x056E, 0x0C92, 0x0482, 0x0431, 0x05ED, 0x0D6F, 0x075D, 0x0402, + 0x057E, 0x0DBF, 0x04A1, 0x054E, 0x0F4D, 0x0403, 0x05CD, 0x0453, + 0x05AE, 0x0421, 0x0F1F, 0x05BE, 0x0601, 0x0611, 0x05BD, 0x05AF, + 0x078D, 0x072D, 0x073D, 0x055E, 0x0F9D, 0x0411, 0x0413, 0x0412, + 0x055F, 0x077E, 0x055D, 0x052E, 0x054F, 0x053E, 0x058E, 0x078F, + 0x059E, 0x071D, 0x0E92, 0x053F, 0x059F, 0x051F, 0x072F, 0x052F, + 0x070D, 0x079E, 0x058F, 0x072E, 0x070E, 0x078E, 0x070F, 0x073E, + 0x0622, 0x0683, 0x071E, 0x076F, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_5x5 { + 5, 5, + { 129, 126 }, + { 258, 291 }, + { 0, 116 }, + { percentile_arr_5x5_0, percentile_arr_5x5_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 5) +static const uint16_t percentile_arr_6x5_0[165] { + 0x0163, 0xF8F3, 0x9962, 0x8972, 0x7961, 0x7173, 0x6953, 0x5943, + 0x4B41, 0x3AE1, 0x38E3, 0x6971, 0x32C1, 0x28D3, 0x2A61, 0xC8F2, + 0x2271, 0x4873, 0x5B21, 0x3AD1, 0x1B13, 0x1952, 0x1B51, 0x12F1, + 0x1A62, 0x1322, 0x1951, 0x10E2, 0x1B31, 0x20F1, 0x2102, 0x2072, + 0x10D2, 0x1142, 0x2912, 0x3871, 0x2BEE, 0x0862, 0x1123, 0x0AC2, + 0x12A2, 0x0A51, 0x1922, 0x0941, 0x1BEF, 0x0B42, 0x08D1, 0x13FF, + 0x1933, 0x08C3, 0x08C2, 0x1131, 0x08E1, 0x2903, 0x0863, 0x0B32, + 0x1132, 0x1AC3, 0x0A42, 0x1A41, 0x0042, 0x21EE, 0x09FF, 0x03DF, + 0x0AA3, 0x11FE, 0x02B3, 0x0B11, 0x10B3, 0x0B03, 0x11FD, 0x0913, + 0x0A53, 0x037F, 0x1263, 0x0051, 0x0A33, 0x0B01, 0x016F, 0x0A72, + 0x1312, 0x08A2, 0x10B1, 0x0BFE, 0x11EF, 0x0B02, 0x0A52, 0x0043, + 0x0822, 0x01CE, 0x0A43, 0x097F, 0x036F, 0x08B2, 0x03FD, 0x0A83, + 0x0B33, 0x0AB1, 0x017E, 0x0B23, 0x0852, 0x02D2, 0x0BBF, 0x0BDD, + 0x03ED, 0x0AB2, 0x02A1, 0x0853, 0x036D, 0x0892, 0x0032, 0x0A31, + 0x0083, 0x09DE, 0x0A93, 0x08A3, 0x1213, 0x0BDE, 0x03CD, 0x036E, + 0x037E, 0x0A21, 0x0023, 0x0BCF, 0x01CF, 0x0013, 0x01AF, 0x0A92, + 0x0232, 0x035F, 0x0093, 0x0B7D, 
0x015F, 0x0282, 0x01BF, 0x09DF, + 0x03CE, 0x0223, 0x0833, 0x0222, 0x03AF, 0x0A01, 0x0291, 0x0B4D, + 0x032E, 0x038E, 0x0203, 0x0281, 0x035D, 0x03AD, 0x0B9F, 0x0202, + 0x034F, 0x03BE, 0x0211, 0x03AE, 0x03BD, 0x0212, 0x034E, 0x033F, + 0x033E, 0x035E, 0x039E, 0x032F, 0x038F +}; + +static const uint16_t percentile_arr_6x5_1[145] { + 0x0443, 0xEFAE, 0x2CC2, 0x2E21, 0x2C52, 0x7C33, 0x47CD, 0x25DF, + 0x3CA3, 0xFFBE, 0x2551, 0x24B3, 0x474F, 0x1513, 0x2691, 0x1603, + 0x1462, 0x1D32, 0x14B2, 0x5442, 0x2CD2, 0x35EF, 0x0CD1, 0x3D22, + 0x17BD, 0x0FDD, 0x0DFF, 0x2631, 0x177D, 0x0CF1, 0x1E81, 0x0E82, + 0x1DFE, 0x0F5E, 0x0701, 0x2CA2, 0x1D03, 0x0F4E, 0x1471, 0x0C51, + 0x1F6E, 0x2FAF, 0x0561, 0x0C72, 0x176D, 0x0FAD, 0x0DEE, 0x05CF, + 0x0E13, 0x0F5F, 0x0E12, 0x0C23, 0x1E02, 0x1D12, 0x0CB1, 0x0C32, + 0x0C93, 0x15DE, 0x0F9F, 0x0F3F, 0x0D41, 0x0C41, 0x0CC1, 0x0D31, + 0x0C22, 0x05FD, 0x057F, 0x0D01, 0x0461, 0x04E1, 0x0D7D, 0x05CE, + 0x0502, 0x0C31, 0x05ED, 0x05DD, 0x0511, 0x0F11, 0x0491, 0x0D6F, + 0x0521, 0x056E, 0x0C83, 0x0D23, 0x04A1, 0x0C02, 0x075D, 0x05BF, + 0x0C21, 0x079D, 0x0482, 0x05BD, 0x0DBE, 0x05CD, 0x054E, 0x057E, + 0x0DAE, 0x074D, 0x078D, 0x0542, 0x0492, 0x05AF, 0x0611, 0x0F3D, + 0x0601, 0x071F, 0x055E, 0x059E, 0x0571, 0x054F, 0x0412, 0x0453, + 0x058E, 0x0413, 0x0D3E, 0x077E, 0x072D, 0x052E, 0x059F, 0x055D, + 0x072F, 0x0403, 0x0411, 0x058F, 0x055F, 0x0692, 0x078E, 0x053F, + 0x0D2F, 0x078F, 0x070D, 0x071D, 0x051F, 0x072E, 0x079E, 0x070E, + 0x070F, 0x073E, 0x0622, 0x0683, 0x0702, 0x071E, 0x076F, 0x07BF, + 0x07CE +}; + +static const packed_percentile_table block_pcd_6x5 { + 6, 5, + { 165, 145 }, + { 388, 405 }, + { 0, 156 }, + { percentile_arr_6x5_0, percentile_arr_6x5_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 6) +static const uint16_t percentile_arr_6x6_0[206] { + 0x006F, 0xF908, 0xF104, 0xE918, 0xE963, 0xD114, 0xB0F3, 0xA07E, + 0x7972, 0x705F, 0x687F, 0x6162, 0x5953, 0x586E, 0x610C, 0x524D, + 0x5973, 0x9943, 0x98E3, 0x904F, 0x8341, 0x7AC1, 0x3A61, 0x70D3, + 0xA073, 0x6AE1, 0x30F2, 0x3313, 0x2B21, 0x9A2E, 0x4322, 0x225D, + 0x2331, 0x2271, 0x22D1, 0x1A2D, 0x221F, 0x22F1, 0x1971, 0x6952, + 0x1951, 0x187D, 0x18F1, 0x1902, 0x185E, 0x1B51, 0x105D, 0x1A3D, + 0x30E2, 0x10D2, 0x1961, 0x12A2, 0x6072, 0x3942, 0x386D, 0x33EE, + 0x104E, 0x4923, 0x101E, 0x2122, 0x1251, 0x1141, 0x182F, 0x3133, + 0x080E, 0x1262, 0x123E, 0x1B32, 0x102E, 0x1931, 0x10D1, 0x1912, + 0x0871, 0x12C2, 0x08C2, 0x1103, 0x0B03, 0x1062, 0x083D, 0x08E1, + 0x1132, 0x184D, 0x0863, 0x08C3, 0x303F, 0x083E, 0x10B3, 0x12A3, + 0x0BEF, 0x0B11, 0x1A42, 0x2233, 0x13FF, 0x080F, 0x0A41, 0x0AC3, + 0x0842, 0x1A63, 0x0BDF, 0x09FF, 0x12B3, 0x124E, 0x0B12, 0x0B42, + 0x0A2F, 0x1253, 0x0913, 0x1051, 0x0B01, 0x120F, 0x0B02, 0x08A2, + 0x0BBF, 0x00B1, 0x22B1, 0x01EE, 0x1B33, 0x0B23, 0x0283, 0x13FD, + 0x0AB2, 0x11FD, 0x09FE, 0x0A43, 0x08B2, 0x0A1D, 0x0A52, 0x023F, + 0x101F, 0x01CE, 0x0A31, 0x0BDD, 0x0293, 0x1822, 0x12A1, 0x03FE, + 0x121E, 0x0843, 0x0272, 0x0B6F, 0x0052, 0x0A0D, 0x0BED, 0x12D2, + 0x1B7F, 0x1053, 0x0032, 0x01DE, 0x08A3, 0x020E, 0x0883, 0x09EF, + 0x0892, 0x0A21, 0x03CD, 0x0B5F, 0x0213, 0x0A32, 0x016F, 0x1292, + 0x03DE, 0x017E, 0x0BAF, 0x0223, 0x1093, 0x0BCF, 0x037E, 0x01DF, + 0x09CF, 0x015F, 0x09AF, 0x0023, 0x01BF, 0x0222, 0x0282, 0x03CE, + 0x1013, 0x036E, 0x097F, 0x0033, 0x0A01, 0x0B6D, 0x03BE, 0x037D, + 0x0281, 0x0BAE, 0x0203, 0x032E, 0x034D, 0x034F, 0x0291, 0x0211, + 0x038E, 0x03BD, 0x039E, 0x0BAD, 0x033E, 0x034E, 0x039F, 0x0202, + 0x035D, 0x0212, 0x033F, 0x035E, 0x038F, 0x032F +}; + +static const uint16_t percentile_arr_6x6_1[164] { + 
0x07AE, 0x8443, 0x7E21, 0x77CD, 0x6C62, 0x9433, 0x6452, 0x34C2, + 0x5DDF, 0xC7BE, 0x25EF, 0x24A3, 0x3CF1, 0xFDFF, 0x177D, 0x1F4F, + 0xC551, 0x5CB3, 0x1532, 0x1513, 0x143E, 0x245D, 0x14B2, 0x2472, + 0x14D2, 0x1FBD, 0x1631, 0x2DFE, 0x1691, 0x17DD, 0x2E03, 0x376E, + 0x2442, 0x0F6D, 0x3C71, 0x2CD1, 0x2522, 0x6C51, 0x260D, 0x17AF, + 0x0DEE, 0x1C1F, 0x2F01, 0x142E, 0x0CA2, 0x0FAD, 0x3D03, 0x275E, + 0x1681, 0x274E, 0x1682, 0x1C23, 0x273F, 0x0F5F, 0x05DE, 0x15FD, + 0x0DCF, 0x1E02, 0x04B1, 0x144D, 0x0E12, 0x0D12, 0x1CC1, 0x0E13, + 0x1C6D, 0x0C32, 0x043D, 0x0C61, 0x0F9F, 0x04E1, 0x0DCE, 0x0D41, + 0x1C93, 0x0C22, 0x061D, 0x0D7F, 0x0C41, 0x0561, 0x0531, 0x0D21, + 0x0711, 0x0C91, 0x0501, 0x0C1E, 0x040F, 0x15DD, 0x0431, 0x0C2F, + 0x057D, 0x0C2D, 0x0DBE, 0x040E, 0x0D02, 0x0D11, 0x054E, 0x040D, + 0x0D23, 0x0DBF, 0x04A1, 0x05ED, 0x0C1D, 0x05BD, 0x072D, 0x056E, + 0x0483, 0x0F3D, 0x0482, 0x078D, 0x0F5D, 0x0453, 0x0D9E, 0x0C4E, + 0x05CD, 0x079D, 0x0402, 0x05AE, 0x0F1F, 0x0542, 0x074D, 0x056F, + 0x0421, 0x0D4F, 0x0601, 0x0571, 0x0492, 0x059F, 0x053F, 0x05AF, + 0x0611, 0x055E, 0x0D8E, 0x053E, 0x055D, 0x047D, 0x0411, 0x052E, + 0x058F, 0x051F, 0x055F, 0x0D7E, 0x072F, 0x052F, 0x0412, 0x078F, + 0x0403, 0x077E, 0x070D, 0x070E, 0x078E, 0x0F1D, 0x072E, 0x0413, + 0x070F, 0x0692, 0x079E, 0x060E, 0x0622, 0x0683, 0x0702, 0x071E, + 0x073E, 0x076F, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_6x6 { + 6, 6, + { 206, 164 }, + { 769, 644 }, + { 0, 256 }, + { percentile_arr_6x6_0, percentile_arr_6x6_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 5) +static const uint16_t percentile_arr_8x5_0[226] { + 0x0066, 0xF865, 0xE963, 0xA856, 0xA1F2, 0x9875, 0x91C3, 0x91E2, + 0x80F3, 0x8076, 0x61E3, 0x6153, 0x5172, 0x59D2, 0x51D3, 0x5047, + 0xA943, 0x49B3, 0x4846, 0x4962, 0xC037, 0x4173, 0x39F1, 0x7027, + 0xA2C1, 0x3AE1, 0x9341, 0x30D3, 0x5225, 0x2A61, 0x33C1, 0x28E3, + 0x53A1, 0x49C2, 0x2A06, 0x4055, 0x2006, 0x21D1, 0x2271, 0x4321, + 0x3873, 0x18F2, 0x2015, 0x1A15, 0x1857, 0x52D1, 0x3045, 0x4835, + 0x1952, 0x29E1, 0x3207, 0x1036, 0x1816, 0x2A16, 0x2971, 0x13B1, + 0x2A17, 0x2351, 0x1025, 0x1826, 0x30E2, 0x1262, 0x20F1, 0x1007, + 0x1072, 0x1151, 0x10D2, 0x1235, 0x1205, 0x1062, 0x4AF1, 0x1251, + 0x0B31, 0x1381, 0x13EE, 0x1B92, 0x13EF, 0x0942, 0x1AA2, 0x13FF, + 0x1161, 0x0B93, 0x19A2, 0x11B1, 0x08D1, 0x12C2, 0x0B13, 0x1B22, + 0x2123, 0x09A3, 0x2071, 0x1B7F, 0x1817, 0x0A42, 0x10C2, 0x1233, + 0x08C3, 0x0A41, 0x0B42, 0x09C1, 0x0933, 0x1AB3, 0x1382, 0x1BDF, + 0x2122, 0x0A53, 0x0AC3, 0x20E1, 0x0941, 0x0931, 0x0042, 0x0BA2, + 0x0AA3, 0x0992, 0x0863, 0x08B3, 0x11B2, 0x0902, 0x1283, 0x09FF, + 0x0B83, 0x0982, 0x0932, 0x0BFE, 0x0B32, 0x0BBF, 0x11FE, 0x036F, + 0x0851, 0x08B1, 0x18A2, 0x11EE, 0x0A52, 0x0BB2, 0x01FD, 0x0A43, + 0x1A63, 0x1193, 0x0B91, 0x0043, 0x1231, 0x0A26, 0x0AB1, 0x03FD, + 0x096F, 0x00B2, 0x0983, 0x0A72, 0x01CE, 0x0BDD, 0x0022, 0x0B11, + 0x1213, 0x0B6D, 0x017E, 0x1333, 0x0112, 0x0852, 0x02D2, 0x097F, + 0x01EF, 0x0AB2, 0x0293, 0x0853, 0x0BED, 0x0B12, 0x1303, 0x02A1, + 0x0892, 0x0032, 0x0883, 0x0B6E, 0x0292, 0x0A32, 0x037E, 0x0B23, + 0x0103, 0x0A21, 0x0B01, 0x0302, 0x0BCD, 0x00A3, 0x0BCF, 0x0BDE, + 0x0113, 0x01DE, 0x0B5F, 0x0013, 0x0BAF, 0x0223, 0x0222, 0x0A82, + 0x0833, 0x0023, 0x09CF, 0x037D, 0x01AF, 0x095F, 0x03CE, 0x09DF, + 0x01BF, 0x0893, 0x0203, 0x0201, 0x0B4D, 0x03BE, 0x032E, 0x03AE, + 0x0291, 0x0A02, 0x0211, 0x039F, 0x0281, 0x038E, 0x03AD, 0x033F, + 0x035D, 0x033E, 0x034E, 0x034F, 0x0212, 0x03BD, 0x032F, 0x035E, + 0x038F, 0x039E +}; + +static const uint16_t percentile_arr_8x5_1[167] { + 0x0621, 
0xFCC2, 0x3443, 0xA433, 0x5532, 0x2551, 0x6CA3, 0x27AE, + 0x6452, 0x8E03, 0x3CB3, 0x4DA2, 0x6DDF, 0x37CD, 0x6F01, 0x1691, + 0x2E82, 0x27BE, 0x1513, 0x34D2, 0x1D22, 0x3E31, 0x2593, 0x2CB2, + 0x1C16, 0x374F, 0x0DD1, 0x2583, 0x6613, 0x0CD1, 0x0C35, 0x1462, + 0x3E81, 0x2612, 0x2C42, 0x3407, 0x14A2, 0x0E02, 0x1CF1, 0x0C06, + 0x17BD, 0x0F7D, 0x1D23, 0x35B1, 0x179F, 0x0D92, 0x0F5E, 0x1451, + 0x04B1, 0x1F6E, 0x0DEF, 0x0D31, 0x374E, 0x15C1, 0x0541, 0x2405, + 0x17AD, 0x0471, 0x1472, 0x0DFE, 0x0711, 0x0FDD, 0x0DFF, 0x0432, + 0x1D82, 0x0423, 0x0F6D, 0x07AF, 0x0F5F, 0x04C1, 0x1542, 0x0561, + 0x0DCF, 0x1D03, 0x1493, 0x0422, 0x0445, 0x0D12, 0x0C25, 0x0415, + 0x0DA1, 0x1591, 0x0DEE, 0x05DE, 0x0C31, 0x0491, 0x0441, 0x0D21, + 0x078D, 0x057D, 0x0C61, 0x0F3F, 0x0581, 0x0D6E, 0x0501, 0x0CA1, + 0x04E1, 0x0DFD, 0x057F, 0x0502, 0x0511, 0x0C82, 0x0483, 0x0C03, + 0x079D, 0x0402, 0x0DDD, 0x0611, 0x05AE, 0x0DCE, 0x056F, 0x0421, + 0x057E, 0x071F, 0x0DBF, 0x05BE, 0x0412, 0x059F, 0x054E, 0x077E, + 0x0C26, 0x05ED, 0x073D, 0x0601, 0x0492, 0x0453, 0x075D, 0x058E, + 0x0F2D, 0x05CD, 0x0571, 0x053E, 0x0692, 0x05BD, 0x054F, 0x055E, + 0x0411, 0x0F1D, 0x074D, 0x059E, 0x05AF, 0x070D, 0x053F, 0x058F, + 0x0413, 0x070F, 0x055D, 0x070E, 0x078F, 0x052E, 0x072F, 0x055F, + 0x078E, 0x0F2E, 0x052F, 0x051F, 0x0417, 0x071E, 0x0781, 0x0622, + 0x0683, 0x0702, 0x073E, 0x076F, 0x079E, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_8x5 { + 8, 5, + { 226, 167 }, + { 763, 517 }, + { 0, 178 }, + { percentile_arr_8x5_0, percentile_arr_8x5_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 6) +static const uint16_t percentile_arr_8x6_0[273] { + 0x0154, 0xF944, 0xE066, 0xA128, 0x9963, 0x8118, 0x806F, 0x79F2, + 0x79E2, 0x7108, 0xD934, 0x6056, 0x69C3, 0x60F3, 0x5972, 0x59E3, + 0x5075, 0x91B3, 0xC9D2, 0x807E, 0x385F, 0x4153, 0x3943, 0x4162, + 0x3837, 0x3847, 0x7173, 0x31D3, 0x6948, 0x3046, 0x307F, 0x5827, + 0x3114, 0x32C1, 0x3076, 0x2A4D, 0x58E3, 0x306E, 0x2924, 0x2A61, + 0x29F1, 0x50D3, 0x704F, 0x210C, 0x2BA1, 0x2225, 0x2873, 0x4865, + 0x2206, 0x8341, 0x2006, 0x3B21, 0x18F2, 0x21C2, 0x1A1F, 0x23C1, + 0x3AE1, 0x1855, 0x19D1, 0x1A15, 0x3815, 0x1207, 0x1835, 0x2A2E, + 0x1A16, 0x1836, 0x2271, 0x2845, 0x1A2D, 0x11E1, 0x1816, 0x1171, + 0x2217, 0x1952, 0x12D1, 0x3904, 0x125D, 0x4BB1, 0x207D, 0x10E2, + 0x1026, 0x2025, 0x12F1, 0x28F1, 0x105D, 0x1235, 0x12A2, 0x1007, + 0x123D, 0x1A05, 0x1072, 0x1331, 0x101E, 0x0951, 0x10D2, 0x1057, + 0x1B92, 0x185E, 0x1251, 0x19A2, 0x186D, 0x0B81, 0x2BEE, 0x080E, + 0x1A33, 0x1942, 0x0B13, 0x0B51, 0x11A3, 0x0923, 0x2322, 0x09B1, + 0x184E, 0x1161, 0x18D1, 0x0933, 0x0B93, 0x4A62, 0x1017, 0x082F, + 0x0A42, 0x0B82, 0x0AA3, 0x0A41, 0x08C2, 0x08B3, 0x0A3E, 0x22B3, + 0x0871, 0x1BBF, 0x09C1, 0x0AC2, 0x09B2, 0x0BEF, 0x082E, 0x1062, + 0x0922, 0x08C3, 0x1063, 0x0A53, 0x0BDF, 0x080F, 0x0B42, 0x0A83, + 0x084D, 0x103F, 0x0931, 0x08E1, 0x0A0F, 0x1BA2, 0x09FF, 0x1332, + 0x03FF, 0x0941, 0x12C3, 0x0A63, 0x003D, 0x0842, 0x083E, 0x0B83, + 0x0BB2, 0x0A31, 0x0932, 0x1102, 0x0992, 0x0982, 0x1051, 0x08B1, + 0x0A2F, 0x121E, 0x02B1, 0x0A4E, 0x11EE, 0x00A2, 0x1022, 0x0043, + 0x0A52, 0x0A1D, 0x0226, 0x1193, 0x03DD, 0x08B2, 0x0BFD, 0x0A43, + 0x0A13, 0x0AB2, 0x01FD, 0x09FE, 0x020D, 0x081F, 0x0B33, 0x0053, + 0x0B91, 0x0293, 0x0B11, 0x0B7F, 0x0AA1, 0x0B03, 0x0A0E, 0x03FE, + 0x01CE, 0x0B6F, 0x0183, 0x0912, 0x023F, 0x0852, 0x0A21, 0x0323, + 0x03ED, 0x0A32, 0x13AF, 0x0272, 0x08A3, 0x0B12, 0x0083, 0x0832, + 0x13CD, 0x0223, 0x0A92, 0x0092, 0x0AD2, 0x0301, 0x0302, 0x0BDE, + 0x0A22, 0x01EF, 0x0B5F, 0x0103, 0x0BCF, 0x096F, 0x017E, 0x0113, + 
0x01DE, 0x0823, 0x0282, 0x0B6E, 0x015F, 0x0813, 0x01AF, 0x01CF, + 0x0B7E, 0x0033, 0x01DF, 0x0BCE, 0x01BF, 0x036D, 0x0A03, 0x017F, + 0x03BE, 0x0201, 0x0893, 0x038E, 0x034D, 0x03AE, 0x0202, 0x039F, + 0x0291, 0x0A11, 0x032E, 0x033F, 0x034F, 0x0281, 0x037D, 0x03BD, + 0x0212, 0x033E, 0x035E, 0x034E, 0x035D, 0x03AD, 0x032F, 0x038F, + 0x039E +}; + +static const uint16_t percentile_arr_8x6_1[186] { + 0x0621, 0xFC33, 0x37AE, 0x1CC2, 0x2C43, 0xAD32, 0x34A3, 0x4551, + 0x6452, 0x5C62, 0x1FCD, 0x14F1, 0x4CB3, 0x24D2, 0x15DF, 0x0FBE, + 0x2603, 0x3DA2, 0x2E31, 0x25D1, 0x25EF, 0x0D22, 0x2E91, 0x1E82, + 0x0FBD, 0x1513, 0x0CB2, 0x0CD1, 0x0F4F, 0x1F7D, 0x1701, 0x0C16, + 0x2593, 0x2C42, 0x0C72, 0x14A2, 0x0F6E, 0x0C35, 0x0C71, 0x0D83, + 0x0C07, 0x1DFF, 0x043E, 0x1613, 0x07DD, 0x0FAD, 0x1451, 0x076D, + 0x0E81, 0x05FE, 0x0406, 0x0E0D, 0x045D, 0x2612, 0x0E02, 0x07AF, + 0x0DB1, 0x0F5E, 0x15C1, 0x0C23, 0x1523, 0x0C1F, 0x0D92, 0x04B1, + 0x0D31, 0x0432, 0x0D61, 0x0F4E, 0x0D41, 0x0DEE, 0x0D42, 0x04C1, + 0x0CE1, 0x079F, 0x0C2E, 0x0405, 0x0C22, 0x0461, 0x0E1D, 0x0582, + 0x073F, 0x0571, 0x0C4D, 0x0DFD, 0x05CE, 0x0C6D, 0x05DE, 0x0415, + 0x0C45, 0x075F, 0x0C41, 0x0D03, 0x05A1, 0x0711, 0x05CF, 0x0425, + 0x0C93, 0x0D21, 0x0591, 0x043D, 0x0D12, 0x0501, 0x040F, 0x0511, + 0x0431, 0x0C03, 0x04A1, 0x078D, 0x0581, 0x041E, 0x040D, 0x0C02, + 0x040E, 0x05DD, 0x057F, 0x079D, 0x042D, 0x0D9F, 0x0502, 0x056E, + 0x0412, 0x071F, 0x044E, 0x05BF, 0x0C1D, 0x0482, 0x05AE, 0x042F, + 0x057D, 0x0491, 0x054E, 0x047D, 0x0DBE, 0x0611, 0x0492, 0x0601, + 0x05BD, 0x05CD, 0x0426, 0x05ED, 0x072D, 0x073D, 0x0483, 0x0F5D, + 0x0421, 0x056F, 0x053F, 0x058E, 0x054F, 0x078F, 0x053E, 0x059E, + 0x057E, 0x051F, 0x055D, 0x0413, 0x070D, 0x05AF, 0x0411, 0x0453, + 0x0D5E, 0x077E, 0x052F, 0x070F, 0x074D, 0x0692, 0x070E, 0x072F, + 0x072E, 0x058F, 0x071D, 0x052E, 0x0417, 0x073E, 0x0781, 0x078E, + 0x055F, 0x060E, 0x0622, 0x0683, 0x0702, 0x071E, 0x076F, 0x079E, + 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_8x6 { + 8, 6, + { 273, 186 }, + { 880, 300 }, + { 0, 64 }, + { percentile_arr_8x6_0, percentile_arr_8x6_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 8) +static const uint16_t percentile_arr_8x8_0[347] { + 0x0334, 0xFD44, 0xDD14, 0x9154, 0x9B08, 0x906A, 0x8928, 0x8108, + 0xE866, 0xC918, 0x606F, 0xC0FE, 0x5963, 0x58EE, 0x6534, 0x505A, + 0x51E2, 0xA8CF, 0x5354, 0x5314, 0x5134, 0x5524, 0x48F3, 0x504B, + 0x487E, 0x5344, 0x49C3, 0x4972, 0x49F2, 0x4856, 0xD0EF, 0x81D2, + 0x78DE, 0x4261, 0x3AC1, 0x71E3, 0x6879, 0x390C, 0x3143, 0x31B3, + 0x385F, 0x3153, 0x306E, 0x3037, 0x30DF, 0x3162, 0x304F, 0x3075, + 0xB03B, 0x2847, 0x28E3, 0x2914, 0x507F, 0x28BF, 0x5173, 0x5073, + 0x20D3, 0x2A06, 0x2827, 0x2508, 0x2229, 0x29D3, 0x204A, 0x207A, + 0x2046, 0x4148, 0x20FD, 0x4225, 0x23A1, 0x3944, 0x2065, 0x1924, + 0x2324, 0x1806, 0x19F1, 0x2215, 0x1876, 0x22AD, 0x502B, 0x1B04, + 0x18F2, 0x3A4D, 0x3216, 0x3504, 0x18DD, 0x1B21, 0x10CE, 0x1869, + 0x1B41, 0x1855, 0x1207, 0x1AE1, 0x2845, 0x19D1, 0x2A0A, 0x1A2D, + 0x2A1A, 0x11C2, 0x1A0B, 0x1217, 0x2816, 0x121B, 0x1271, 0x2AD1, + 0x1035, 0x1015, 0x287D, 0x12F1, 0x43C1, 0x1171, 0x1A05, 0x08E2, + 0x11E1, 0x3251, 0x2049, 0x20F1, 0x12CD, 0x0A39, 0x1219, 0x1059, + 0x1104, 0x1036, 0x1872, 0x3007, 0x08ED, 0x205E, 0x1026, 0x0952, + 0x1392, 0x1019, 0x0951, 0x100A, 0x13EE, 0x08D2, 0x1242, 0x0ABD, + 0x22A2, 0x0BDF, 0x2B81, 0x0A35, 0x13B1, 0x0839, 0x13BF, 0x0A33, + 0x1B31, 0x205D, 0x1241, 0x183A, 0x2025, 0x0B93, 0x0A3D, 0x1017, + 0x1313, 0x1253, 0x082A, 0x204E, 0x09A2, 0x080B, 0x0A1F, 0x125D, + 0x0A2E, 0x081A, 0x08D1, 0x082F, 
0x086D, 0x1B82, 0x0A09, 0x0B22, + 0x1062, 0x11A3, 0x2161, 0x0923, 0x129F, 0x1A62, 0x0871, 0x0942, + 0x081B, 0x1133, 0x18AE, 0x0A9E, 0x0863, 0x09FF, 0x18C2, 0x0B51, + 0x08BD, 0x0AA3, 0x09B1, 0x1AC2, 0x08B3, 0x0829, 0x0BEF, 0x0B83, + 0x0AAE, 0x0A8D, 0x1857, 0x185B, 0x08AF, 0x103F, 0x08C3, 0x09B2, + 0x0A4E, 0x11C1, 0x0A31, 0x0B42, 0x0A83, 0x0BFF, 0x13DD, 0x00CD, + 0x0AB3, 0x0842, 0x08BE, 0x0922, 0x1A8E, 0x08E1, 0x002E, 0x0BA2, + 0x0A8F, 0x2263, 0x0252, 0x0B32, 0x0AC3, 0x0941, 0x0A43, 0x083D, + 0x083E, 0x0A3E, 0x084D, 0x1131, 0x136F, 0x0AB1, 0x0193, 0x0BFD, + 0x0391, 0x0851, 0x13AF, 0x0843, 0x0213, 0x1226, 0x0932, 0x03B2, + 0x0902, 0x0BCD, 0x0221, 0x089E, 0x00B1, 0x0BDE, 0x03FE, 0x02A1, + 0x0982, 0x009F, 0x080E, 0x0B5F, 0x02BE, 0x0A32, 0x0A2A, 0x01EE, + 0x0053, 0x0AB2, 0x0192, 0x09FD, 0x0052, 0x0B03, 0x0293, 0x00A2, + 0x0B7F, 0x0BED, 0x0311, 0x08B2, 0x0A72, 0x088E, 0x0333, 0x0B12, + 0x0A23, 0x0822, 0x0083, 0x11CE, 0x021D, 0x08A3, 0x088F, 0x029D, + 0x0A22, 0x0A3F, 0x01FE, 0x020F, 0x0983, 0x02D2, 0x0292, 0x0B23, + 0x001E, 0x0BCF, 0x03CE, 0x09AF, 0x0B02, 0x0301, 0x022F, 0x137E, + 0x021E, 0x09EF, 0x016F, 0x0112, 0x097E, 0x080F, 0x020D, 0x0092, + 0x01DE, 0x09DF, 0x0032, 0x0033, 0x0A82, 0x03BE, 0x0B6E, 0x001F, + 0x020E, 0x0023, 0x09CF, 0x0113, 0x0103, 0x0013, 0x0BAE, 0x0203, + 0x0BAD, 0x01BF, 0x034F, 0x095F, 0x036D, 0x0202, 0x017F, 0x0093, + 0x0201, 0x034D, 0x0212, 0x035D, 0x03BD, 0x0B3F, 0x035E, 0x0211, + 0x0281, 0x0291, 0x032E, 0x037D, 0x034E, 0x038E, 0x039F, 0x032F, + 0x033E, 0x038F, 0x039E +}; + +static const uint16_t percentile_arr_8x8_1[208] { + 0x0621, 0x3443, 0x47CD, 0x97AE, 0xFC62, 0x14F1, 0x24C2, 0x25DF, + 0x3C33, 0x1C52, 0x9C72, 0x0FBE, 0x0C5D, 0x343E, 0x24A3, 0x1551, + 0x5D32, 0x1CD2, 0x15EF, 0x4E31, 0x04DD, 0x1FDD, 0x174F, 0x0DD1, + 0x3E0D, 0x15FF, 0x0DA2, 0x1E03, 0x17BD, 0x177D, 0x14B3, 0x0471, + 0x0CAE, 0x1C1F, 0x04D1, 0x0F6E, 0x0DFE, 0x1C42, 0x0C16, 0x0D22, + 0x0C9F, 0x2C2E, 0x0FAD, 0x0571, 0x147D, 0x0C07, 0x04B2, 0x0F6D, + 0x0F5E, 0x07AF, 0x146D, 0x0C51, 0x0593, 0x2583, 0x0C4E, 0x040B, + 0x0C35, 0x0513, 0x0E91, 0x0406, 0x073F, 0x144D, 0x0561, 0x048F, + 0x0F01, 0x0F4E, 0x0CA2, 0x075F, 0x1682, 0x04E1, 0x0C1A, 0x04BD, + 0x0542, 0x0D41, 0x0DEE, 0x04CD, 0x0DCF, 0x04B1, 0x0C15, 0x0C3D, + 0x0423, 0x0592, 0x0DDE, 0x0422, 0x0432, 0x05FD, 0x0DC1, 0x05B1, + 0x0DCE, 0x0612, 0x0C2F, 0x0445, 0x0602, 0x0531, 0x0439, 0x0E81, + 0x0582, 0x0C61, 0x061D, 0x049E, 0x0405, 0x0409, 0x0DBE, 0x079F, + 0x0D21, 0x04C1, 0x0C0A, 0x0E13, 0x04AD, 0x040E, 0x0581, 0x0419, + 0x05DD, 0x0D03, 0x049D, 0x0449, 0x0429, 0x048E, 0x0DA1, 0x0425, + 0x0512, 0x0501, 0x0431, 0x0523, 0x0441, 0x042D, 0x040F, 0x0D7D, + 0x0511, 0x0502, 0x05BF, 0x04A1, 0x0C03, 0x0402, 0x079D, 0x05AE, + 0x075D, 0x057F, 0x041D, 0x048D, 0x042A, 0x0453, 0x05AF, 0x078D, + 0x0C0D, 0x073D, 0x0491, 0x0591, 0x05BD, 0x072D, 0x057E, 0x051F, + 0x0482, 0x0492, 0x041E, 0x0412, 0x0D9F, 0x0421, 0x0493, 0x0711, + 0x056E, 0x059E, 0x054E, 0x0611, 0x05ED, 0x074D, 0x070F, 0x056F, + 0x052F, 0x053F, 0x071F, 0x054F, 0x05CD, 0x0483, 0x055E, 0x072F, + 0x0E01, 0x0426, 0x058F, 0x0413, 0x078F, 0x071D, 0x055F, 0x058E, + 0x0411, 0x053E, 0x071E, 0x055D, 0x077E, 0x052E, 0x0692, 0x0417, + 0x070D, 0x078E, 0x070E, 0x072E, 0x041B, 0x060E, 0x0622, 0x0683, + 0x068D, 0x0702, 0x073E, 0x076F, 0x0781, 0x079E, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_8x8 { + 8, 8, + { 347, 208 }, + { 1144, 267 }, + { 0, 38 }, + { percentile_arr_8x8_0, percentile_arr_8x8_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 5) +static const uint16_t percentile_arr_10x5_0[274] { + 
0x0165, 0xF975, 0xD866, 0xC056, 0xA946, 0x90C6, 0x90F5, 0x8963, + 0x80D6, 0x80E6, 0x60F3, 0x61C3, 0x59F2, 0xA927, 0x5075, 0x4847, + 0x5153, 0x4955, 0x49E2, 0x48B6, 0x41D2, 0x4943, 0x8305, 0x8172, + 0x4046, 0x4037, 0x40A7, 0x70B7, 0x7AC1, 0x31E3, 0x7027, 0x30E5, + 0x69D3, 0x99B3, 0x3315, 0x6115, 0x3136, 0x3076, 0x3173, 0x30D5, + 0x3106, 0x8962, 0x2916, 0x30C7, 0x5126, 0x30D3, 0x2956, 0x5117, + 0x2B41, 0x2AE1, 0x2A61, 0x29F1, 0x2306, 0x2145, 0x4A85, 0x2057, + 0x40E3, 0x4137, 0x3B21, 0x23C1, 0x2065, 0x1925, 0x51C2, 0x5225, + 0x4935, 0x1AD1, 0x23A1, 0x19D1, 0x1A71, 0x4055, 0x1873, 0x1A86, + 0x1295, 0x18F2, 0x28A6, 0x1952, 0x4AA5, 0x20B5, 0x10C5, 0x2AA2, + 0x11E1, 0x1107, 0x10D2, 0x2171, 0x1351, 0x3036, 0x1331, 0x1BEE, + 0x2035, 0x1045, 0x1313, 0x0A15, 0x1087, 0x1296, 0x13EF, 0x18E2, + 0x1151, 0x1086, 0x10F1, 0x08A5, 0x12C2, 0x1BFF, 0x1095, 0x1A62, + 0x1322, 0x0942, 0x1026, 0x1872, 0x1062, 0x0897, 0x1123, 0x08D1, + 0x1A06, 0x0806, 0x137F, 0x13B1, 0x13DF, 0x1A51, 0x09B1, 0x0A83, + 0x1015, 0x22F1, 0x0961, 0x0B81, 0x12B3, 0x0A35, 0x0AA3, 0x20B3, + 0x08C3, 0x2342, 0x0933, 0x0A33, 0x09A2, 0x10C2, 0x0896, 0x2205, + 0x0825, 0x20E1, 0x0922, 0x1242, 0x0B16, 0x0B32, 0x09A3, 0x0AC3, + 0x0BBF, 0x0B93, 0x0071, 0x0931, 0x0A41, 0x2392, 0x13FE, 0x09C1, + 0x0B07, 0x0016, 0x1182, 0x09B2, 0x0A26, 0x0132, 0x0941, 0x0A93, + 0x0992, 0x1063, 0x1217, 0x01FF, 0x11EE, 0x1216, 0x0B23, 0x0B82, + 0x0042, 0x1102, 0x0213, 0x0B6F, 0x09FE, 0x1207, 0x0807, 0x18B1, + 0x0253, 0x0AB1, 0x08A2, 0x13FD, 0x01FD, 0x1983, 0x0AB2, 0x0A31, + 0x016F, 0x0B11, 0x00B2, 0x0851, 0x0AD2, 0x0993, 0x0BDD, 0x12A1, + 0x017F, 0x0A97, 0x1022, 0x0383, 0x0843, 0x0A52, 0x03A2, 0x097E, + 0x0817, 0x03B2, 0x0A43, 0x09EF, 0x0A63, 0x0B33, 0x0B03, 0x0292, + 0x0272, 0x09CE, 0x0287, 0x136D, 0x0053, 0x0B12, 0x0083, 0x0892, + 0x0112, 0x1282, 0x03ED, 0x0852, 0x0301, 0x1391, 0x0232, 0x0B7E, + 0x0221, 0x08A3, 0x0BCD, 0x0BCF, 0x036E, 0x09DE, 0x0103, 0x03DE, + 0x0832, 0x0BAF, 0x0302, 0x13CE, 0x035F, 0x0093, 0x0A23, 0x01DF, + 0x0013, 0x0A22, 0x0023, 0x0113, 0x09AF, 0x01BF, 0x0033, 0x095F, + 0x0203, 0x0281, 0x09CF, 0x037D, 0x0201, 0x0B4D, 0x03AE, 0x03BE, + 0x0291, 0x035E, 0x038E, 0x0B9F, 0x03AD, 0x0202, 0x034F, 0x0211, + 0x035D, 0x0212, 0x032E, 0x039E, 0x033F, 0x034E, 0x03BD, 0x032F, + 0x033E, 0x038F +}; + +static const uint16_t percentile_arr_10x5_1[180] { + 0x0532, 0xFCA3, 0x3621, 0x6E82, 0x2CC2, 0x3D51, 0x3F01, 0x2691, + 0x17AE, 0x35A2, 0x74B3, 0x1603, 0x4433, 0x3C43, 0x6C35, 0x25D1, + 0x1D13, 0x15DF, 0x37CD, 0x0D93, 0x1D22, 0x0E81, 0x1452, 0x0CD2, + 0x37BE, 0x0CB2, 0x3407, 0x1523, 0x0C16, 0x0CB5, 0x0C96, 0x1486, + 0x2631, 0x1506, 0x0F4F, 0x1583, 0x0CD1, 0x2CA2, 0x2612, 0x1613, + 0x1602, 0x1F11, 0x179F, 0x17BD, 0x15B1, 0x0406, 0x1D41, 0x0CF1, + 0x0D31, 0x0442, 0x1C62, 0x0F6E, 0x077D, 0x0C51, 0x0445, 0x0D15, + 0x2592, 0x0CB1, 0x05EF, 0x0542, 0x17AF, 0x1425, 0x075E, 0x0FAD, + 0x0CC1, 0x0503, 0x0512, 0x15C1, 0x0C95, 0x0415, 0x0505, 0x0F4E, + 0x04A5, 0x0493, 0x0C32, 0x0F5F, 0x04E1, 0x0521, 0x0C85, 0x07DD, + 0x0582, 0x15FF, 0x05CF, 0x0405, 0x0D91, 0x05A1, 0x05FE, 0x0C23, + 0x0561, 0x0472, 0x0471, 0x0C22, 0x0DEE, 0x076D, 0x0502, 0x0426, + 0x0C61, 0x0D7D, 0x0525, 0x05DE, 0x0DCE, 0x079D, 0x0692, 0x0441, + 0x0C91, 0x05DD, 0x0511, 0x057F, 0x0611, 0x0DFD, 0x078D, 0x056E, + 0x0492, 0x04A1, 0x073F, 0x0C31, 0x05BE, 0x0483, 0x0571, 0x056F, + 0x0D9F, 0x0581, 0x0501, 0x057E, 0x05BF, 0x078F, 0x0516, 0x05ED, + 0x0402, 0x0F7E, 0x0482, 0x054E, 0x075D, 0x071F, 0x05CD, 0x0535, + 0x05AE, 0x0C11, 0x058F, 0x05AF, 0x0421, 0x0413, 0x0601, 0x054F, + 0x073D, 0x059E, 0x0487, 0x070F, 0x078E, 0x0781, 
0x053E, 0x0403, + 0x072D, 0x055D, 0x05BD, 0x079E, 0x0D8E, 0x0412, 0x052E, 0x074D, + 0x053F, 0x051F, 0x070E, 0x055F, 0x072F, 0x052F, 0x070D, 0x055E, + 0x0417, 0x0453, 0x072E, 0x0622, 0x0683, 0x0702, 0x071D, 0x071E, + 0x073E, 0x076F, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_10x5 { + 10, 5, + { 274, 180 }, + { 954, 324 }, + { 0, 79 }, + { percentile_arr_10x5_0, percentile_arr_10x5_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 6) +static const uint16_t percentile_arr_10x6_0[325] { + 0x01A4, 0xF954, 0xA066, 0x9975, 0x80F5, 0x7056, 0x6918, 0x6963, + 0x58C6, 0x5946, 0x5928, 0x5174, 0x586F, 0xA0E6, 0x5108, 0x48D6, + 0x49E2, 0x40F3, 0x9172, 0x41F2, 0xB875, 0x3927, 0x39C3, 0xA953, + 0x3934, 0x3305, 0x30B6, 0x6943, 0x31D2, 0x3876, 0x3037, 0x2955, + 0x30A7, 0x32C1, 0x29B3, 0x3027, 0x287E, 0x30B7, 0x29E3, 0x5846, + 0x2B15, 0x2847, 0x3162, 0x5173, 0x4936, 0x285F, 0x48D3, 0x2164, + 0x4906, 0x20E5, 0x2915, 0x2116, 0x407F, 0x20D5, 0x2A61, 0x4117, + 0x20E3, 0x2126, 0x4148, 0x206E, 0x39D3, 0x2145, 0x41B4, 0x1B06, + 0x2114, 0x2165, 0x5321, 0x5A85, 0x1A4D, 0x1A1F, 0x19F1, 0x3341, + 0x184F, 0x1956, 0x3125, 0x30C7, 0x28F2, 0x1937, 0x1AE1, 0x1073, + 0x1BA1, 0x1935, 0x110C, 0x1BC1, 0x3A25, 0x19C2, 0x1295, 0x122E, + 0x1944, 0x11D1, 0x1124, 0x1857, 0x22D1, 0x2286, 0x1A2D, 0x12A2, + 0x2107, 0x1055, 0x2065, 0x0A71, 0x2152, 0x10C5, 0x10D2, 0x1331, + 0x08B5, 0x1171, 0x2836, 0x10A6, 0x0904, 0x123D, 0x20F1, 0x12A5, + 0x10E2, 0x107D, 0x1AF1, 0x1313, 0x0951, 0x11E1, 0x1B22, 0x1B51, + 0x0835, 0x101E, 0x0A5D, 0x0A15, 0x3045, 0x0A96, 0x08A5, 0x1142, + 0x12A3, 0x1872, 0x085D, 0x09B1, 0x100E, 0x0887, 0x0886, 0x086D, + 0x0933, 0x12B3, 0x0897, 0x08B3, 0x0A33, 0x0923, 0x1095, 0x0BEE, + 0x2BB1, 0x085E, 0x1283, 0x0A51, 0x1026, 0x0A06, 0x12C2, 0x08D1, + 0x11A2, 0x13BF, 0x08C3, 0x10C2, 0x0A3E, 0x0BDF, 0x0B81, 0x13EF, + 0x0A35, 0x0B16, 0x082F, 0x2161, 0x1B32, 0x0806, 0x084E, 0x11A3, + 0x1015, 0x1122, 0x2931, 0x0342, 0x0825, 0x0A0F, 0x0896, 0x0A05, + 0x0241, 0x09C1, 0x083F, 0x0A42, 0x0071, 0x0B07, 0x082E, 0x0393, + 0x12B1, 0x0A62, 0x0226, 0x0A2F, 0x0B92, 0x0063, 0x0932, 0x0862, + 0x09FF, 0x0A31, 0x00E1, 0x12B2, 0x09B2, 0x0AC3, 0x0941, 0x0293, + 0x1323, 0x104D, 0x003E, 0x083D, 0x0992, 0x1382, 0x03FF, 0x0A13, + 0x1016, 0x0A53, 0x0182, 0x1007, 0x0AA1, 0x080F, 0x0A16, 0x0A1E, + 0x0042, 0x0902, 0x13DD, 0x0BB2, 0x0A63, 0x00A2, 0x08B1, 0x03FE, + 0x1207, 0x08B2, 0x0B83, 0x09EE, 0x0311, 0x0A87, 0x0BAF, 0x03A2, + 0x09FD, 0x0051, 0x0B33, 0x020D, 0x09CE, 0x0217, 0x021D, 0x0817, + 0x020E, 0x0A4E, 0x001F, 0x0BFD, 0x0297, 0x0983, 0x0A92, 0x0252, + 0x0243, 0x0B03, 0x0193, 0x036F, 0x0B12, 0x0043, 0x0822, 0x0A21, + 0x01FE, 0x0853, 0x037F, 0x023F, 0x0BED, 0x02D2, 0x0B91, 0x0232, + 0x0282, 0x0912, 0x08A3, 0x0852, 0x0223, 0x0BCD, 0x0083, 0x0301, + 0x0832, 0x01EF, 0x0892, 0x0302, 0x0A72, 0x03DE, 0x0893, 0x0BCF, + 0x09DE, 0x03CE, 0x035F, 0x0833, 0x0023, 0x0103, 0x017E, 0x0813, + 0x01CF, 0x01BF, 0x016F, 0x0A22, 0x037E, 0x0113, 0x01AF, 0x0B6E, + 0x03BE, 0x0201, 0x0A03, 0x01DF, 0x036D, 0x03AE, 0x015F, 0x0281, + 0x033E, 0x0A02, 0x038E, 0x017F, 0x0291, 0x034D, 0x03BD, 0x0B7D, + 0x03AD, 0x0211, 0x0212, 0x034F, 0x032E, 0x039F, 0x034E, 0x035D, + 0x035E, 0x033F, 0x039E, 0x032F, 0x038F +}; + +static const uint16_t percentile_arr_10x6_1[199] { + 0x0621, 0xBD32, 0x5CA3, 0x1FAE, 0x64C2, 0x1D51, 0x6C33, 0xFC43, + 0x5CB3, 0x25A2, 0x2E82, 0x35D1, 0x4F01, 0x3FBE, 0x3691, 0x2DDF, + 0x2E03, 0x3FCD, 0x14D2, 0x1CF1, 0x0C52, 0x3C35, 0x2D22, 0x1513, + 0x1462, 0x54B2, 0x0E31, 0x4E81, 0x1593, 0x1D23, 0x1CD1, 0x14B5, + 0x2FBD, 0x0C07, 0x1D06, 
0x0DEF, 0x14A2, 0x1612, 0x1F4F, 0x0C16, + 0x1F7D, 0x0C96, 0x0486, 0x1F9F, 0x0D42, 0x4583, 0x0E02, 0x0472, + 0x0DB1, 0x1613, 0x0FAD, 0x0D41, 0x0F11, 0x0E0D, 0x1C42, 0x143E, + 0x076E, 0x04B1, 0x0FAF, 0x0D61, 0x0531, 0x0C71, 0x0DFF, 0x0DFE, + 0x0406, 0x0C45, 0x0451, 0x0D15, 0x05C1, 0x2CC1, 0x141F, 0x0CE1, + 0x0FDD, 0x0C22, 0x0582, 0x0D92, 0x0571, 0x0F6D, 0x0C93, 0x045D, + 0x0F5E, 0x044D, 0x0423, 0x0D05, 0x0425, 0x0C95, 0x04A5, 0x0DCE, + 0x075F, 0x0E1D, 0x0503, 0x042E, 0x0D91, 0x0512, 0x0DDE, 0x05A1, + 0x074E, 0x0C32, 0x0431, 0x0415, 0x0D21, 0x05EE, 0x040E, 0x0DDD, + 0x0485, 0x1525, 0x0491, 0x0C26, 0x046D, 0x0C05, 0x05CF, 0x05FD, + 0x0E92, 0x073F, 0x0C0D, 0x043D, 0x0502, 0x0C1E, 0x041D, 0x0461, + 0x04A1, 0x0511, 0x0581, 0x05BD, 0x0C41, 0x059F, 0x05BF, 0x040F, + 0x0C7D, 0x0402, 0x054E, 0x057D, 0x0403, 0x078D, 0x05AE, 0x042D, + 0x0483, 0x079D, 0x0D7F, 0x0482, 0x0611, 0x056E, 0x0516, 0x05BE, + 0x0535, 0x044E, 0x05AF, 0x0DED, 0x042F, 0x0492, 0x058E, 0x078F, + 0x0412, 0x057E, 0x053E, 0x0F1F, 0x073D, 0x0601, 0x0501, 0x075D, + 0x059E, 0x05CD, 0x053F, 0x054F, 0x055E, 0x055D, 0x0421, 0x074D, + 0x051F, 0x072F, 0x0781, 0x0411, 0x0D6F, 0x077E, 0x0487, 0x070E, + 0x070F, 0x072D, 0x058F, 0x078E, 0x079E, 0x052E, 0x0413, 0x072E, + 0x071D, 0x052F, 0x055F, 0x073E, 0x0417, 0x0453, 0x060E, 0x0622, + 0x0683, 0x0702, 0x070D, 0x071E, 0x076F, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_10x6 { + 10, 6, + { 325, 199 }, + { 922, 381 }, + { 0, 78 }, + { percentile_arr_10x6_0, percentile_arr_10x6_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 8) +static const uint16_t percentile_arr_10x8_0[400] { + 0x0154, 0xAB34, 0xAD44, 0x8308, 0x7866, 0x7B64, 0x79A4, 0x7975, + 0x686A, 0x6908, 0xC514, 0x6174, 0x6128, 0x6118, 0x5B54, 0x5163, + 0xF856, 0x50F5, 0x986F, 0xDD34, 0x48FE, 0x4972, 0x48E6, 0x4146, + 0x48EE, 0x40F3, 0x4AC1, 0x38C6, 0x41E2, 0xBB05, 0x707E, 0x38D6, + 0x3927, 0x6B14, 0x384B, 0x3948, 0x3153, 0x385A, 0x3134, 0x6B15, + 0x39F2, 0x30CF, 0x3143, 0x91D2, 0x31C3, 0x60EF, 0x5973, 0x3076, + 0x28D3, 0x3261, 0x2875, 0x28DE, 0x290C, 0x51E3, 0x28A7, 0x20E3, + 0x2962, 0x2B06, 0x2917, 0x483B, 0x20B6, 0x2D24, 0x206E, 0x285F, + 0x20B7, 0x2936, 0x4047, 0x2037, 0x20DF, 0x28BF, 0x21B4, 0x21B3, + 0x1D08, 0x2027, 0x404F, 0x3846, 0x2116, 0x187F, 0x1879, 0x2285, + 0x1A29, 0x3915, 0x4873, 0x1955, 0x3114, 0x1B44, 0x2165, 0x107A, + 0x1956, 0x6137, 0x1106, 0x3145, 0x1B21, 0x19D3, 0x12AD, 0x1B41, + 0x1AD1, 0x1126, 0x18F2, 0x282B, 0x40E5, 0x20D5, 0x2A0A, 0x284A, + 0x1286, 0x1295, 0x121A, 0x2A0B, 0x321B, 0x122D, 0x10FD, 0x13A1, + 0x32A2, 0x12E1, 0x1164, 0x13C1, 0x124D, 0x1239, 0x4504, 0x10C7, + 0x22F1, 0x11F1, 0x0AC2, 0x2125, 0x1225, 0x0B04, 0x1107, 0x1069, + 0x1A19, 0x13BF, 0x2A96, 0x08D2, 0x1271, 0x0952, 0x2BDF, 0x0B31, + 0x1251, 0x2124, 0x0B13, 0x12BD, 0x1233, 0x13EE, 0x2144, 0x0B16, + 0x0A15, 0x18E2, 0x08DD, 0x1097, 0x0857, 0x0B24, 0x0AA5, 0x12A3, + 0x11C2, 0x11D1, 0x10CE, 0x0865, 0x123D, 0x08B3, 0x0B51, 0x1971, + 0x0A41, 0x0A06, 0x1039, 0x080A, 0x0B22, 0x0923, 0x0836, 0x08C3, + 0x0A1F, 0x1072, 0x080B, 0x0935, 0x0855, 0x18A6, 0x0A42, 0x1133, + 0x0A83, 0x0A09, 0x0ACD, 0x0A2E, 0x0887, 0x083A, 0x10C5, 0x085E, + 0x13B1, 0x087D, 0x0819, 0x0A9F, 0x0049, 0x08F1, 0x0BEF, 0x1161, + 0x0B42, 0x09E1, 0x0A05, 0x0904, 0x12AE, 0x029E, 0x0A31, 0x09FF, + 0x0951, 0x0859, 0x001A, 0x082F, 0x0B81, 0x08B5, 0x0A35, 0x082A, + 0x08ED, 0x1142, 0x1262, 0x0B32, 0x08A5, 0x12D2, 0x03DD, 0x0B07, + 0x18AE, 0x083F, 0x00AF, 0x0AB3, 0x086D, 0x0287, 0x0A93, 0x025D, + 0x0816, 0x13FF, 0x0A8D, 0x005D, 0x08D1, 0x0392, 0x0845, 0x0AC3, + 0x08C2, 
0x01A3, 0x0AB1, 0x09A2, 0x005B, 0x0B93, 0x02B2, 0x1086, + 0x001B, 0x0863, 0x0216, 0x0AA1, 0x0896, 0x0A8F, 0x084E, 0x0A8E, + 0x0A53, 0x0026, 0x0A26, 0x0382, 0x0807, 0x0862, 0x0029, 0x0871, + 0x00BD, 0x0835, 0x024E, 0x0806, 0x0941, 0x0895, 0x03AF, 0x0A13, + 0x0932, 0x03ED, 0x0BFD, 0x0207, 0x0B83, 0x0993, 0x09B1, 0x03CD, + 0x0A3E, 0x03FE, 0x0A21, 0x0015, 0x0B11, 0x0A43, 0x00E1, 0x136F, + 0x00BE, 0x00A2, 0x0842, 0x0043, 0x0825, 0x082E, 0x0A2A, 0x03DE, + 0x0BA2, 0x0122, 0x0BCF, 0x004D, 0x0323, 0x09C1, 0x0292, 0x083E, + 0x0252, 0x0017, 0x0A72, 0x00CD, 0x0182, 0x0A63, 0x0131, 0x09B2, + 0x0303, 0x0902, 0x0053, 0x035F, 0x0A32, 0x003D, 0x0992, 0x0A2F, + 0x03B2, 0x0ABE, 0x009F, 0x0183, 0x0312, 0x08B1, 0x0B02, 0x0A17, + 0x0B7F, 0x0333, 0x0297, 0x0A23, 0x020F, 0x0282, 0x0851, 0x0822, + 0x03CE, 0x01EE, 0x000E, 0x08B2, 0x0083, 0x0A1D, 0x00A3, 0x0222, + 0x088F, 0x0112, 0x029D, 0x0092, 0x0A3F, 0x0391, 0x089E, 0x0301, + 0x01FD, 0x09BF, 0x01CE, 0x0852, 0x01FE, 0x0013, 0x0903, 0x088E, + 0x037E, 0x021E, 0x01EF, 0x095F, 0x016F, 0x09DE, 0x03BE, 0x020E, + 0x0113, 0x01DF, 0x080F, 0x020D, 0x0833, 0x03AE, 0x0032, 0x03BD, + 0x0823, 0x001E, 0x01AF, 0x0203, 0x034F, 0x0093, 0x0A81, 0x036E, + 0x0291, 0x038E, 0x0A01, 0x001F, 0x017F, 0x01CF, 0x017E, 0x0202, + 0x0BAD, 0x0211, 0x035D, 0x035E, 0x039F, 0x0212, 0x032E, 0x033F, + 0x034D, 0x034E, 0x036D, 0x032F, 0x033E, 0x037D, 0x038F, 0x039E +}; + +static const uint16_t percentile_arr_10x8_1[221] { + 0x0621, 0xDFAE, 0x2443, 0x54C2, 0x37CD, 0x1CF1, 0xFCA3, 0x14D2, + 0x2D32, 0x5551, 0x7DDF, 0x5C33, 0x15D1, 0x3462, 0x24B3, 0x7452, + 0x5FBE, 0x6472, 0x65A2, 0x1D06, 0x445D, 0x15EF, 0x0E31, 0x1D71, + 0x343E, 0x0D42, 0x0CDD, 0x1F01, 0x4691, 0x1435, 0x0E82, 0x0DFF, + 0x17DD, 0x0D22, 0x24B2, 0x1603, 0x04B5, 0x24AE, 0x060D, 0x2D13, + 0x0C7D, 0x0496, 0x17BD, 0x1F4F, 0x1F7D, 0x1486, 0x0593, 0x1C16, + 0x0C07, 0x15FE, 0x041F, 0x14D1, 0x0C9F, 0x0E81, 0x0D15, 0x27AF, + 0x0C2E, 0x0D23, 0x176E, 0x0FAD, 0x1C06, 0x1561, 0x0DB1, 0x040B, + 0x1C4E, 0x0D83, 0x1711, 0x0C42, 0x0C71, 0x1C1A, 0x0D25, 0x04A2, + 0x0C45, 0x076D, 0x0F9F, 0x075F, 0x0E12, 0x046D, 0x048F, 0x1D92, + 0x0602, 0x0C39, 0x174E, 0x0C51, 0x0CA1, 0x075E, 0x05C1, 0x14BD, + 0x0D31, 0x0423, 0x0F3F, 0x0495, 0x0C93, 0x049E, 0x0D05, 0x04E1, + 0x0DEE, 0x0415, 0x04B1, 0x0503, 0x0CCD, 0x042F, 0x0DCF, 0x044D, + 0x0541, 0x1582, 0x05DE, 0x0D01, 0x0487, 0x040A, 0x0516, 0x0CA5, + 0x05FD, 0x05BF, 0x057D, 0x0DA1, 0x0426, 0x040F, 0x071F, 0x0613, + 0x0432, 0x0D12, 0x043D, 0x0425, 0x0461, 0x061D, 0x0D21, 0x0591, + 0x079D, 0x048D, 0x0429, 0x0C49, 0x04C1, 0x042A, 0x040E, 0x0485, + 0x0511, 0x0405, 0x0502, 0x0441, 0x0C19, 0x0692, 0x0535, 0x058F, + 0x041D, 0x059F, 0x072D, 0x04AD, 0x049D, 0x05CE, 0x048E, 0x0C31, + 0x057F, 0x078D, 0x0409, 0x041E, 0x05AE, 0x0611, 0x058E, 0x05DD, + 0x05CD, 0x056E, 0x0483, 0x073D, 0x054E, 0x0D9E, 0x0402, 0x0491, + 0x040D, 0x056F, 0x042D, 0x0581, 0x0421, 0x057E, 0x0781, 0x053E, + 0x0482, 0x078F, 0x0413, 0x052E, 0x0601, 0x0422, 0x0492, 0x055E, + 0x05BE, 0x0F9E, 0x072F, 0x074D, 0x0412, 0x070F, 0x075D, 0x05BD, + 0x051F, 0x071D, 0x073E, 0x077E, 0x0403, 0x0411, 0x078E, 0x055D, + 0x05AF, 0x05ED, 0x052F, 0x053F, 0x070D, 0x070E, 0x072E, 0x054F, + 0x0417, 0x041B, 0x0453, 0x055F, 0x060E, 0x0622, 0x0683, 0x068D, + 0x0702, 0x071E, 0x076F, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_10x8 = +{ + 10, 8, + { 400, 221 }, + { 1119, 376 }, + { 0, 52 }, + { percentile_arr_10x8_0, percentile_arr_10x8_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 10) +static const uint16_t percentile_arr_10x10_0[453] { + 0x0334, 0x9514, 
0x8954, 0x806A, 0x6F14, 0x6724, 0x6108, 0x6364, + 0x5175, 0x5D44, 0x5866, 0x5118, 0x5308, 0xA179, 0x5128, 0xF534, + 0x49A4, 0x5354, 0x9174, 0x486F, 0x48EA, 0x40F3, 0x4963, 0x414A, + 0xF8F9, 0x3984, 0x4172, 0x387E, 0x405A, 0x38DA, 0x38F5, 0x9B05, + 0x30EE, 0x32C1, 0x3261, 0x3D08, 0x31E2, 0x3056, 0x292B, 0x3146, + 0x3127, 0x3315, 0x58CA, 0x58E6, 0x290C, 0x3314, 0x8134, 0x28E3, + 0x28FE, 0x2948, 0x28C6, 0x78DE, 0x28BB, 0x68D6, 0x286E, 0x2173, + 0x2962, 0x21D2, 0x205F, 0x49F2, 0x2917, 0x2306, 0x207F, 0x404F, + 0x2153, 0x2943, 0x20CF, 0x21C3, 0x2073, 0x20D3, 0x2136, 0x183B, + 0x430A, 0x40A7, 0x18B6, 0x2079, 0x2309, 0x2075, 0x184B, 0x20EF, + 0x187A, 0x7837, 0x1B19, 0x20AB, 0x18BA, 0x20B7, 0x1994, 0x19E3, + 0x21B4, 0x49B3, 0x38BF, 0x193B, 0x1876, 0x182B, 0x30F2, 0x193A, + 0x1827, 0x1965, 0x1914, 0x184A, 0x4047, 0x1916, 0x1285, 0x1937, + 0x122D, 0x1915, 0x1321, 0x1955, 0x1046, 0x191B, 0x2106, 0x2919, + 0x1344, 0x1524, 0x12E1, 0x3926, 0x10E5, 0x2295, 0x1159, 0x1145, + 0x10DF, 0x124D, 0x1271, 0x092A, 0x2169, 0x1704, 0x22A2, 0x1164, + 0x13EE, 0x12F1, 0x0AD1, 0x128A, 0x110A, 0x11D3, 0x1286, 0x115A, + 0x2BA1, 0x0BBF, 0x3956, 0x2A89, 0x12AD, 0x10E9, 0x0B41, 0x1A29, + 0x2225, 0x08FD, 0x1107, 0x08D5, 0x191A, 0x1125, 0x1A96, 0x0B04, + 0x18D9, 0x2B16, 0x11F1, 0x0A33, 0x0924, 0x131A, 0x1149, 0x1324, + 0x0BEF, 0x0A99, 0x08CB, 0x123D, 0x1331, 0x0BDF, 0x0872, 0x22A3, + 0x0AC2, 0x1144, 0x0D04, 0x08D2, 0x08CE, 0x0AA9, 0x0A9A, 0x0B13, + 0x1251, 0x0865, 0x1069, 0x0897, 0x1215, 0x18B3, 0x1A62, 0x08C7, + 0x185E, 0x10E2, 0x0AA5, 0x21FF, 0x090B, 0x0952, 0x09E1, 0x0A42, + 0x08F1, 0x0A06, 0x0B22, 0x087D, 0x1139, 0x021F, 0x122E, 0x082F, + 0x09C2, 0x0887, 0x0A0A, 0x03C1, 0x0929, 0x0A5D, 0x0A83, 0x0BFF, + 0x0935, 0x085B, 0x0104, 0x08DD, 0x0923, 0x083F, 0x0241, 0x09D1, + 0x0A39, 0x0863, 0x0A8B, 0x08A6, 0x008B, 0x1133, 0x13B1, 0x089B, + 0x0AB3, 0x0036, 0x0BDD, 0x08ED, 0x0857, 0x0971, 0x0219, 0x1235, + 0x0AB1, 0x0ACD, 0x036F, 0x0A31, 0x08AA, 0x003A, 0x08C3, 0x0A05, + 0x02BD, 0x0B92, 0x0B07, 0x12B2, 0x08C5, 0x0B51, 0x0381, 0x0A8D, + 0x01A3, 0x0896, 0x0855, 0x0BFD, 0x005D, 0x0BFE, 0x023E, 0x08AF, + 0x00B9, 0x0A93, 0x00B5, 0x0862, 0x0A0B, 0x0A09, 0x0A72, 0x0332, + 0x0AA1, 0x08C9, 0x024E, 0x1382, 0x0951, 0x00A5, 0x0A2A, 0x0059, + 0x0A9E, 0x0B42, 0x004E, 0x0942, 0x03ED, 0x09B2, 0x02D2, 0x0849, + 0x0035, 0x0216, 0x0961, 0x0BAF, 0x00AE, 0x0826, 0x0287, 0x0A1A, + 0x0393, 0x0221, 0x09A2, 0x086D, 0x0226, 0x0871, 0x0039, 0x082A, + 0x08C2, 0x08E1, 0x0845, 0x0207, 0x0B23, 0x0015, 0x00D1, 0x0B83, + 0x037F, 0x0252, 0x08A9, 0x0099, 0x0A13, 0x0053, 0x0807, 0x03CD, + 0x0BDE, 0x0016, 0x089A, 0x0232, 0x035F, 0x0A8E, 0x0AC3, 0x022F, + 0x0263, 0x0829, 0x004D, 0x0132, 0x0806, 0x0311, 0x01B1, 0x0941, + 0x0086, 0x000B, 0x1122, 0x0025, 0x0842, 0x00BD, 0x0BCF, 0x03A2, + 0x0043, 0x0B03, 0x0895, 0x0A8F, 0x008A, 0x09EF, 0x0253, 0x0A1B, + 0x0182, 0x0243, 0x0A92, 0x00CD, 0x083E, 0x030B, 0x0223, 0x081A, + 0x0A9F, 0x0193, 0x00BE, 0x0017, 0x0931, 0x0391, 0x037E, 0x09C1, + 0x0312, 0x0333, 0x03B2, 0x083D, 0x08B1, 0x00B2, 0x002E, 0x021D, + 0x0A9D, 0x0192, 0x02AE, 0x0102, 0x0022, 0x081B, 0x0222, 0x009E, + 0x021E, 0x000A, 0x089F, 0x0217, 0x0BCE, 0x0052, 0x020F, 0x0A97, + 0x0282, 0x008E, 0x0A3F, 0x01FD, 0x00A3, 0x0019, 0x08A2, 0x0301, + 0x036E, 0x01FE, 0x03BE, 0x0ABE, 0x01CE, 0x0302, 0x029B, 0x0051, + 0x0883, 0x008F, 0x0BAE, 0x01DF, 0x0183, 0x0912, 0x000E, 0x020D, + 0x01EE, 0x0B4F, 0x0033, 0x0103, 0x020E, 0x0832, 0x01AF, 0x0913, + 0x01DE, 0x0203, 0x001E, 0x0092, 0x0093, 0x000F, 0x015F, 0x0291, + 0x0281, 0x0813, 0x001F, 0x01CF, 0x033F, 0x0023, 0x01BF, 0x0202, + 
0x016F, 0x017E, 0x03AD, 0x0201, 0x034E, 0x0BBD, 0x036D, 0x017F, + 0x0211, 0x038E, 0x0212, 0x032E, 0x034D, 0x035E, 0x037D, 0x039E, + 0x032F, 0x033E, 0x035D, 0x038F, 0x039F +}; + +static const uint16_t percentile_arr_10x10_1[234] { + 0x07CD, 0x6E21, 0x24F1, 0x8443, 0xD7AE, 0x24C2, 0x1C62, 0xCCA3, + 0x1C33, 0xFDEF, 0x2532, 0x55DF, 0x1472, 0x6C3E, 0x14D2, 0x34DD, + 0x1452, 0x745D, 0x4D51, 0x8DD1, 0x247D, 0x75FF, 0x0CB3, 0x17BE, + 0x6CAE, 0x17DD, 0x1571, 0x3D06, 0x4E31, 0x0DA2, 0x67BD, 0x160D, + 0x2C4E, 0x0D22, 0x176E, 0x3CB2, 0x142E, 0x4DFE, 0x0F4F, 0x1435, + 0x0F01, 0x0D42, 0x0F7D, 0x0CB5, 0x1E03, 0x149F, 0x1C96, 0x141F, + 0x14B9, 0x0FAF, 0x0439, 0x0E91, 0x2682, 0x1D13, 0x1FAD, 0x0407, + 0x3471, 0x0C86, 0x0F6D, 0x0D15, 0x0D61, 0x040B, 0x0C6D, 0x0C16, + 0x0C9A, 0x0D0A, 0x0593, 0x0CD1, 0x248F, 0x0C2F, 0x3C42, 0x1523, + 0x0445, 0x0E81, 0x0CA2, 0x1525, 0x0406, 0x1C8A, 0x0C1A, 0x04BD, + 0x0F5E, 0x0F3F, 0x1F4E, 0x0E1D, 0x0423, 0x0DCF, 0x044D, 0x0D92, + 0x0583, 0x0DB1, 0x1449, 0x15EE, 0x0F5F, 0x079F, 0x0D19, 0x0409, + 0x04CD, 0x05FD, 0x143D, 0x0612, 0x0D03, 0x0D82, 0x04B1, 0x0C95, + 0x0C2A, 0x049E, 0x05AF, 0x0D31, 0x05BE, 0x04E1, 0x0D05, 0x0516, + 0x0711, 0x05C1, 0x0509, 0x0D41, 0x0493, 0x048E, 0x0602, 0x05BF, + 0x0CA5, 0x0529, 0x0535, 0x0D12, 0x0539, 0x0451, 0x0C29, 0x071F, + 0x040A, 0x0F3D, 0x0432, 0x059F, 0x0425, 0x0C99, 0x05DE, 0x05CE, + 0x0C0F, 0x0489, 0x051A, 0x0501, 0x0415, 0x057F, 0x0431, 0x0E13, + 0x040D, 0x041D, 0x075D, 0x0C53, 0x0502, 0x04C1, 0x049D, 0x0426, + 0x040E, 0x05A1, 0x055F, 0x0781, 0x0591, 0x04A9, 0x048B, 0x0D8E, + 0x052E, 0x0412, 0x0521, 0x0405, 0x04AD, 0x074D, 0x0611, 0x077E, + 0x078F, 0x078D, 0x048D, 0x041E, 0x0487, 0x0461, 0x0C85, 0x05ED, + 0x0402, 0x0483, 0x0419, 0x0511, 0x0491, 0x0482, 0x059E, 0x068D, + 0x055D, 0x072E, 0x05DD, 0x054E, 0x0441, 0x0422, 0x052F, 0x057D, + 0x072D, 0x079D, 0x0CA1, 0x072F, 0x079E, 0x0581, 0x042D, 0x055E, + 0x0601, 0x0413, 0x0692, 0x0403, 0x051F, 0x053F, 0x054F, 0x05CD, + 0x070F, 0x071D, 0x05AE, 0x05BD, 0x0492, 0x056E, 0x0411, 0x0417, + 0x041B, 0x0421, 0x053E, 0x056F, 0x057E, 0x058F, 0x060E, 0x0622, + 0x0683, 0x0702, 0x070D, 0x070E, 0x071E, 0x073E, 0x076F, 0x078E, + 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_10x10 { + 10, 10, + { 453, 234 }, + { 1095, 472 }, + { 0, 70 }, + { percentile_arr_10x10_0, percentile_arr_10x10_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 10) +static const uint16_t percentile_arr_12x10_0[491] { + 0x0334, 0x9954, 0x8514, 0x7128, 0x6364, 0xC174, 0x5D34, 0x5866, + 0x5975, 0x5354, 0xAF14, 0x506A, 0x5108, 0x5724, 0x5308, 0x4544, + 0x4918, 0x4064, 0x49E2, 0x4179, 0x8163, 0x4054, 0xF81C, 0x394A, + 0x38F3, 0x4172, 0x38F5, 0xA06F, 0x68EA, 0x69F2, 0x3134, 0x31A4, + 0x305A, 0x68DA, 0x3056, 0x3146, 0x31F5, 0x3148, 0x5A61, 0x32C1, + 0x31D2, 0x307E, 0x29E3, 0x30E6, 0x59C3, 0x2984, 0x29B6, 0x28F9, + 0x5204, 0x28EE, 0x50CA, 0x2997, 0x48C6, 0x4838, 0x2953, 0x200C, + 0x2943, 0x2173, 0x2D08, 0x4162, 0x29B4, 0x2314, 0x21B3, 0x212B, + 0x210C, 0x48E3, 0x60DE, 0x205F, 0x20FE, 0x2028, 0x21A6, 0x404F, + 0x20D6, 0x2214, 0x2127, 0x1873, 0x40CF, 0x206E, 0x1B09, 0x21C6, + 0x2075, 0x19D5, 0x2305, 0x18D3, 0x2076, 0x1804, 0x230A, 0x304B, + 0x20BB, 0x18B6, 0x1936, 0x1B19, 0x3037, 0x187F, 0x18A7, 0x1B85, + 0x30BA, 0x183B, 0x1027, 0x18EF, 0x1B21, 0x1879, 0x10AB, 0x1917, + 0x1114, 0x18BF, 0x1074, 0x1994, 0x2847, 0x111B, 0x28F2, 0x11E5, + 0x19A7, 0x113A, 0x1046, 0x28B7, 0x207A, 0x182B, 0x1155, 0x104A, + 0x1344, 0x293B, 0x11D3, 0x2014, 0x1044, 0x1018, 0x13A1, 0x1315, + 0x2524, 0x20DF, 0x10E5, 0x1126, 0x12A2, 0x1824, 
0x2271, 0x11F1, + 0x2964, 0x12D1, 0x115A, 0x092A, 0x2341, 0x1A2D, 0x12E1, 0x090A, + 0x13BF, 0x0A4D, 0x2119, 0x0BC1, 0x1233, 0x1A8A, 0x2008, 0x1159, + 0x1A89, 0x08D5, 0x1156, 0x0834, 0x13EE, 0x1169, 0x1187, 0x1AA3, + 0x1229, 0x1331, 0x0A85, 0x0937, 0x1704, 0x08FD, 0x2124, 0x0B13, + 0x1251, 0x0AAD, 0x082C, 0x091A, 0x18D9, 0x0A99, 0x1848, 0x18E9, + 0x0B95, 0x1144, 0x0AF1, 0x1A25, 0x131A, 0x09C5, 0x0986, 0x1BDF, + 0x0B24, 0x0965, 0x1262, 0x0949, 0x0872, 0x09C2, 0x12C2, 0x0916, + 0x085E, 0x0B06, 0x08CB, 0x08C7, 0x1242, 0x1BEF, 0x0A9A, 0x1152, + 0x08B3, 0x0AA9, 0x090B, 0x08D2, 0x1B22, 0x0B04, 0x0865, 0x0A15, + 0x1286, 0x0A83, 0x0A95, 0x09D1, 0x0A06, 0x0196, 0x1139, 0x0A3D, + 0x0933, 0x13B1, 0x0123, 0x0D04, 0x08E2, 0x122E, 0x08A6, 0x00CE, + 0x0A31, 0x1241, 0x0B51, 0x1057, 0x1171, 0x007D, 0x1145, 0x0A0A, + 0x0129, 0x09FF, 0x089B, 0x085B, 0x0063, 0x0AB1, 0x0A1F, 0x0A5D, + 0x0AA5, 0x0036, 0x0904, 0x0B86, 0x0A8B, 0x0897, 0x11E1, 0x0332, + 0x083F, 0x0A19, 0x02B3, 0x0859, 0x08C3, 0x0855, 0x11B5, 0x01A5, + 0x0AB2, 0x0392, 0x10DD, 0x09A3, 0x00ED, 0x0907, 0x1161, 0x002F, + 0x0887, 0x0216, 0x0ABD, 0x0B81, 0x0A93, 0x0A21, 0x003A, 0x0ACD, + 0x0AA1, 0x0A35, 0x0272, 0x0BDD, 0x03FE, 0x0BAF, 0x0869, 0x0213, + 0x088B, 0x020B, 0x00B5, 0x1035, 0x08F1, 0x0151, 0x0A4E, 0x0239, + 0x0BA2, 0x00AA, 0x0896, 0x0382, 0x0A08, 0x0A05, 0x0A09, 0x0142, + 0x086D, 0x004E, 0x0B23, 0x0106, 0x0807, 0x036F, 0x0995, 0x03FD, + 0x08AF, 0x08C5, 0x0062, 0x0053, 0x0B42, 0x0826, 0x021A, 0x01A2, + 0x09B1, 0x00C9, 0x09B2, 0x0045, 0x0207, 0x08B9, 0x00A5, 0x0AD2, + 0x0095, 0x003E, 0x0A32, 0x0383, 0x0849, 0x0135, 0x029E, 0x0A26, + 0x023E, 0x0BFF, 0x0A52, 0x0311, 0x001B, 0x0915, 0x0A8D, 0x0223, + 0x022A, 0x0BED, 0x0086, 0x0A96, 0x0222, 0x035F, 0x0A43, 0x085D, + 0x0303, 0x0393, 0x0A63, 0x082A, 0x037F, 0x0932, 0x0043, 0x0292, + 0x03CD, 0x0BDE, 0x009F, 0x0125, 0x08A9, 0x0253, 0x0015, 0x0192, + 0x0A17, 0x08C2, 0x0316, 0x00D1, 0x0282, 0x0871, 0x0312, 0x0122, + 0x0A9F, 0x02AE, 0x0006, 0x0A8E, 0x08E1, 0x0016, 0x0B0B, 0x00AE, + 0x0025, 0x0193, 0x0AC3, 0x0017, 0x0307, 0x00BD, 0x08BE, 0x0039, + 0x0BB2, 0x021B, 0x01FD, 0x084D, 0x03CE, 0x00A3, 0x0302, 0x0BCF, + 0x0033, 0x0391, 0x028F, 0x0852, 0x0287, 0x008A, 0x0333, 0x080B, + 0x0131, 0x01C1, 0x037E, 0x0A0F, 0x00B1, 0x002E, 0x0099, 0x0902, + 0x009A, 0x003D, 0x0982, 0x0301, 0x00CD, 0x0941, 0x0042, 0x0183, + 0x029D, 0x08A2, 0x021D, 0x001A, 0x0A97, 0x01EF, 0x01CE, 0x0051, + 0x0BAE, 0x022F, 0x03BE, 0x021E, 0x000A, 0x09DF, 0x0029, 0x020D, + 0x02BE, 0x029B, 0x09EE, 0x00B2, 0x0912, 0x036E, 0x009E, 0x0022, + 0x0019, 0x0892, 0x0032, 0x01FE, 0x0083, 0x023F, 0x0B96, 0x000E, + 0x008F, 0x0113, 0x0103, 0x001E, 0x0A0E, 0x0013, 0x008E, 0x0281, + 0x09AF, 0x017E, 0x0203, 0x016F, 0x0291, 0x0023, 0x0093, 0x03BD, + 0x001F, 0x01CF, 0x01DE, 0x0201, 0x01BF, 0x0B4F, 0x000F, 0x0202, + 0x037D, 0x038E, 0x0211, 0x0212, 0x034E, 0x039F, 0x03AD, 0x015F, + 0x017F, 0x032E, 0x033F, 0x034D, 0x035E, 0x036D, 0x032F, 0x033E, + 0x035D, 0x038F, 0x039E +}; + +static const uint16_t percentile_arr_12x10_1[240] { + 0x0621, 0xA443, 0xFCC2, 0x3CA3, 0x1D32, 0x14F1, 0x7462, 0x1433, + 0x27CD, 0x2571, 0x57AE, 0x5DD1, 0x64B3, 0x44D2, 0x2C72, 0x25A2, + 0x1E31, 0x55DF, 0x4C52, 0x1DEF, 0x0D51, 0x3C5D, 0x3C3E, 0x74DD, + 0x347D, 0x27BE, 0x5CB5, 0x17DD, 0x2C14, 0x0CAE, 0x24B2, 0x15FF, + 0x2701, 0x0D42, 0x1FBD, 0x0C35, 0x1603, 0x060D, 0x1D93, 0x0C96, + 0x1C07, 0x1522, 0x0D06, 0x0F4F, 0x0C9F, 0x1F6E, 0x0D86, 0x0C2E, + 0x1DFE, 0x0682, 0x1E91, 0x0F7D, 0x0C86, 0x040B, 0x1513, 0x044E, + 0x14D1, 0x0C39, 0x14B9, 0x1C71, 0x05B1, 0x0C1F, 0x0681, 0x1445, + 0x0C16, 0x0D95, 
0x1583, 0x0D61, 0x0FAD, 0x1442, 0x048F, 0x0D0A, + 0x049A, 0x0F6D, 0x146D, 0x0C2F, 0x0D25, 0x0406, 0x0C1A, 0x0D23, + 0x0612, 0x0FAF, 0x0F11, 0x0592, 0x0515, 0x14E1, 0x0602, 0x048A, + 0x0E1D, 0x0CBD, 0x0F9F, 0x0423, 0x075E, 0x174E, 0x0426, 0x0404, + 0x0C22, 0x0CA2, 0x0DEE, 0x0CA5, 0x0F3F, 0x05C1, 0x0CCD, 0x0503, + 0x044D, 0x0D16, 0x0449, 0x0D82, 0x0613, 0x0585, 0x0519, 0x0C95, + 0x075F, 0x0D35, 0x04B1, 0x0509, 0x0531, 0x0DA1, 0x049E, 0x040A, + 0x05CF, 0x0D41, 0x0415, 0x0692, 0x05FD, 0x0C25, 0x04A1, 0x0529, + 0x0591, 0x0C93, 0x057F, 0x04C1, 0x0512, 0x051A, 0x078D, 0x0451, + 0x0C0F, 0x0487, 0x0611, 0x0432, 0x042A, 0x05AF, 0x0461, 0x072D, + 0x0409, 0x0405, 0x0D39, 0x05DE, 0x048E, 0x0499, 0x0483, 0x04A9, + 0x0491, 0x042D, 0x049D, 0x0429, 0x040E, 0x05AE, 0x0521, 0x043D, + 0x0581, 0x05DD, 0x0492, 0x0CAD, 0x041E, 0x058F, 0x071F, 0x072F, + 0x0419, 0x073D, 0x057D, 0x0511, 0x05CE, 0x041D, 0x0485, 0x056E, + 0x0412, 0x0431, 0x05BF, 0x0441, 0x054E, 0x0489, 0x0421, 0x0502, + 0x0408, 0x040D, 0x051F, 0x059F, 0x073E, 0x078F, 0x0482, 0x079D, + 0x0C02, 0x05BE, 0x048B, 0x0411, 0x0505, 0x057E, 0x052E, 0x074D, + 0x077E, 0x054F, 0x0601, 0x055F, 0x068D, 0x070D, 0x070F, 0x071E, + 0x072E, 0x05CD, 0x0403, 0x0501, 0x055D, 0x059E, 0x0781, 0x0413, + 0x0417, 0x041B, 0x0453, 0x048D, 0x052F, 0x053E, 0x053F, 0x055E, + 0x056F, 0x058E, 0x05BD, 0x05ED, 0x060E, 0x0622, 0x0683, 0x0702, + 0x070E, 0x071D, 0x075D, 0x076F, 0x078E, 0x079E, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_12x10 = +{ + 12, 10, + { 491, 240 }, + { 1099, 341 }, + { 0, 23 }, + { percentile_arr_12x10_0, percentile_arr_12x10_1 } +}; +#endif + +#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 12) +static const uint16_t percentile_arr_12x12_0[529] { + 0x0334, 0xF534, 0x8514, 0x8954, 0x7F14, 0xFB54, 0x7B08, 0x7128, + 0x7974, 0x6179, 0x6B64, 0x6908, 0x606A, 0x6724, 0xB544, 0xB066, + 0xA14A, 0x5118, 0x9975, 0x51F9, 0x981C, 0x49CA, 0x4854, 0x886F, + 0x88D4, 0x48EE, 0x41E2, 0x4163, 0x40F3, 0x4261, 0x4064, 0x407E, + 0x385A, 0x42C1, 0x4172, 0x38EA, 0x3946, 0x78CF, 0xA056, 0x38DE, + 0x3D08, 0x38F9, 0x3B14, 0x38FE, 0xA134, 0x38B8, 0x31A4, 0x71D2, + 0x60DA, 0x39C3, 0x99BA, 0x60CA, 0x39F2, 0x30F5, 0x304F, 0x31B6, + 0x31F5, 0x3204, 0x3148, 0x305F, 0x2953, 0x3194, 0x3184, 0x310C, + 0x889C, 0x300C, 0x2943, 0x30EF, 0x28C6, 0x2997, 0x2838, 0x58E6, + 0x20E4, 0x28E3, 0x2873, 0x29E3, 0x2A84, 0x28D3, 0x492B, 0x2962, + 0x286E, 0x20BF, 0x21AA, 0x29A6, 0x6A14, 0x2828, 0x89C6, 0x21B3, + 0x2305, 0x29B4, 0x2173, 0x2127, 0x20D6, 0x407F, 0x2294, 0x21D9, + 0x21D5, 0x2004, 0x404B, 0x18DF, 0x2079, 0x219B, 0x18A8, 0x2385, + 0x1936, 0x21AB, 0x188C, 0x1B09, 0x18BA, 0x203B, 0x187A, 0x1875, + 0x2344, 0x18BB, 0x18B6, 0x193A, 0x1837, 0x1914, 0x1846, 0x1876, + 0x1884, 0x1D24, 0x182B, 0x284A, 0x18A7, 0x18AB, 0x1917, 0x322D, + 0x1047, 0x1874, 0x1818, 0x18F2, 0x1164, 0x1B89, 0x2959, 0x1B21, + 0x39E5, 0x1827, 0x10F4, 0x18B7, 0x11D3, 0x1A4D, 0x1315, 0x12AD, + 0x1AD1, 0x3A71, 0x1319, 0x11A7, 0x2044, 0x2F04, 0x2341, 0x10E5, + 0x1155, 0x195A, 0x1024, 0x111B, 0x1251, 0x1233, 0x12E1, 0x13A1, + 0x13BF, 0x212A, 0x22A2, 0x113B, 0x23DF, 0x10D5, 0x2399, 0x0814, + 0x1126, 0x13EE, 0x1285, 0x10C4, 0x18FD, 0x20D9, 0x0987, 0x1242, + 0x29C5, 0x2313, 0x0898, 0x13C1, 0x08C8, 0x11F1, 0x1034, 0x1B24, + 0x0B0A, 0x11E9, 0x0808, 0x125D, 0x18E9, 0x0848, 0x1395, 0x0965, + 0x123D, 0x2186, 0x1295, 0x18CE, 0x098B, 0x0BEF, 0x1504, 0x082C, + 0x0A41, 0x1144, 0x0A89, 0x0956, 0x1331, 0x085E, 0x0B04, 0x128A, + 0x12A3, 0x1937, 0x19C2, 0x0952, 0x0872, 0x08B4, 0x1262, 0x1124, + 0x1969, 0x1063, 0x0AF1, 0x1225, 0x0894, 0x11C9, 
0x18D2, 0x0ACD, + 0x0A29, 0x0B06, 0x09B5, 0x18C7, 0x0916, 0x1088, 0x09FF, 0x2206, + 0x0A15, 0x08B3, 0x0B51, 0x0A1F, 0x18CB, 0x0AC2, 0x0A2E, 0x1865, + 0x08AC, 0x0A31, 0x08A4, 0x138A, 0x0A99, 0x09D1, 0x0A86, 0x189B, + 0x0283, 0x0BDD, 0x0ABD, 0x1933, 0x083F, 0x1386, 0x0923, 0x0322, + 0x0869, 0x10DD, 0x13B1, 0x082F, 0x087D, 0x11B9, 0x085B, 0x08ED, + 0x00C3, 0x08E2, 0x084E, 0x0887, 0x0855, 0x0A0A, 0x0857, 0x0B92, + 0x1036, 0x12A5, 0x0293, 0x0945, 0x08A6, 0x0196, 0x19A3, 0x036F, + 0x0904, 0x1205, 0x09E1, 0x0381, 0x0971, 0x1219, 0x0BAF, 0x0949, + 0x00AF, 0x0AA9, 0x018A, 0x0907, 0x0BFD, 0x003A, 0x0BCD, 0x0AB2, + 0x088B, 0x0252, 0x0A4E, 0x03FF, 0x0845, 0x0897, 0x0059, 0x090B, + 0x0B42, 0x0807, 0x0A16, 0x0853, 0x0A8D, 0x01B2, 0x0AB1, 0x091A, + 0x0195, 0x0A35, 0x00B5, 0x10AA, 0x0115, 0x0A21, 0x0096, 0x0A08, + 0x03FE, 0x0B7F, 0x08B9, 0x12B3, 0x023E, 0x0A23, 0x029E, 0x08F1, + 0x01A9, 0x0BDE, 0x0843, 0x02D2, 0x0A1A, 0x08C5, 0x0151, 0x0A43, + 0x0332, 0x0383, 0x0826, 0x0BED, 0x10C2, 0x00AE, 0x0B82, 0x0213, + 0x0232, 0x085D, 0x02A1, 0x101B, 0x035F, 0x0303, 0x0A39, 0x0207, + 0x0A53, 0x0142, 0x01A5, 0x082A, 0x0099, 0x0A17, 0x03CF, 0x0906, + 0x0125, 0x0A96, 0x0A9A, 0x0209, 0x0393, 0x0961, 0x0131, 0x0A88, + 0x0139, 0x099A, 0x0292, 0x0272, 0x0862, 0x08BE, 0x0141, 0x02C3, + 0x0886, 0x0039, 0x08A9, 0x01A2, 0x01B1, 0x0851, 0x020B, 0x086D, + 0x0312, 0x08CD, 0x020F, 0x0311, 0x0BCE, 0x0135, 0x0006, 0x0849, + 0x0132, 0x0A8F, 0x022F, 0x022A, 0x0AAE, 0x0A8E, 0x0263, 0x03A2, + 0x083E, 0x009A, 0x021B, 0x0835, 0x0323, 0x0871, 0x0993, 0x0226, + 0x0302, 0x0922, 0x0119, 0x0222, 0x021D, 0x0B07, 0x08C9, 0x037E, + 0x08BD, 0x0042, 0x00D1, 0x0B33, 0x01C1, 0x0B9A, 0x0282, 0x088A, + 0x0182, 0x083D, 0x004D, 0x010A, 0x0A1E, 0x0019, 0x00B2, 0x0999, + 0x00A5, 0x0095, 0x0817, 0x0022, 0x031A, 0x0902, 0x00A3, 0x01BF, + 0x029F, 0x0816, 0x03B2, 0x0015, 0x0391, 0x0BBE, 0x01FE, 0x1129, + 0x002E, 0x01DF, 0x0301, 0x0033, 0x0B6E, 0x00E1, 0x0297, 0x00B1, + 0x009F, 0x0B16, 0x000A, 0x001A, 0x0052, 0x080B, 0x030B, 0x029D, + 0x0BAE, 0x01FD, 0x020E, 0x00A2, 0x0A3F, 0x0192, 0x0ABE, 0x020D, + 0x008F, 0x028B, 0x0083, 0x0025, 0x09EE, 0x01EF, 0x0029, 0x0291, + 0x0B4F, 0x0396, 0x0287, 0x008E, 0x0092, 0x0B4E, 0x017E, 0x001E, + 0x009E, 0x0103, 0x080F, 0x000E, 0x0113, 0x0203, 0x01CF, 0x0183, + 0x01CE, 0x001F, 0x0112, 0x01DE, 0x038E, 0x0832, 0x033E, 0x0212, + 0x029B, 0x0023, 0x016F, 0x0201, 0x09AF, 0x0202, 0x0281, 0x035E, + 0x034D, 0x037D, 0x03AD, 0x0013, 0x0093, 0x015F, 0x0211, 0x033F, + 0x036D, 0x039F, 0x03BD, 0x017F, 0x032E, 0x032F, 0x035D, 0x038F, + 0x039E +}; + +static const uint16_t percentile_arr_12x12_1[246] { + 0x0443, 0xFFCD, 0x2C62, 0x2E21, 0x3CF1, 0x34C2, 0x4CDD, 0x2452, + 0xD5DF, 0x1DD1, 0x0FAE, 0x64A3, 0x0C7D, 0x3433, 0x1CD2, 0x2DEF, + 0x0C3E, 0x1D71, 0xA472, 0x0D32, 0x54B3, 0x4D51, 0x445D, 0x0E31, + 0x1FDD, 0x0DFF, 0x0CAE, 0x45A2, 0x2FBE, 0xA4B9, 0x1C4E, 0x2C9F, + 0x160D, 0x0D42, 0x342E, 0x074F, 0x1414, 0x0F6E, 0x0CB2, 0x34B5, + 0x0DFE, 0x0D86, 0x1496, 0x1D22, 0x0691, 0x140B, 0x041F, 0x0C35, + 0x1D93, 0x1506, 0x1439, 0x0C9A, 0x0F01, 0x2442, 0x0C8F, 0x04D1, + 0x1486, 0x0C6D, 0x0513, 0x0C71, 0x0E82, 0x177D, 0x0E03, 0x07BD, + 0x0C2F, 0x0D83, 0x07AF, 0x0D61, 0x1407, 0x0DB1, 0x050A, 0x0C94, + 0x07AD, 0x0D8A, 0x0C04, 0x0416, 0x0C49, 0x0445, 0x15C1, 0x0C1A, + 0x0525, 0x0595, 0x0C8A, 0x075E, 0x0CBD, 0x0681, 0x0F4E, 0x075F, + 0x061D, 0x1541, 0x0CB1, 0x0F3F, 0x0406, 0x076D, 0x0DCF, 0x05EE, + 0x0D23, 0x0599, 0x0CCD, 0x0711, 0x0C23, 0x079F, 0x0D15, 0x0585, + 0x04A2, 0x042A, 0x0D31, 0x05BF, 0x0D92, 0x0C26, 0x043D, 0x0C93, + 0x0502, 0x0C15, 0x048B, 0x0D03, 
0x0613, 0x0516, 0x0495, 0x0C29, + 0x04A5, 0x040F, 0x0425, 0x0539, 0x0D19, 0x04E1, 0x05BE, 0x0422, + 0x0432, 0x0C0A, 0x0431, 0x041E, 0x0492, 0x04A9, 0x0582, 0x0529, + 0x0487, 0x0C4D, 0x0512, 0x049E, 0x0505, 0x0451, 0x0D7F, 0x0489, + 0x0602, 0x05DE, 0x0591, 0x0535, 0x074D, 0x055E, 0x04C1, 0x0612, + 0x05DD, 0x05FD, 0x0C61, 0x0521, 0x0484, 0x05CE, 0x0581, 0x0491, + 0x051A, 0x04A1, 0x048E, 0x040D, 0x0499, 0x071F, 0x072E, 0x075D, + 0x0441, 0x0589, 0x057E, 0x0CAD, 0x0501, 0x054F, 0x0692, 0x0511, + 0x049D, 0x0509, 0x056E, 0x040E, 0x0409, 0x0601, 0x048D, 0x0413, + 0x053E, 0x0419, 0x072D, 0x0408, 0x0485, 0x042D, 0x041D, 0x05A1, + 0x0781, 0x0402, 0x05ED, 0x0C82, 0x0403, 0x057D, 0x05CD, 0x0611, + 0x0488, 0x0411, 0x054E, 0x051F, 0x053F, 0x056F, 0x059F, 0x070F, + 0x071D, 0x073D, 0x073E, 0x077E, 0x078F, 0x0405, 0x079D, 0x079E, + 0x058E, 0x0412, 0x055D, 0x05AE, 0x041B, 0x0421, 0x0453, 0x0417, + 0x0483, 0x052E, 0x052F, 0x055F, 0x058F, 0x059E, 0x05AF, 0x05BD, + 0x060E, 0x0622, 0x0683, 0x068D, 0x0702, 0x070D, 0x070E, 0x071E, + 0x072F, 0x076F, 0x078D, 0x078E, 0x07BF, 0x07CE +}; + +static const packed_percentile_table block_pcd_12x12 { + 12, 12, + { 529, 246 }, + { 1435, 335 }, + { 0, 22 }, + { percentile_arr_12x12_0, percentile_arr_12x12_1 } +}; +#endif + +/** + * @brief Fetch the packed percentile table for the given 2D block size. + * + * @param xdim The block x size. + * @param ydim The block y size. + * + * @return The packed table. + */ +static const packed_percentile_table *get_packed_table( + int xdim, + int ydim +) { + int idx = (ydim << 8) | xdim; + switch (idx) + { +#if ASTCENC_BLOCK_MAX_TEXELS >= (4 * 4) + case 0x0404: return &block_pcd_4x4; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 4) + case 0x0405: return &block_pcd_5x4; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (5 * 5) + case 0x0505: return &block_pcd_5x5; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 5) + case 0x0506: return &block_pcd_6x5; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (6 * 6) + case 0x0606: return &block_pcd_6x6; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 5) + case 0x0508: return &block_pcd_8x5; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 6) + case 0x0608: return &block_pcd_8x6; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (8 * 8) + case 0x0808: return &block_pcd_8x8; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 5) + case 0x050A: return &block_pcd_10x5; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 6) + case 0x060A: return &block_pcd_10x6; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 8) + case 0x080A: return &block_pcd_10x8; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (10 * 10) + case 0x0A0A: return &block_pcd_10x10; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 10) + case 0x0A0C: return &block_pcd_12x10; +#endif +#if ASTCENC_BLOCK_MAX_TEXELS >= (12 * 12) + case 0x0C0C: return &block_pcd_12x12; +#endif + } + + // Should never hit this with a valid 2D block size + return nullptr; +} + +/* See header for documentation. 
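+ Each packed item is a uint16_t: the low 11 bits hold a block mode index and the top 5 bits hold a percentile delta. Deltas accumulate in table order and are divided by the per-table scale factor, so e.g. item 0x1234 selects block mode 0x234 and adds 2 to the running sum before scaling. The caller owns the returned buffer and must release it with delete[]. A minimal usage sketch (here mode_index stands for any block mode index below WEIGHTS_MAX_BLOCK_MODES): + + const float* table = get_2d_percentile_table(8, 8); + float percentile = table[mode_index]; + delete[] table;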
*/ +const float *get_2d_percentile_table( + unsigned int xdim, + unsigned int ydim +) { + float* unpacked_table = new float[WEIGHTS_MAX_BLOCK_MODES]; + const packed_percentile_table *apt = get_packed_table(xdim, ydim); + + // Set the default percentile + for (unsigned int i = 0; i < WEIGHTS_MAX_BLOCK_MODES; i++) + { + unpacked_table[i] = 1.0f; + } + + // Populate the unpacked percentile values + for (int i = 0; i < 2; i++) + { + unsigned int itemcount = apt->item_count[i]; + unsigned int difscale = apt->difscales[i]; + unsigned int accum = apt->initial_percs[i]; + const uint16_t *item_ptr = apt->items[i]; + + for (unsigned int j = 0; j < itemcount; j++) + { + uint16_t item = item_ptr[j]; + unsigned int idx = item & 0x7FF; + unsigned int weight = (item >> 11) & 0x1F; + accum += weight; + unpacked_table[idx] = static_cast<float>(accum) / static_cast<float>(difscale); + } + } + + return unpacked_table; +} +#endif + +/* See header for documentation. */ +bool is_legal_2d_block_size( + unsigned int xdim, + unsigned int ydim +) { + unsigned int idx = (xdim << 8) | ydim; + switch (idx) + { + case 0x0404: + case 0x0504: + case 0x0505: + case 0x0605: + case 0x0606: + case 0x0805: + case 0x0806: + case 0x0808: + case 0x0A05: + case 0x0A06: + case 0x0A08: + case 0x0A0A: + case 0x0C0A: + case 0x0C0C: + return true; + } + + return false; +} + +/* See header for documentation. */ +bool is_legal_3d_block_size( + unsigned int xdim, + unsigned int ydim, + unsigned int zdim +) { + unsigned int idx = (xdim << 16) | (ydim << 8) | zdim; + switch (idx) + { + case 0x030303: + case 0x040303: + case 0x040403: + case 0x040404: + case 0x050404: + case 0x050504: + case 0x050505: + case 0x060505: + case 0x060605: + case 0x060606: + return true; + } + + return false; +} diff --git a/thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp b/thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp new file mode 100644 index 0000000000..f25140d4c7 --- /dev/null +++ b/thirdparty/astcenc/astcenc_pick_best_endpoint_format.cpp @@ -0,0 +1,1350 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/** + * @brief Functions for finding best endpoint format. + * + * We assume there are two independent sources of error in any given partition: + * + * - Encoding choice errors + * - Quantization errors + * + * Encoding choice errors are caused by encoder decisions. For example: + * + * - Using luminance instead of separate RGB components. + * - Using a constant 1.0 alpha instead of storing an alpha component. + * - Using RGB+scale instead of storing two full RGB endpoints. + * + * Quantization errors occur due to the limited precision we use for storage. 
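+ * As an illustration, quantizing a 16-bit endpoint component to 32 levels leaves steps of roughly 2000 codes, and the expected squared rounding error grows with the square of that step size.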
These errors generally + * scale with quantization level, but are not actually independent of color encoding. In particular: + * + * - If we can use offset encoding then quantization error is halved. + * - If we can use blue-contraction then quantization error for RG is halved. + * - If we use HDR endpoints the quantization error is higher. + * + * Apart from these effects, we assume the error is proportional to the quantization step size. + */ + + +#include "astcenc_internal.h" +#include "astcenc_vecmathlib.h" + +#include <assert.h> + +/** + * @brief Compute the errors of the endpoint line options for one partition. + * + * Uncorrelated data assumes storing completely independent RGBA channels for each endpoint. Same + * chroma data assumes storing RGBA endpoints which pass through the origin (LDR only). RGBL data + * assumes storing RGB + lumashift (HDR only). Luminance error assumes storing RGB channels as a + * single value. + * + * @param pi The partition info data. + * @param partition_index The partition index to compute the error for. + * @param blk The image block. + * @param uncor_pline The endpoint line assuming uncorrelated endpoints. + * @param[out] uncor_err The computed error for the uncorrelated endpoint line. + * @param samec_pline The endpoint line assuming the same chroma for both endpoints. + * @param[out] samec_err The computed error for the same chroma endpoint line. + * @param rgbl_pline The endpoint line assuming RGB + lumashift data. + * @param[out] rgbl_err The computed error for the RGB + lumashift endpoint line. + * @param l_pline The endpoint line assuming luminance data. + * @param[out] l_err The computed error for the luminance endpoint line. + * @param[out] a_drop_err The computed error for dropping the alpha component.
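+ * + * Each output error is accumulated as the channel-weighted squared distance between every texel in the partition and its projection onto the corresponding line (param = dot3(texel, bs); residual = amod + param * bs - texel).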
+ */ +static void compute_error_squared_rgb_single_partition( + const partition_info& pi, + int partition_index, + const image_block& blk, + const processed_line3& uncor_pline, + float& uncor_err, + const processed_line3& samec_pline, + float& samec_err, + const processed_line3& rgbl_pline, + float& rgbl_err, + const processed_line3& l_pline, + float& l_err, + float& a_drop_err +) { + vfloat4 ews = blk.channel_weight; + + unsigned int texel_count = pi.partition_texel_count[partition_index]; + const uint8_t* texel_indexes = pi.texels_of_partition[partition_index]; + promise(texel_count > 0); + + vfloatacc a_drop_errv = vfloatacc::zero(); + vfloat default_a(blk.get_default_alpha()); + + vfloatacc uncor_errv = vfloatacc::zero(); + vfloat uncor_bs0(uncor_pline.bs.lane<0>()); + vfloat uncor_bs1(uncor_pline.bs.lane<1>()); + vfloat uncor_bs2(uncor_pline.bs.lane<2>()); + + vfloat uncor_amod0(uncor_pline.amod.lane<0>()); + vfloat uncor_amod1(uncor_pline.amod.lane<1>()); + vfloat uncor_amod2(uncor_pline.amod.lane<2>()); + + vfloatacc samec_errv = vfloatacc::zero(); + vfloat samec_bs0(samec_pline.bs.lane<0>()); + vfloat samec_bs1(samec_pline.bs.lane<1>()); + vfloat samec_bs2(samec_pline.bs.lane<2>()); + + vfloatacc rgbl_errv = vfloatacc::zero(); + vfloat rgbl_bs0(rgbl_pline.bs.lane<0>()); + vfloat rgbl_bs1(rgbl_pline.bs.lane<1>()); + vfloat rgbl_bs2(rgbl_pline.bs.lane<2>()); + + vfloat rgbl_amod0(rgbl_pline.amod.lane<0>()); + vfloat rgbl_amod1(rgbl_pline.amod.lane<1>()); + vfloat rgbl_amod2(rgbl_pline.amod.lane<2>()); + + vfloatacc l_errv = vfloatacc::zero(); + vfloat l_bs0(l_pline.bs.lane<0>()); + vfloat l_bs1(l_pline.bs.lane<1>()); + vfloat l_bs2(l_pline.bs.lane<2>()); + + vint lane_ids = vint::lane_id(); + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint tix(texel_indexes + i); + + vmask mask = lane_ids < vint(texel_count); + lane_ids += vint(ASTCENC_SIMD_WIDTH); + + // Compute the error that arises from just ditching alpha + vfloat data_a = gatherf(blk.data_a, tix); + vfloat alpha_diff = data_a - default_a; + alpha_diff = alpha_diff * alpha_diff; + + haccumulate(a_drop_errv, alpha_diff, mask); + + vfloat data_r = gatherf(blk.data_r, tix); + vfloat data_g = gatherf(blk.data_g, tix); + vfloat data_b = gatherf(blk.data_b, tix); + + // Compute uncorrelated error + vfloat param = data_r * uncor_bs0 + + data_g * uncor_bs1 + + data_b * uncor_bs2; + + vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r; + vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g; + vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b; + + vfloat error = dist0 * dist0 * ews.lane<0>() + + dist1 * dist1 * ews.lane<1>() + + dist2 * dist2 * ews.lane<2>(); + + haccumulate(uncor_errv, error, mask); + + // Compute same chroma error - no "amod", it's always zero + param = data_r * samec_bs0 + + data_g * samec_bs1 + + data_b * samec_bs2; + + dist0 = (param * samec_bs0) - data_r; + dist1 = (param * samec_bs1) - data_g; + dist2 = (param * samec_bs2) - data_b; + + error = dist0 * dist0 * ews.lane<0>() + + dist1 * dist1 * ews.lane<1>() + + dist2 * dist2 * ews.lane<2>(); + + haccumulate(samec_errv, error, mask); + + // Compute rgbl error + param = data_r * rgbl_bs0 + + data_g * rgbl_bs1 + + data_b * rgbl_bs2; + + dist0 = (rgbl_amod0 + param * rgbl_bs0) - data_r; + dist1 = (rgbl_amod1 + param * rgbl_bs1) - data_g; + dist2 = (rgbl_amod2 + param * rgbl_bs2) - data_b; + + error = dist0 * dist0 * ews.lane<0>() + + dist1 * dist1 * ews.lane<1>() + + dist2 * dist2 * ews.lane<2>(); + + 
haccumulate(rgbl_errv, error, mask); + + // Compute luma error - no "amod", it's always zero + param = data_r * l_bs0 + + data_g * l_bs1 + + data_b * l_bs2; + + dist0 = (param * l_bs0) - data_r; + dist1 = (param * l_bs1) - data_g; + dist2 = (param * l_bs2) - data_b; + + error = dist0 * dist0 * ews.lane<0>() + + dist1 * dist1 * ews.lane<1>() + + dist2 * dist2 * ews.lane<2>(); + + haccumulate(l_errv, error, mask); + } + + a_drop_err = hadd_s(a_drop_errv) * ews.lane<3>(); + uncor_err = hadd_s(uncor_errv); + samec_err = hadd_s(samec_errv); + rgbl_err = hadd_s(rgbl_errv); + l_err = hadd_s(l_errv); +} + +/** + * @brief For a given set of input colors and partitioning determine endpoint encode errors. + * + * This function determines the color error that results from RGB-scale encoding (LDR only), + * RGB-lumashift encoding (HDR only), luminance-encoding, and alpha drop. Also determines whether + * the endpoints are eligible for offset encoding or blue-contraction. + * + * @param blk The image block. + * @param pi The partition info data. + * @param ep The idealized endpoints. + * @param[out] eci The resulting encoding choice error metrics. + */ +static void compute_encoding_choice_errors( + const image_block& blk, + const partition_info& pi, + const endpoints& ep, + encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]) +{ + int partition_count = pi.partition_count; + promise(partition_count > 0); + + partition_metrics pms[BLOCK_MAX_PARTITIONS]; + + compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms); + + for (int i = 0; i < partition_count; i++) + { + partition_metrics& pm = pms[i]; + + line3 uncor_rgb_lines; + line3 samec_rgb_lines; // for LDR-RGB-scale + line3 rgb_luma_lines; // for HDR-RGB-scale + + processed_line3 uncor_rgb_plines; + processed_line3 samec_rgb_plines; + processed_line3 rgb_luma_plines; + processed_line3 luminance_plines; + + float uncorr_rgb_error; + float samechroma_rgb_error; + float rgb_luma_error; + float luminance_rgb_error; + float alpha_drop_error; + + uncor_rgb_lines.a = pm.avg; + uncor_rgb_lines.b = normalize_safe(pm.dir, unit3()); + + samec_rgb_lines.a = vfloat4::zero(); + samec_rgb_lines.b = normalize_safe(pm.avg, unit3()); + + rgb_luma_lines.a = pm.avg; + rgb_luma_lines.b = unit3(); + + uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b); + uncor_rgb_plines.bs = uncor_rgb_lines.b; + + // Same chroma always goes through zero, so this is simpler than the others + samec_rgb_plines.amod = vfloat4::zero(); + samec_rgb_plines.bs = samec_rgb_lines.b; + + rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b); + rgb_luma_plines.bs = rgb_luma_lines.b; + + // Luminance always goes through zero, so this is simpler than the others + luminance_plines.amod = vfloat4::zero(); + luminance_plines.bs = unit3(); + + compute_error_squared_rgb_single_partition( + pi, i, blk, + uncor_rgb_plines, uncorr_rgb_error, + samec_rgb_plines, samechroma_rgb_error, + rgb_luma_plines, rgb_luma_error, + luminance_plines, luminance_rgb_error, + alpha_drop_error); + + // Determine if we can offset encode RGB lanes + vfloat4 endpt0 = ep.endpt0[i]; + vfloat4 endpt1 = ep.endpt1[i]; + vfloat4 endpt_diff = abs(endpt1 - endpt0); + vmask4 endpt_can_offset = endpt_diff < vfloat4(0.12f * 65535.0f); + bool can_offset_encode = (mask(endpt_can_offset) & 0x7) == 0x7; + + // Store out the settings + eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical + eci[i].rgb_luma_error = (rgb_luma_error 
- uncorr_rgb_error) * 1.5f; // wild guess + eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical + eci[i].alpha_drop_error = alpha_drop_error * 3.0f; + eci[i].can_offset_encode = can_offset_encode; + eci[i].can_blue_contract = !blk.is_luminance(); + } +} + +/** + * @brief For a given partition compute the error for every endpoint integer count and quant level. + * + * @param encode_hdr_rgb @c true if using HDR for RGB, @c false for LDR. + * @param encode_hdr_alpha @c true if using HDR for alpha, @c false for LDR. + * @param partition_index The partition index. + * @param pi The partition info. + * @param eci The encoding choice error metrics. + * @param ep The idealized endpoints. + * @param error_weight The per-channel error weight to apply. + * @param[out] best_error The best error for each integer count and quant level. + * @param[out] format_of_choice The preferred endpoint format for each integer count and quant level. + */ +static void compute_color_error_for_every_integer_count_and_quant_level( + bool encode_hdr_rgb, + bool encode_hdr_alpha, + int partition_index, + const partition_info& pi, + const encoding_choice_errors& eci, + const endpoints& ep, + vfloat4 error_weight, + float best_error[21][4], + uint8_t format_of_choice[21][4] +) { + int partition_size = pi.partition_texel_count[partition_index]; + + static const float baseline_quant_error[21 - QUANT_6] { + (65536.0f * 65536.0f / 18.0f) / (5 * 5), + (65536.0f * 65536.0f / 18.0f) / (7 * 7), + (65536.0f * 65536.0f / 18.0f) / (9 * 9), + (65536.0f * 65536.0f / 18.0f) / (11 * 11), + (65536.0f * 65536.0f / 18.0f) / (15 * 15), + (65536.0f * 65536.0f / 18.0f) / (19 * 19), + (65536.0f * 65536.0f / 18.0f) / (23 * 23), + (65536.0f * 65536.0f / 18.0f) / (31 * 31), + (65536.0f * 65536.0f / 18.0f) / (39 * 39), + (65536.0f * 65536.0f / 18.0f) / (47 * 47), + (65536.0f * 65536.0f / 18.0f) / (63 * 63), + (65536.0f * 65536.0f / 18.0f) / (79 * 79), + (65536.0f * 65536.0f / 18.0f) / (95 * 95), + (65536.0f * 65536.0f / 18.0f) / (127 * 127), + (65536.0f * 65536.0f / 18.0f) / (159 * 159), + (65536.0f * 65536.0f / 18.0f) / (191 * 191), + (65536.0f * 65536.0f / 18.0f) / (255 * 255) + }; + + vfloat4 ep0 = ep.endpt0[partition_index]; + vfloat4 ep1 = ep.endpt1[partition_index]; + + float ep1_min = hmin_rgb_s(ep1); + ep1_min = astc::max(ep1_min, 0.0f); + + float error_weight_rgbsum = hadd_rgb_s(error_weight); + + float range_upper_limit_rgb = encode_hdr_rgb ? 61440.0f : 65535.0f; + float range_upper_limit_alpha = encode_hdr_alpha ? 
61440.0f : 65535.0f; + + // It is possible to get endpoint colors significantly outside [0,upper-limit] even if the + // input data are safely contained in [0,upper-limit]; we need to add an error term for this + vfloat4 offset(range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_alpha); + vfloat4 ep0_range_error_high = max(ep0 - offset, 0.0f); + vfloat4 ep1_range_error_high = max(ep1 - offset, 0.0f); + + vfloat4 ep0_range_error_low = min(ep0, 0.0f); + vfloat4 ep1_range_error_low = min(ep1, 0.0f); + + vfloat4 sum_range_error = + (ep0_range_error_low * ep0_range_error_low) + + (ep1_range_error_low * ep1_range_error_low) + + (ep0_range_error_high * ep0_range_error_high) + + (ep1_range_error_high * ep1_range_error_high); + + float rgb_range_error = dot3_s(sum_range_error, error_weight) + * 0.5f * static_cast<float>(partition_size); + float alpha_range_error = sum_range_error.lane<3>() * error_weight.lane<3>() + * 0.5f * static_cast<float>(partition_size); + + if (encode_hdr_rgb) + { + + // Collect some statistics + float af, cf; + if (ep1.lane<0>() > ep1.lane<1>() && ep1.lane<0>() > ep1.lane<2>()) + { + af = ep1.lane<0>(); + cf = ep1.lane<0>() - ep0.lane<0>(); + } + else if (ep1.lane<1>() > ep1.lane<2>()) + { + af = ep1.lane<1>(); + cf = ep1.lane<1>() - ep0.lane<1>(); + } + else + { + af = ep1.lane<2>(); + cf = ep1.lane<2>() - ep0.lane<2>(); + } + + // Estimate of color-component spread in high endpoint color + float bf = af - ep1_min; + vfloat4 prd = (ep1 - vfloat4(cf)).swz<0, 1, 2>(); + vfloat4 pdif = prd - ep0.swz<0, 1, 2>(); + // Estimate of color-component spread in low endpoint color + float df = hmax_s(abs(pdif)); + + int b = static_cast<int>(bf); + int c = static_cast<int>(cf); + int d = static_cast<int>(df); + + // Determine which one of the 6 submodes is likely to be used in case of an RGBO-mode + int rgbo_mode = 5; // 7 bits per component + // mode 4: 8 7 6 + if (b < 32768 && c < 16384) + { + rgbo_mode = 4; + } + + // mode 3: 9 6 7 + if (b < 8192 && c < 16384) + { + rgbo_mode = 3; + } + + // mode 2: 10 5 8 + if (b < 2048 && c < 16384) + { + rgbo_mode = 2; + } + + // mode 1: 11 6 5 + if (b < 2048 && c < 1024) + { + rgbo_mode = 1; + } + + // mode 0: 11 5 7 + if (b < 1024 && c < 4096) + { + rgbo_mode = 0; + } + + // Determine which one of the 9 submodes is likely to be used in case of an RGB-mode. + int rgb_mode = 8; // 8 bits per component, except 7 bits for blue + + // mode 0: 9 7 6 7 + if (b < 16384 && c < 8192 && d < 8192) + { + rgb_mode = 0; + } + + // mode 1: 9 8 6 6 + if (b < 32768 && c < 8192 && d < 4096) + { + rgb_mode = 1; + } + + // mode 2: 10 6 7 7 + if (b < 4096 && c < 8192 && d < 4096) + { + rgb_mode = 2; + } + + // mode 3: 10 7 7 6 + if (b < 8192 && c < 8192 && d < 2048) + { + rgb_mode = 3; + } + + // mode 4: 11 8 6 5 + if (b < 8192 && c < 2048 && d < 512) + { + rgb_mode = 4; + } + + // mode 5: 11 6 8 6 + if (b < 2048 && c < 8192 && d < 1024) + { + rgb_mode = 5; + } + + // mode 6: 12 7 7 5 + if (b < 2048 && c < 2048 && d < 256) + { + rgb_mode = 6; + } + + // mode 7: 12 6 7 6 + if (b < 1024 && c < 2048 && d < 512) + { + rgb_mode = 7; + } + + static const float rgbo_error_scales[6] { 4.0f, 4.0f, 16.0f, 64.0f, 256.0f, 1024.0f }; + static const float rgb_error_scales[9] { 64.0f, 64.0f, 16.0f, 16.0f, 4.0f, 4.0f, 1.0f, 1.0f, 384.0f }; + + float mode7mult = rgbo_error_scales[rgbo_mode] * 0.0015f; // Empirically determined .... + float mode11mult = rgb_error_scales[rgb_mode] * 0.010f; // Empirically determined .... 
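+ + // In both tables a tighter endpoint spread (smaller b, c, and d) selects a submode with more + // bits of precision for the base color, so the quantization error estimate below is scaled + // down accordingly.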
+ + + float lum_high = hadd_rgb_s(ep1) * (1.0f / 3.0f); + float lum_low = hadd_rgb_s(ep0) * (1.0f / 3.0f); + float lumdif = lum_high - lum_low; + float mode23mult = lumdif < 960 ? 4.0f : lumdif < 3968 ? 16.0f : 128.0f; + + mode23mult *= 0.0005f; // Empirically determined .... + + // Pick among the available HDR endpoint modes + for (int i = QUANT_2; i < QUANT_16; i++) + { + best_error[i][3] = ERROR_CALC_DEFAULT; + best_error[i][2] = ERROR_CALC_DEFAULT; + best_error[i][1] = ERROR_CALC_DEFAULT; + best_error[i][0] = ERROR_CALC_DEFAULT; + + format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA); + format_of_choice[i][2] = FMT_HDR_RGB; + format_of_choice[i][1] = FMT_HDR_RGB_SCALE; + format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE; + } + + for (int i = QUANT_16; i <= QUANT_256; i++) + { + // The base_quant_error should depend on the scale-factor that would be used during + // actual encode of the color value + + float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size); + float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f; + float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f; + float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error; + + // For 8 integers, we have two encodings: one with HDR A and another one with LDR A + + float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error; + best_error[i][3] = full_hdr_rgba_error; + format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA); + + // For 6 integers, we have one HDR-RGB encoding + float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error; + best_error[i][2] = full_hdr_rgb_error; + format_of_choice[i][2] = FMT_HDR_RGB; + + // For 4 integers, we have one HDR-RGB-Scale encoding + float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci.alpha_drop_error + eci.rgb_luma_error; + + best_error[i][1] = hdr_rgb_scale_error; + format_of_choice[i][1] = FMT_HDR_RGB_SCALE; + + // For 2 integers, we assume luminance-with-large-range + float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci.alpha_drop_error + eci.luminance_error; + best_error[i][0] = hdr_luminance_error; + format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE; + } + } + else + { + for (int i = QUANT_2; i < QUANT_6; i++) + { + best_error[i][3] = ERROR_CALC_DEFAULT; + best_error[i][2] = ERROR_CALC_DEFAULT; + best_error[i][1] = ERROR_CALC_DEFAULT; + best_error[i][0] = ERROR_CALC_DEFAULT; + + format_of_choice[i][3] = FMT_RGBA; + format_of_choice[i][2] = FMT_RGB; + format_of_choice[i][1] = FMT_RGB_SCALE; + format_of_choice[i][0] = FMT_LUMINANCE; + } + + float base_quant_error_rgb = error_weight_rgbsum * static_cast<float>(partition_size); + float base_quant_error_a = error_weight.lane<3>() * static_cast<float>(partition_size); + float base_quant_error_rgba = base_quant_error_rgb + base_quant_error_a; + + float error_scale_bc_rgba = eci.can_blue_contract ? 0.625f : 1.0f; + float error_scale_oe_rgba = eci.can_offset_encode ? 0.5f : 1.0f; + + float error_scale_bc_rgb = eci.can_blue_contract ? 0.5f : 1.0f; + float error_scale_oe_rgb = eci.can_offset_encode ? 
0.25f : 1.0f; + + // Pick among the available LDR endpoint modes + for (int i = QUANT_6; i <= QUANT_256; i++) + { + // Offset encoding not possible at higher quant levels + if (i >= QUANT_192) + { + error_scale_oe_rgba = 1.0f; + error_scale_oe_rgb = 1.0f; + } + + float base_quant_error = baseline_quant_error[i - QUANT_6]; + float quant_error_rgb = base_quant_error_rgb * base_quant_error; + float quant_error_rgba = base_quant_error_rgba * base_quant_error; + + // 8 integers can encode as RGBA+RGBA + float full_ldr_rgba_error = quant_error_rgba + * error_scale_bc_rgba + * error_scale_oe_rgba + + rgb_range_error + + alpha_range_error; + + best_error[i][3] = full_ldr_rgba_error; + format_of_choice[i][3] = FMT_RGBA; + + // 6 integers can encode as RGB+RGB or RGBS+AA + float full_ldr_rgb_error = quant_error_rgb + * error_scale_bc_rgb + * error_scale_oe_rgb + + rgb_range_error + + eci.alpha_drop_error; + + float rgbs_alpha_error = quant_error_rgba + + eci.rgb_scale_error + + rgb_range_error + + alpha_range_error; + + if (rgbs_alpha_error < full_ldr_rgb_error) + { + best_error[i][2] = rgbs_alpha_error; + format_of_choice[i][2] = FMT_RGB_SCALE_ALPHA; + } + else + { + best_error[i][2] = full_ldr_rgb_error; + format_of_choice[i][2] = FMT_RGB; + } + + // 4 integers can encode as RGBS or LA+LA + float ldr_rgbs_error = quant_error_rgb + + rgb_range_error + + eci.alpha_drop_error + + eci.rgb_scale_error; + + float lum_alpha_error = quant_error_rgba + + rgb_range_error + + alpha_range_error + + eci.luminance_error; + + if (ldr_rgbs_error < lum_alpha_error) + { + best_error[i][1] = ldr_rgbs_error; + format_of_choice[i][1] = FMT_RGB_SCALE; + } + else + { + best_error[i][1] = lum_alpha_error; + format_of_choice[i][1] = FMT_LUMINANCE_ALPHA; + } + + // 2 integers can encode as L+L + float luminance_error = quant_error_rgb + + rgb_range_error + + eci.alpha_drop_error + + eci.luminance_error; + + best_error[i][0] = luminance_error; + format_of_choice[i][0] = FMT_LUMINANCE; + } + } +} + +/** + * @brief For one partition compute the best format and quantization for a given bit count. + * + * @param best_combined_error The best error for each quant level and integer count. + * @param best_combined_format The best format for each quant level and integer count. + * @param bits_available The number of bits available for encoding. + * @param[out] best_quant_level The output best color quant level. + * @param[out] best_format The output best color format. + * + * @return The output error for the best pairing. + */ +static float one_partition_find_best_combination_for_bitcount( + const float best_combined_error[21][4], + const uint8_t best_combined_format[21][4], + int bits_available, + uint8_t& best_quant_level, + uint8_t& best_format +) { + int best_integer_count = 0; + float best_integer_count_error = ERROR_CALC_DEFAULT; + + for (int integer_count = 1; integer_count <= 4; integer_count++) + { + // Compute the quantization level for a given number of integers and a given number of bits + int quant_level = quant_mode_table[integer_count][bits_available]; + + // Don't have enough bits to represent a given endpoint format at all! 
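+ // (quant_mode_table[integer_count][bits_available] yields the finest quant level at which that many endpoint integers fit in the bit budget; levels below QUANT_6 are treated as unencodable here, so the count is skipped.)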
+ if (quant_level < QUANT_6) + { + continue; + } + + float integer_count_error = best_combined_error[quant_level][integer_count - 1]; + if (integer_count_error < best_integer_count_error) + { + best_integer_count_error = integer_count_error; + best_integer_count = integer_count - 1; + } + } + + int ql = quant_mode_table[best_integer_count + 1][bits_available]; + + best_quant_level = static_cast<uint8_t>(ql); + best_format = FMT_LUMINANCE; + + if (ql >= QUANT_6) + { + best_format = best_combined_format[ql][best_integer_count]; + } + + return best_integer_count_error; +} + +/** + * @brief For 2 partitions compute the best format combinations for every pair of quant mode and integer count. + * + * @param best_error The best error for a single endpoint quant level and integer count. + * @param best_format The best format for a single endpoint quant level and integer count. + * @param[out] best_combined_error The best combined error pairings for the 2 partitions. + * @param[out] best_combined_format The best combined format pairings for the 2 partitions. + */ +static void two_partitions_find_best_combination_for_every_quantization_and_integer_count( + const float best_error[2][21][4], // indexed by (partition, quant-level, integer-pair-count-minus-1) + const uint8_t best_format[2][21][4], + float best_combined_error[21][7], // indexed by (quant-level, integer-pair-count-minus-2) + uint8_t best_combined_format[21][7][2] +) { + for (int i = QUANT_2; i <= QUANT_256; i++) + { + for (int j = 0; j < 7; j++) + { + best_combined_error[i][j] = ERROR_CALC_DEFAULT; + } + } + + for (int quant = QUANT_6; quant <= QUANT_256; quant++) + { + for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair + { + for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair + { + int low2 = astc::min(i, j); + int high2 = astc::max(i, j); + if ((high2 - low2) > 1) + { + continue; + } + + int intcnt = i + j; + float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j], 1e10f); + if (errorterm <= best_combined_error[quant][intcnt]) + { + best_combined_error[quant][intcnt] = errorterm; + best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; + best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; + } + } + } + } +} + +/** + * @brief For 2 partitions compute the best format and quantization for a given bit count. + * + * @param best_combined_error The best error for each quant level and integer count. + * @param best_combined_format The best format for each quant level and integer count. + * @param bits_available The number of bits available for encoding. + * @param[out] best_quant_level The output best color quant level. + * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available. + * @param[out] best_formats The output best color formats. + * + * @return The output error for the best pairing. 
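+ * + * Note: if no integer count fits at QUANT_6 or above, the returned error stays at ERROR_CALC_DEFAULT and the output formats fall back to FMT_LUMINANCE placeholders.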
+ */ +static float two_partitions_find_best_combination_for_bitcount( + float best_combined_error[21][7], + uint8_t best_combined_format[21][7][2], + int bits_available, + uint8_t& best_quant_level, + uint8_t& best_quant_level_mod, + uint8_t* best_formats +) { + int best_integer_count = 0; + float best_integer_count_error = ERROR_CALC_DEFAULT; + + for (int integer_count = 2; integer_count <= 8; integer_count++) + { + // Compute the quantization level for a given number of integers and a given number of bits + int quant_level = quant_mode_table[integer_count][bits_available]; + + // Don't have enough bits to represent a given endpoint format at all! + if (quant_level < QUANT_6) + { + break; + } + + float integer_count_error = best_combined_error[quant_level][integer_count - 2]; + if (integer_count_error < best_integer_count_error) + { + best_integer_count_error = integer_count_error; + best_integer_count = integer_count; + } + } + + int ql = quant_mode_table[best_integer_count][bits_available]; + int ql_mod = quant_mode_table[best_integer_count][bits_available + 2]; + + best_quant_level = static_cast<uint8_t>(ql); + best_quant_level_mod = static_cast<uint8_t>(ql_mod); + + if (ql >= QUANT_6) + { + for (int i = 0; i < 2; i++) + { + best_formats[i] = best_combined_format[ql][best_integer_count - 2][i]; + } + } + else + { + for (int i = 0; i < 2; i++) + { + best_formats[i] = FMT_LUMINANCE; + } + } + + return best_integer_count_error; +} + +/** + * @brief For 3 partitions compute the best format combinations for every pair of quant mode and integer count. + * + * @param best_error The best error for a single endpoint quant level and integer count. + * @param best_format The best format for a single endpoint quant level and integer count. + * @param[out] best_combined_error The best combined error pairings for the 3 partitions. + * @param[out] best_combined_format The best combined format pairings for the 3 partitions. + */ +static void three_partitions_find_best_combination_for_every_quantization_and_integer_count( + const float best_error[3][21][4], // indexed by (partition, quant-level, integer-count) + const uint8_t best_format[3][21][4], + float best_combined_error[21][10], + uint8_t best_combined_format[21][10][3] +) { + for (int i = QUANT_2; i <= QUANT_256; i++) + { + for (int j = 0; j < 10; j++) + { + best_combined_error[i][j] = ERROR_CALC_DEFAULT; + } + } + + for (int quant = QUANT_6; quant <= QUANT_256; quant++) + { + for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair + { + for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair + { + int low2 = astc::min(i, j); + int high2 = astc::max(i, j); + if ((high2 - low2) > 1) + { + continue; + } + + for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair + { + int low3 = astc::min(k, low2); + int high3 = astc::max(k, high2); + if ((high3 - low3) > 1) + { + continue; + } + + int intcnt = i + j + k; + float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k], 1e10f); + if (errorterm <= best_combined_error[quant][intcnt]) + { + best_combined_error[quant][intcnt] = errorterm; + best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; + best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; + best_combined_format[quant][intcnt][2] = best_format[2][quant][k]; + } + } + } + } + } +} + +/** + * @brief For 3 partitions compute the best format and quantization for a given bit count. 
+ * + * @param best_combined_error The best error for each quant level and integer count. + * @param best_combined_format The best format for each quant level and integer count. + * @param bits_available The number of bits available for encoding. + * @param[out] best_quant_level The output best color quant level. + * @param[out] best_quant_level_mod The output best color quant level assuming five more bits are available. + * @param[out] best_formats The output best color formats. + * + * @return The output error for the best pairing. + */ +static float three_partitions_find_best_combination_for_bitcount( + const float best_combined_error[21][10], + const uint8_t best_combined_format[21][10][3], + int bits_available, + uint8_t& best_quant_level, + uint8_t& best_quant_level_mod, + uint8_t* best_formats +) { + int best_integer_count = 0; + float best_integer_count_error = ERROR_CALC_DEFAULT; + + for (int integer_count = 3; integer_count <= 9; integer_count++) + { + // Compute the quantization level for a given number of integers and a given number of bits + int quant_level = quant_mode_table[integer_count][bits_available]; + + // Don't have enough bits to represent a given endpoint format at all! + if (quant_level < QUANT_6) + { + break; + } + + float integer_count_error = best_combined_error[quant_level][integer_count - 3]; + if (integer_count_error < best_integer_count_error) + { + best_integer_count_error = integer_count_error; + best_integer_count = integer_count; + } + } + + int ql = quant_mode_table[best_integer_count][bits_available]; + int ql_mod = quant_mode_table[best_integer_count][bits_available + 5]; + + best_quant_level = static_cast<uint8_t>(ql); + best_quant_level_mod = static_cast<uint8_t>(ql_mod); + + if (ql >= QUANT_6) + { + for (int i = 0; i < 3; i++) + { + best_formats[i] = best_combined_format[ql][best_integer_count - 3][i]; + } + } + else + { + for (int i = 0; i < 3; i++) + { + best_formats[i] = FMT_LUMINANCE; + } + } + + return best_integer_count_error; +} + +/** + * @brief For 4 partitions compute the best format combinations for every pair of quant mode and integer count. + * + * @param best_error The best error for a single endpoint quant level and integer count. + * @param best_format The best format for a single endpoint quant level and integer count. + * @param[out] best_combined_error The best combined error pairings for the 4 partitions. + * @param[out] best_combined_format The best combined format pairings for the 4 partitions.
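+ * + * Note: pairings where the per-partition integer counts differ by more than one are skipped (the low/high checks below), reflecting the ASTC constraint that the endpoint formats within one block may span at most two consecutive format classes.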
+ */ +static void four_partitions_find_best_combination_for_every_quantization_and_integer_count( + const float best_error[4][21][4], // indexed by (partition, quant-level, integer-count) + const uint8_t best_format[4][21][4], + float best_combined_error[21][13], + uint8_t best_combined_format[21][13][4] +) { + for (int i = QUANT_2; i <= QUANT_256; i++) + { + for (int j = 0; j < 13; j++) + { + best_combined_error[i][j] = ERROR_CALC_DEFAULT; + } + } + + for (int quant = QUANT_6; quant <= QUANT_256; quant++) + { + for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair + { + for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair + { + int low2 = astc::min(i, j); + int high2 = astc::max(i, j); + if ((high2 - low2) > 1) + { + continue; + } + + for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair + { + int low3 = astc::min(k, low2); + int high3 = astc::max(k, high2); + if ((high3 - low3) > 1) + { + continue; + } + + for (int l = 0; l < 4; l++) // integer-count for fourth endpoint-pair + { + int low4 = astc::min(l, low3); + int high4 = astc::max(l, high3); + if ((high4 - low4) > 1) + { + continue; + } + + int intcnt = i + j + k + l; + float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k] + best_error[3][quant][l], 1e10f); + if (errorterm <= best_combined_error[quant][intcnt]) + { + best_combined_error[quant][intcnt] = errorterm; + best_combined_format[quant][intcnt][0] = best_format[0][quant][i]; + best_combined_format[quant][intcnt][1] = best_format[1][quant][j]; + best_combined_format[quant][intcnt][2] = best_format[2][quant][k]; + best_combined_format[quant][intcnt][3] = best_format[3][quant][l]; + } + } + } + } + } + } +} + +/** + * @brief For 4 partitions compute the best format and quantization for a given bit count. + * + * @param best_combined_error The best error for each quant level and integer count. + * @param best_combined_format The best format for each quant level and integer count. + * @param bits_available The number of bits available for encoding. + * @param[out] best_quant_level The output best color quant level. + * @param[out] best_quant_level_mod The output best color quant level assuming eight more bits are available. + * @param[out] best_formats The output best color formats. + * + * @return The output error for the best pairing. + */ +static float four_partitions_find_best_combination_for_bitcount( + const float best_combined_error[21][13], + const uint8_t best_combined_format[21][13][4], + int bits_available, + uint8_t& best_quant_level, + uint8_t& best_quant_level_mod, + uint8_t* best_formats +) { + int best_integer_count = 0; + float best_integer_count_error = ERROR_CALC_DEFAULT; + + for (int integer_count = 4; integer_count <= 9; integer_count++) + { + // Compute the quantization level for a given number of integers and a given number of bits + int quant_level = quant_mode_table[integer_count][bits_available]; + + // Don't have enough bits to represent a given endpoint format at all!
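+ // (The 1 partition search uses continue here rather than break; for the multi partition searches break is safe because, at a fixed bit budget, the feasible quant level only drops as the integer count grows, so once one count fails all larger counts fail too.)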
+ if (quant_level < QUANT_6) + { + break; + } + + float integer_count_error = best_combined_error[quant_level][integer_count - 4]; + if (integer_count_error < best_integer_count_error) + { + best_integer_count_error = integer_count_error; + best_integer_count = integer_count; + } + } + + int ql = quant_mode_table[best_integer_count][bits_available]; + int ql_mod = quant_mode_table[best_integer_count][bits_available + 8]; + + best_quant_level = static_cast<uint8_t>(ql); + best_quant_level_mod = static_cast<uint8_t>(ql_mod); + + if (ql >= QUANT_6) + { + for (int i = 0; i < 4; i++) + { + best_formats[i] = best_combined_format[ql][best_integer_count - 4][i]; + } + } + else + { + for (int i = 0; i < 4; i++) + { + best_formats[i] = FMT_LUMINANCE; + } + } + + return best_integer_count_error; +} + +/* See header for documentation. */ +unsigned int compute_ideal_endpoint_formats( + const partition_info& pi, + const image_block& blk, + const endpoints& ep, + // bitcounts and errors computed for the various quantization methods + const int8_t* qwt_bitcounts, + const float* qwt_errors, + unsigned int tune_candidate_limit, + unsigned int start_block_mode, + unsigned int end_block_mode, + // output data + uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS], + int block_mode[TUNE_MAX_TRIAL_CANDIDATES], + quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES], + quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES], + compression_working_buffers& tmpbuf +) { + int partition_count = pi.partition_count; + + promise(partition_count > 0); + + bool encode_hdr_rgb = static_cast<bool>(blk.rgb_lns[0]); + bool encode_hdr_alpha = static_cast<bool>(blk.alpha_lns[0]); + + // Compute the errors that result from various encoding choices (such as using luminance instead + // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on) + encoding_choice_errors eci[BLOCK_MAX_PARTITIONS]; + compute_encoding_choice_errors(blk, pi, ep, eci); + + float best_error[BLOCK_MAX_PARTITIONS][21][4]; + uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4]; + for (int i = 0; i < partition_count; i++) + { + compute_color_error_for_every_integer_count_and_quant_level( + encode_hdr_rgb, encode_hdr_alpha, i, + pi, eci[i], ep, blk.channel_weight, best_error[i], + format_of_choice[i]); + } + + float* errors_of_best_combination = tmpbuf.errors_of_best_combination; + uint8_t* best_quant_levels = tmpbuf.best_quant_levels; + uint8_t* best_quant_levels_mod = tmpbuf.best_quant_levels_mod; + uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats; + + // Ensure that the first iteration understep contains data that will never be picked + vfloat clear_error(ERROR_CALC_DEFAULT); + vint clear_quant(0); + + unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode); + storea(clear_error, errors_of_best_combination + packed_start_block_mode); + store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode); + store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode); + + // Ensure that last iteration overstep contains data that will never be picked + unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1); + storea(clear_error, errors_of_best_combination + packed_end_block_mode); + store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode); + store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode); + + // Track a scalar best to avoid 
expensive search at least once ... + float error_of_best_combination = ERROR_CALC_DEFAULT; + int index_of_best_combination = -1; + + // The block contains 1 partition + if (partition_count == 1) + { + for (unsigned int i = start_block_mode; i < end_block_mode; i++) + { + if (qwt_errors[i] >= ERROR_CALC_DEFAULT) + { + errors_of_best_combination[i] = ERROR_CALC_DEFAULT; + continue; + } + + float error_of_best = one_partition_find_best_combination_for_bitcount( + best_error[0], format_of_choice[0], qwt_bitcounts[i], + best_quant_levels[i], best_ep_formats[i][0]); + + float total_error = error_of_best + qwt_errors[i]; + errors_of_best_combination[i] = total_error; + best_quant_levels_mod[i] = best_quant_levels[i]; + + if (total_error < error_of_best_combination) + { + error_of_best_combination = total_error; + index_of_best_combination = i; + } + } + } + // The block contains 2 partitions + else if (partition_count == 2) + { + float combined_best_error[21][7]; + uint8_t formats_of_choice[21][7][2]; + + two_partitions_find_best_combination_for_every_quantization_and_integer_count( + best_error, format_of_choice, combined_best_error, formats_of_choice); + + assert(start_block_mode == 0); + for (unsigned int i = 0; i < end_block_mode; i++) + { + if (qwt_errors[i] >= ERROR_CALC_DEFAULT) + { + errors_of_best_combination[i] = ERROR_CALC_DEFAULT; + continue; + } + + float error_of_best = two_partitions_find_best_combination_for_bitcount( + combined_best_error, formats_of_choice, qwt_bitcounts[i], + best_quant_levels[i], best_quant_levels_mod[i], + best_ep_formats[i]); + + float total_error = error_of_best + qwt_errors[i]; + errors_of_best_combination[i] = total_error; + + if (total_error < error_of_best_combination) + { + error_of_best_combination = total_error; + index_of_best_combination = i; + } + } + } + // The block contains 3 partitions + else if (partition_count == 3) + { + float combined_best_error[21][10]; + uint8_t formats_of_choice[21][10][3]; + + three_partitions_find_best_combination_for_every_quantization_and_integer_count( + best_error, format_of_choice, combined_best_error, formats_of_choice); + + assert(start_block_mode == 0); + for (unsigned int i = 0; i < end_block_mode; i++) + { + if (qwt_errors[i] >= ERROR_CALC_DEFAULT) + { + errors_of_best_combination[i] = ERROR_CALC_DEFAULT; + continue; + } + + float error_of_best = three_partitions_find_best_combination_for_bitcount( + combined_best_error, formats_of_choice, qwt_bitcounts[i], + best_quant_levels[i], best_quant_levels_mod[i], + best_ep_formats[i]); + + float total_error = error_of_best + qwt_errors[i]; + errors_of_best_combination[i] = total_error; + + if (total_error < error_of_best_combination) + { + error_of_best_combination = total_error; + index_of_best_combination = i; + } + } + } + // The block contains 4 partitions + else // if (partition_count == 4) + { + assert(partition_count == 4); + float combined_best_error[21][13]; + uint8_t formats_of_choice[21][13][4]; + + four_partitions_find_best_combination_for_every_quantization_and_integer_count( + best_error, format_of_choice, combined_best_error, formats_of_choice); + + assert(start_block_mode == 0); + for (unsigned int i = 0; i < end_block_mode; i++) + { + if (qwt_errors[i] >= ERROR_CALC_DEFAULT) + { + errors_of_best_combination[i] = ERROR_CALC_DEFAULT; + continue; + } + + float error_of_best = four_partitions_find_best_combination_for_bitcount( + combined_best_error, formats_of_choice, qwt_bitcounts[i], + best_quant_levels[i], best_quant_levels_mod[i], + 
best_ep_formats[i]); + + float total_error = error_of_best + qwt_errors[i]; + errors_of_best_combination[i] = total_error; + + if (total_error < error_of_best_combination) + { + error_of_best_combination = total_error; + index_of_best_combination = i; + } + } + } + + int best_error_weights[TUNE_MAX_TRIAL_CANDIDATES]; + + // Fast path the first result and avoid the list search for trial 0 + best_error_weights[0] = index_of_best_combination; + if (index_of_best_combination >= 0) + { + errors_of_best_combination[index_of_best_combination] = ERROR_CALC_DEFAULT; + } + + // Search the remaining results and pick the best candidate modes for trial 1+ + for (unsigned int i = 1; i < tune_candidate_limit; i++) + { + vint vbest_error_index(-1); + vfloat vbest_ep_error(ERROR_CALC_DEFAULT); + + start_block_mode = round_down_to_simd_multiple_vla(start_block_mode); + vint lane_ids = vint::lane_id() + vint(start_block_mode); + for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH) + { + vfloat err = vfloat(errors_of_best_combination + j); + vmask mask = err < vbest_ep_error; + vbest_ep_error = select(vbest_ep_error, err, mask); + vbest_error_index = select(vbest_error_index, lane_ids, mask); + lane_ids += vint(ASTCENC_SIMD_WIDTH); + } + + // Pick best mode from the SIMD result, using lowest matching index to ensure invariance + vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error); + vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error); + vbest_error_index = hmin(vbest_error_index); + int best_error_index = vbest_error_index.lane<0>(); + + best_error_weights[i] = best_error_index; + + // Max the error for this candidate so we don't pick it again + if (best_error_index >= 0) + { + errors_of_best_combination[best_error_index] = ERROR_CALC_DEFAULT; + } + // Early-out if no more candidates are valid + else + { + break; + } + } + + for (unsigned int i = 0; i < tune_candidate_limit; i++) + { + if (best_error_weights[i] < 0) + { + return i; + } + + block_mode[i] = best_error_weights[i]; + + quant_level[i] = static_cast<quant_method>(best_quant_levels[best_error_weights[i]]); + quant_level_mod[i] = static_cast<quant_method>(best_quant_levels_mod[best_error_weights[i]]); + + assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256); + assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256); + + for (int j = 0; j < partition_count; j++) + { + partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j]; + } + } + + return tune_candidate_limit; +} + +#endif diff --git a/thirdparty/astcenc/astcenc_platform_isa_detection.cpp b/thirdparty/astcenc/astcenc_platform_isa_detection.cpp new file mode 100644 index 0000000000..8ed98437ea --- /dev/null +++ b/thirdparty/astcenc/astcenc_platform_isa_detection.cpp @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2020-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Platform-specific function implementations. + * + * This module contains functions for querying the host extended ISA support. + */ + +// Include before the defines below to pick up any auto-setup based on compiler +// built-in config, if not being set explicitly by the build system +#include "astcenc_internal.h" + +#if (ASTCENC_SSE > 0) || (ASTCENC_AVX > 0) || \ + (ASTCENC_POPCNT > 0) || (ASTCENC_F16C > 0) + +static bool g_init { false }; + +/** Does this CPU support SSE 4.1? Valid only once g_init is set. */ +static bool g_cpu_has_sse41 { false }; + +/** Does this CPU support AVX2? Valid only once g_init is set. */ +static bool g_cpu_has_avx2 { false }; + +/** Does this CPU support POPCNT? Valid only once g_init is set. */ +static bool g_cpu_has_popcnt { false }; + +/** Does this CPU support F16C? Valid only once g_init is set. */ +static bool g_cpu_has_f16c { false }; + +/* ============================================================================ + Platform code for Visual Studio +============================================================================ */ +#if !defined(__clang__) && defined(_MSC_VER) +#define WIN32_LEAN_AND_MEAN +#include <windows.h> +#include <intrin.h> + +/** + * @brief Detect platform CPU ISA support and update global trackers. + */ +static void detect_cpu_isa() +{ + int data[4]; + + __cpuid(data, 0); + int num_id = data[0]; + + if (num_id >= 1) + { + __cpuidex(data, 1, 0); + // SSE41 = Bank 1, ECX, bit 19 + g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false; + // POPCNT = Bank 1, ECX, bit 23 + g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false; + // F16C = Bank 1, ECX, bit 29 + g_cpu_has_f16c = data[2] & (1 << 29) ? true : false; + } + + if (num_id >= 7) + { + __cpuidex(data, 7, 0); + // AVX2 = Bank 7, EBX, bit 5 + g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false; + } + + // Ensure state bits are updated before init flag is updated + MemoryBarrier(); + g_init = true; +} + +/* ============================================================================ + Platform code for GCC and Clang +============================================================================ */ +#else +#include <cpuid.h> + +/** + * @brief Detect platform CPU ISA support and update global trackers. + */ +static void detect_cpu_isa() +{ + unsigned int data[4]; + + if (__get_cpuid_count(1, 0, &data[0], &data[1], &data[2], &data[3])) + { + // SSE41 = Bank 1, ECX, bit 19 + g_cpu_has_sse41 = data[2] & (1 << 19) ? true : false; + // POPCNT = Bank 1, ECX, bit 23 + g_cpu_has_popcnt = data[2] & (1 << 23) ? true : false; + // F16C = Bank 1, ECX, bit 29 + g_cpu_has_f16c = data[2] & (1 << 29) ? true : false; + } + + g_cpu_has_avx2 = false; + if (__get_cpuid_count(7, 0, &data[0], &data[1], &data[2], &data[3])) + { + // AVX2 = Bank 7, EBX, bit 5 + g_cpu_has_avx2 = data[1] & (1 << 5) ? true : false; + } + + // Ensure state bits are updated before init flag is updated + __sync_synchronize(); + g_init = true; +} +#endif + +/* See header for documentation. */ +bool cpu_supports_popcnt() +{ + if (!g_init) + { + detect_cpu_isa(); + } + + return g_cpu_has_popcnt; +} + +/* See header for documentation. */ +bool cpu_supports_f16c() +{ + if (!g_init) + { + detect_cpu_isa(); + } + + return g_cpu_has_f16c; +} + +/* See header for documentation.
*/ +bool cpu_supports_sse41() +{ + if (!g_init) + { + detect_cpu_isa(); + } + + return g_cpu_has_sse41; +} + +/* See header for documentation. */ +bool cpu_supports_avx2() +{ + if (!g_init) + { + detect_cpu_isa(); + } + + return g_cpu_has_avx2; +} + +#endif diff --git a/thirdparty/astcenc/astcenc_quantization.cpp b/thirdparty/astcenc/astcenc_quantization.cpp new file mode 100644 index 0000000000..478a21ead7 --- /dev/null +++ b/thirdparty/astcenc/astcenc_quantization.cpp @@ -0,0 +1,904 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions and data tables for numeric quantization.. + */ + +#include "astcenc_internal.h" + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +// Starts from QUANT_6 +// Not scrambled +const uint8_t color_unquant_to_uquant_tables[17][256] { + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, + 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 204, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, + 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 73, 109, 109, 109, 109, + 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, + 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, 109, + 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, + 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, 146, + 146, 146, 146, 146, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 
182, 182, + 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, 182, + 182, 182, 182, 182, 182, 182, 182, 182, 182, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 219, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, + 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, 113, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 142, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, 171, 199, 199, 199, 199, 199, 199, 199, + 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, 199, + 199, 199, 199, 199, 199, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, 227, + 227, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, 46, 46, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, 69, + 69, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, 92, + 92, 92, 92, 92, 92, 92, 92, 92, 92, 116, 116, 116, 116, 116, 116, 116, + 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, 116, + 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, 139, + 139, 139, 139, 139, 139, 139, 139, 163, 163, 163, 163, 163, 163, 163, 163, 163, + 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 163, 186, + 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, 186, + 186, 186, 186, 186, 186, 186, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, + 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 209, 232, 232, 232, + 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, 232, + 232, 232, 232, 232, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 51, 68, 68, 68, 68, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 68, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 85, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 102, 119, + 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, 119, + 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, 136, + 136, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 153, 
+ 153, 153, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, 187, + 187, 187, 187, 187, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, + 204, 204, 204, 204, 204, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, 221, + 221, 221, 221, 221, 221, 221, 238, 238, 238, 238, 238, 238, 238, 238, 238, 238, + 238, 238, 238, 238, 238, 238, 238, 255, 255, 255, 255, 255, 255, 255, 255, 255 + }, + { + 0, 0, 0, 0, 0, 0, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 54, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, 67, 67, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, 94, 94, 94, 94, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, 107, + 107, 107, 107, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, + 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 148, 148, 148, + 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 161, 161, 161, 161, 161, + 161, 161, 161, 161, 161, 161, 161, 161, 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 188, 188, 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, 201, + 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 215, 228, 228, + 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 228, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, 242, 255, 255, 255, 255, 255, 255, 255 + }, + { + 0, 0, 0, 0, 0, 0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 33, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, 77, 77, 77, 77, 77, 77, 77, 77, + 77, 77, 77, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 88, 99, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 99, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, 121, + 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 134, 145, 145, 145, 145, + 145, 145, 145, 145, 145, 145, 145, 156, 156, 156, 156, 156, 156, 156, 156, 156, + 156, 156, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 167, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 200, 211, 211, + 211, 211, 211, 211, 211, 211, 211, 211, 211, 222, 222, 222, 222, 222, 222, 222, + 222, 222, 222, 222, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 233, 244, + 244, 244, 244, 244, 244, 244, 244, 244, 244, 244, 255, 255, 255, 255, 255, 255 + }, + { + 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 16, 16, 16, + 16, 16, 16, 16, 16, 24, 24, 24, 24, 24, 24, 24, 24, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 41, 41, 41, 41, 41, 41, 41, 41, 49, 49, + 49, 49, 49, 49, 49, 49, 57, 57, 57, 57, 57, 57, 57, 57, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 74, 74, 74, 74, 74, 74, 74, 74, 82, + 82, 82, 82, 82, 82, 82, 82, 90, 90, 90, 90, 90, 90, 90, 90, 99, + 99, 99, 99, 99, 99, 99, 99, 99, 107, 107, 107, 107, 107, 107, 107, 107, + 115, 115, 115, 115, 115, 115, 115, 115, 123, 123, 123, 123, 123, 123, 123, 123, + 132, 132, 132, 132, 132, 132, 132, 132, 140, 140, 140, 140, 140, 140, 140, 140, + 148, 148, 
148, 148, 148, 148, 148, 148, 156, 156, 156, 156, 156, 156, 156, 156, + 156, 165, 165, 165, 165, 165, 165, 165, 165, 173, 173, 173, 173, 173, 173, 173, + 173, 181, 181, 181, 181, 181, 181, 181, 181, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 198, 198, 198, 198, 198, 198, 198, 198, 206, 206, 206, 206, 206, 206, + 206, 206, 214, 214, 214, 214, 214, 214, 214, 214, 222, 222, 222, 222, 222, 222, + 222, 222, 222, 231, 231, 231, 231, 231, 231, 231, 231, 239, 239, 239, 239, 239, + 239, 239, 239, 247, 247, 247, 247, 247, 247, 247, 247, 255, 255, 255, 255, 255 + }, + { + 0, 0, 0, 0, 6, 6, 6, 6, 6, 6, 13, 13, 13, 13, 13, 13, + 13, 19, 19, 19, 19, 19, 19, 26, 26, 26, 26, 26, 26, 26, 32, 32, + 32, 32, 32, 32, 39, 39, 39, 39, 39, 39, 39, 45, 45, 45, 45, 45, + 45, 52, 52, 52, 52, 52, 52, 52, 58, 58, 58, 58, 58, 58, 65, 65, + 65, 65, 65, 65, 65, 71, 71, 71, 71, 71, 71, 78, 78, 78, 78, 78, + 78, 78, 84, 84, 84, 84, 84, 84, 91, 91, 91, 91, 91, 91, 91, 97, + 97, 97, 97, 97, 97, 104, 104, 104, 104, 104, 104, 104, 110, 110, 110, 110, + 110, 110, 117, 117, 117, 117, 117, 117, 117, 123, 123, 123, 123, 123, 123, 123, + 132, 132, 132, 132, 132, 132, 132, 138, 138, 138, 138, 138, 138, 138, 145, 145, + 145, 145, 145, 145, 151, 151, 151, 151, 151, 151, 151, 158, 158, 158, 158, 158, + 158, 164, 164, 164, 164, 164, 164, 164, 171, 171, 171, 171, 171, 171, 177, 177, + 177, 177, 177, 177, 177, 184, 184, 184, 184, 184, 184, 190, 190, 190, 190, 190, + 190, 190, 197, 197, 197, 197, 197, 197, 203, 203, 203, 203, 203, 203, 203, 210, + 210, 210, 210, 210, 210, 216, 216, 216, 216, 216, 216, 216, 223, 223, 223, 223, + 223, 223, 229, 229, 229, 229, 229, 229, 229, 236, 236, 236, 236, 236, 236, 242, + 242, 242, 242, 242, 242, 242, 249, 249, 249, 249, 249, 249, 255, 255, 255, 255 + }, + { + 0, 0, 0, 5, 5, 5, 5, 5, 5, 11, 11, 11, 11, 11, 16, 16, + 16, 16, 16, 21, 21, 21, 21, 21, 21, 27, 27, 27, 27, 27, 32, 32, + 32, 32, 32, 32, 38, 38, 38, 38, 38, 43, 43, 43, 43, 43, 48, 48, + 48, 48, 48, 48, 54, 54, 54, 54, 54, 59, 59, 59, 59, 59, 59, 65, + 65, 65, 65, 65, 70, 70, 70, 70, 70, 70, 76, 76, 76, 76, 76, 81, + 81, 81, 81, 81, 86, 86, 86, 86, 86, 86, 92, 92, 92, 92, 92, 97, + 97, 97, 97, 97, 97, 103, 103, 103, 103, 103, 108, 108, 108, 108, 108, 113, + 113, 113, 113, 113, 113, 119, 119, 119, 119, 119, 124, 124, 124, 124, 124, 124, + 131, 131, 131, 131, 131, 131, 136, 136, 136, 136, 136, 142, 142, 142, 142, 142, + 142, 147, 147, 147, 147, 147, 152, 152, 152, 152, 152, 158, 158, 158, 158, 158, + 158, 163, 163, 163, 163, 163, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174, + 174, 179, 179, 179, 179, 179, 185, 185, 185, 185, 185, 185, 190, 190, 190, 190, + 190, 196, 196, 196, 196, 196, 196, 201, 201, 201, 201, 201, 207, 207, 207, 207, + 207, 207, 212, 212, 212, 212, 212, 217, 217, 217, 217, 217, 223, 223, 223, 223, + 223, 223, 228, 228, 228, 228, 228, 234, 234, 234, 234, 234, 234, 239, 239, 239, + 239, 239, 244, 244, 244, 244, 244, 250, 250, 250, 250, 250, 250, 255, 255, 255 + }, + { + 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, + 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28, 32, + 32, 32, 32, 36, 36, 36, 36, 40, 40, 40, 40, 44, 44, 44, 44, 48, + 48, 48, 48, 52, 52, 52, 52, 56, 56, 56, 56, 60, 60, 60, 60, 65, + 65, 65, 65, 65, 69, 69, 69, 69, 73, 73, 73, 73, 77, 77, 77, 77, + 81, 81, 81, 81, 85, 85, 85, 85, 89, 89, 89, 89, 93, 93, 93, 93, + 97, 97, 97, 97, 101, 101, 101, 101, 105, 105, 105, 105, 109, 109, 109, 109, + 113, 113, 113, 113, 117, 117, 117, 117, 121, 121, 121, 121, 125, 125, 125, 125, + 130, 130, 130, 
130, 134, 134, 134, 134, 138, 138, 138, 138, 142, 142, 142, 142, + 146, 146, 146, 146, 150, 150, 150, 150, 154, 154, 154, 154, 158, 158, 158, 158, + 162, 162, 162, 162, 166, 166, 166, 166, 170, 170, 170, 170, 174, 174, 174, 174, + 178, 178, 178, 178, 182, 182, 182, 182, 186, 186, 186, 186, 190, 190, 190, 190, + 190, 195, 195, 195, 195, 199, 199, 199, 199, 203, 203, 203, 203, 207, 207, 207, + 207, 211, 211, 211, 211, 215, 215, 215, 215, 219, 219, 219, 219, 223, 223, 223, + 223, 227, 227, 227, 227, 231, 231, 231, 231, 235, 235, 235, 235, 239, 239, 239, + 239, 243, 243, 243, 243, 247, 247, 247, 247, 251, 251, 251, 251, 255, 255, 255 + }, + { + 0, 0, 3, 3, 3, 6, 6, 6, 9, 9, 9, 9, 13, 13, 13, 16, + 16, 16, 19, 19, 19, 22, 22, 22, 25, 25, 25, 25, 29, 29, 29, 32, + 32, 32, 35, 35, 35, 38, 38, 38, 38, 42, 42, 42, 45, 45, 45, 48, + 48, 48, 51, 51, 51, 54, 54, 54, 54, 58, 58, 58, 61, 61, 61, 64, + 64, 64, 67, 67, 67, 67, 71, 71, 71, 74, 74, 74, 77, 77, 77, 80, + 80, 80, 83, 83, 83, 83, 87, 87, 87, 90, 90, 90, 93, 93, 93, 96, + 96, 96, 96, 100, 100, 100, 103, 103, 103, 106, 106, 106, 109, 109, 109, 112, + 112, 112, 112, 116, 116, 116, 119, 119, 119, 122, 122, 122, 125, 125, 125, 125, + 130, 130, 130, 130, 133, 133, 133, 136, 136, 136, 139, 139, 139, 143, 143, 143, + 143, 146, 146, 146, 149, 149, 149, 152, 152, 152, 155, 155, 155, 159, 159, 159, + 159, 162, 162, 162, 165, 165, 165, 168, 168, 168, 172, 172, 172, 172, 175, 175, + 175, 178, 178, 178, 181, 181, 181, 184, 184, 184, 188, 188, 188, 188, 191, 191, + 191, 194, 194, 194, 197, 197, 197, 201, 201, 201, 201, 204, 204, 204, 207, 207, + 207, 210, 210, 210, 213, 213, 213, 217, 217, 217, 217, 220, 220, 220, 223, 223, + 223, 226, 226, 226, 230, 230, 230, 230, 233, 233, 233, 236, 236, 236, 239, 239, + 239, 242, 242, 242, 246, 246, 246, 246, 249, 249, 249, 252, 252, 252, 255, 255 + }, + { + 0, 0, 2, 2, 5, 5, 5, 8, 8, 8, 10, 10, 13, 13, 13, 16, + 16, 16, 18, 18, 21, 21, 21, 24, 24, 24, 26, 26, 29, 29, 29, 32, + 32, 32, 35, 35, 35, 37, 37, 40, 40, 40, 43, 43, 43, 45, 45, 48, + 48, 48, 51, 51, 51, 53, 53, 56, 56, 56, 59, 59, 59, 61, 61, 64, + 64, 64, 67, 67, 67, 70, 70, 70, 72, 72, 75, 75, 75, 78, 78, 78, + 80, 80, 83, 83, 83, 86, 86, 86, 88, 88, 91, 91, 91, 94, 94, 94, + 96, 96, 99, 99, 99, 102, 102, 102, 104, 104, 107, 107, 107, 110, 110, 110, + 112, 112, 115, 115, 115, 118, 118, 118, 120, 120, 123, 123, 123, 126, 126, 126, + 129, 129, 129, 132, 132, 132, 135, 135, 137, 137, 137, 140, 140, 140, 143, 143, + 145, 145, 145, 148, 148, 148, 151, 151, 153, 153, 153, 156, 156, 156, 159, 159, + 161, 161, 161, 164, 164, 164, 167, 167, 169, 169, 169, 172, 172, 172, 175, 175, + 177, 177, 177, 180, 180, 180, 183, 183, 185, 185, 185, 188, 188, 188, 191, 191, + 191, 194, 194, 196, 196, 196, 199, 199, 199, 202, 202, 204, 204, 204, 207, 207, + 207, 210, 210, 212, 212, 212, 215, 215, 215, 218, 218, 220, 220, 220, 223, 223, + 223, 226, 226, 226, 229, 229, 231, 231, 231, 234, 234, 234, 237, 237, 239, 239, + 239, 242, 242, 242, 245, 245, 247, 247, 247, 250, 250, 250, 253, 253, 255, 255 + }, + { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, 56, 56, 58, 58, 60, 60, 62, 62, + 64, 64, 66, 66, 68, 68, 70, 70, 72, 72, 74, 74, 76, 76, 78, 78, + 80, 80, 82, 82, 84, 84, 86, 86, 88, 88, 90, 90, 92, 92, 94, 94, + 96, 96, 98, 98, 100, 100, 102, 102, 104, 104, 106, 106, 108, 108, 110, 110, + 112, 112, 114, 114, 
116, 116, 118, 118, 120, 120, 122, 122, 124, 124, 126, 126, + 129, 129, 131, 131, 133, 133, 135, 135, 137, 137, 139, 139, 141, 141, 143, 143, + 145, 145, 147, 147, 149, 149, 151, 151, 153, 153, 155, 155, 157, 157, 159, 159, + 161, 161, 163, 163, 165, 165, 167, 167, 169, 169, 171, 171, 173, 173, 175, 175, + 177, 177, 179, 179, 181, 181, 183, 183, 185, 185, 187, 187, 189, 189, 191, 191, + 193, 193, 195, 195, 197, 197, 199, 199, 201, 201, 203, 203, 205, 205, 207, 207, + 209, 209, 211, 211, 213, 213, 215, 215, 217, 217, 219, 219, 221, 221, 223, 223, + 225, 225, 227, 227, 229, 229, 231, 231, 233, 233, 235, 235, 237, 237, 239, 239, + 241, 241, 243, 243, 245, 245, 247, 247, 249, 249, 251, 251, 253, 253, 255, 255 + }, + { + 0, 1, 1, 3, 4, 4, 6, 6, 8, 9, 9, 11, 12, 12, 14, 14, + 16, 17, 17, 19, 20, 20, 22, 22, 24, 25, 25, 27, 28, 28, 30, 30, + 32, 33, 33, 35, 36, 36, 38, 38, 40, 41, 41, 43, 44, 44, 46, 46, + 48, 49, 49, 51, 52, 52, 54, 54, 56, 57, 57, 59, 60, 60, 62, 62, + 64, 65, 65, 67, 68, 68, 70, 70, 72, 73, 73, 75, 76, 76, 78, 78, + 80, 81, 81, 83, 84, 84, 86, 86, 88, 89, 89, 91, 92, 92, 94, 94, + 96, 97, 97, 99, 100, 100, 102, 102, 104, 105, 105, 107, 108, 108, 110, 110, + 112, 113, 113, 115, 116, 116, 118, 118, 120, 121, 121, 123, 124, 124, 126, 126, + 129, 129, 131, 131, 132, 134, 134, 135, 137, 137, 139, 139, 140, 142, 142, 143, + 145, 145, 147, 147, 148, 150, 150, 151, 153, 153, 155, 155, 156, 158, 158, 159, + 161, 161, 163, 163, 164, 166, 166, 167, 169, 169, 171, 171, 172, 174, 174, 175, + 177, 177, 179, 179, 180, 182, 182, 183, 185, 185, 187, 187, 188, 190, 190, 191, + 193, 193, 195, 195, 196, 198, 198, 199, 201, 201, 203, 203, 204, 206, 206, 207, + 209, 209, 211, 211, 212, 214, 214, 215, 217, 217, 219, 219, 220, 222, 222, 223, + 225, 225, 227, 227, 228, 230, 230, 231, 233, 233, 235, 235, 236, 238, 238, 239, + 241, 241, 243, 243, 244, 246, 246, 247, 249, 249, 251, 251, 252, 254, 254, 255 + }, + { + 0, 1, 2, 2, 4, 5, 6, 6, 8, 9, 10, 10, 12, 13, 14, 14, + 16, 17, 18, 18, 20, 21, 22, 22, 24, 25, 26, 26, 28, 29, 30, 30, + 32, 33, 34, 34, 36, 37, 38, 38, 40, 41, 42, 42, 44, 45, 46, 46, + 48, 49, 50, 50, 52, 53, 54, 54, 56, 57, 58, 58, 60, 61, 62, 62, + 64, 65, 66, 66, 68, 69, 70, 70, 72, 73, 74, 74, 76, 77, 78, 78, + 80, 81, 82, 82, 84, 85, 86, 86, 88, 89, 90, 90, 92, 93, 94, 94, + 96, 97, 98, 98, 100, 101, 102, 102, 104, 105, 106, 106, 108, 109, 110, 110, + 112, 113, 114, 114, 116, 117, 118, 118, 120, 121, 122, 122, 124, 125, 126, 126, + 129, 129, 130, 131, 133, 133, 134, 135, 137, 137, 138, 139, 141, 141, 142, 143, + 145, 145, 146, 147, 149, 149, 150, 151, 153, 153, 154, 155, 157, 157, 158, 159, + 161, 161, 162, 163, 165, 165, 166, 167, 169, 169, 170, 171, 173, 173, 174, 175, + 177, 177, 178, 179, 181, 181, 182, 183, 185, 185, 186, 187, 189, 189, 190, 191, + 193, 193, 194, 195, 197, 197, 198, 199, 201, 201, 202, 203, 205, 205, 206, 207, + 209, 209, 210, 211, 213, 213, 214, 215, 217, 217, 218, 219, 221, 221, 222, 223, + 225, 225, 226, 227, 229, 229, 230, 231, 233, 233, 234, 235, 237, 237, 238, 239, + 241, 241, 242, 243, 245, 245, 246, 247, 249, 249, 250, 251, 253, 253, 254, 255 + }, + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 
101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 + } +}; + +// Starts from QUANT_6 +// Scrambled +const uint8_t color_uquant_to_scrambled_pquant_tables[17][256] { + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 + }, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15 + }, + { + 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 11, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 1, 1, 1, 1 + }, + { + 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 9, + 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1 + }, + { + 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, + 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, + 4, 4, 4, 
4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, + 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, + 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, + 16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, + 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19, 19, 19, + 19, 20, 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, + 21, 22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, + 25, 25, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 29, 29, 29, 29, 29, + 29, 29, 29, 30, 30, 30, 30, 30, 30, 30, 30, 31, 31, 31, 31, 31 + }, + { + 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 16, 16, 16, 16, 16, 16, + 16, 24, 24, 24, 24, 24, 24, 32, 32, 32, 32, 32, 32, 32, 2, 2, + 2, 2, 2, 2, 10, 10, 10, 10, 10, 10, 10, 18, 18, 18, 18, 18, + 18, 26, 26, 26, 26, 26, 26, 26, 34, 34, 34, 34, 34, 34, 4, 4, + 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 20, 20, 20, 20, 20, + 20, 20, 28, 28, 28, 28, 28, 28, 36, 36, 36, 36, 36, 36, 36, 6, + 6, 6, 6, 6, 6, 14, 14, 14, 14, 14, 14, 14, 22, 22, 22, 22, + 22, 22, 30, 30, 30, 30, 30, 30, 30, 38, 38, 38, 38, 38, 38, 38, + 39, 39, 39, 39, 39, 39, 39, 31, 31, 31, 31, 31, 31, 31, 23, 23, + 23, 23, 23, 23, 15, 15, 15, 15, 15, 15, 15, 7, 7, 7, 7, 7, + 7, 37, 37, 37, 37, 37, 37, 37, 29, 29, 29, 29, 29, 29, 21, 21, + 21, 21, 21, 21, 21, 13, 13, 13, 13, 13, 13, 5, 5, 5, 5, 5, + 5, 5, 35, 35, 35, 35, 35, 35, 27, 27, 27, 27, 27, 27, 27, 19, + 19, 19, 19, 19, 19, 11, 11, 11, 11, 11, 11, 11, 3, 3, 3, 3, + 3, 3, 33, 33, 33, 33, 33, 33, 33, 25, 25, 25, 25, 25, 25, 17, + 17, 17, 17, 17, 17, 17, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1 + }, + { + 0, 0, 0, 16, 16, 16, 16, 16, 16, 32, 32, 32, 32, 32, 2, 2, + 2, 2, 2, 18, 18, 18, 18, 18, 18, 34, 34, 34, 34, 34, 4, 4, + 4, 4, 4, 4, 20, 20, 20, 20, 20, 36, 36, 36, 36, 36, 6, 6, + 6, 6, 6, 6, 22, 22, 22, 22, 22, 38, 38, 38, 38, 38, 38, 8, + 8, 8, 8, 8, 24, 24, 24, 24, 24, 24, 40, 40, 40, 40, 40, 10, + 10, 10, 10, 10, 26, 26, 26, 26, 26, 26, 42, 42, 42, 42, 42, 12, + 12, 12, 12, 12, 12, 28, 28, 28, 28, 28, 44, 44, 44, 44, 44, 14, + 14, 14, 14, 14, 14, 30, 30, 30, 30, 30, 46, 46, 46, 46, 46, 46, + 47, 47, 47, 47, 47, 47, 31, 31, 31, 31, 31, 15, 15, 15, 15, 15, + 15, 45, 45, 45, 45, 45, 29, 29, 29, 29, 29, 13, 13, 13, 13, 13, + 13, 43, 43, 43, 43, 43, 27, 27, 27, 27, 27, 27, 11, 11, 11, 11, + 11, 41, 41, 41, 41, 41, 25, 25, 25, 25, 25, 25, 9, 9, 9, 9, + 9, 39, 39, 39, 39, 39, 39, 23, 23, 23, 23, 23, 7, 7, 7, 7, + 7, 7, 37, 37, 37, 37, 37, 21, 21, 21, 21, 21, 5, 5, 5, 5, + 5, 5, 35, 35, 35, 35, 35, 19, 19, 19, 19, 19, 19, 3, 3, 3, + 3, 3, 33, 33, 33, 33, 33, 17, 17, 17, 17, 17, 17, 1, 1, 1 + }, + { + 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, + 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, + 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, + 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, + 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, + 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, + 24, 24, 24, 24, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, + 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, + 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, + 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, + 40, 40, 40, 40, 41, 41, 41, 41, 42, 
42, 42, 42, 43, 43, 43, 43, + 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, + 47, 48, 48, 48, 48, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, + 51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, + 55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, + 59, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63 + }, + { + 0, 0, 16, 16, 16, 32, 32, 32, 48, 48, 48, 48, 64, 64, 64, 2, + 2, 2, 18, 18, 18, 34, 34, 34, 50, 50, 50, 50, 66, 66, 66, 4, + 4, 4, 20, 20, 20, 36, 36, 36, 36, 52, 52, 52, 68, 68, 68, 6, + 6, 6, 22, 22, 22, 38, 38, 38, 38, 54, 54, 54, 70, 70, 70, 8, + 8, 8, 24, 24, 24, 24, 40, 40, 40, 56, 56, 56, 72, 72, 72, 10, + 10, 10, 26, 26, 26, 26, 42, 42, 42, 58, 58, 58, 74, 74, 74, 12, + 12, 12, 12, 28, 28, 28, 44, 44, 44, 60, 60, 60, 76, 76, 76, 14, + 14, 14, 14, 30, 30, 30, 46, 46, 46, 62, 62, 62, 78, 78, 78, 78, + 79, 79, 79, 79, 63, 63, 63, 47, 47, 47, 31, 31, 31, 15, 15, 15, + 15, 77, 77, 77, 61, 61, 61, 45, 45, 45, 29, 29, 29, 13, 13, 13, + 13, 75, 75, 75, 59, 59, 59, 43, 43, 43, 27, 27, 27, 27, 11, 11, + 11, 73, 73, 73, 57, 57, 57, 41, 41, 41, 25, 25, 25, 25, 9, 9, + 9, 71, 71, 71, 55, 55, 55, 39, 39, 39, 39, 23, 23, 23, 7, 7, + 7, 69, 69, 69, 53, 53, 53, 37, 37, 37, 37, 21, 21, 21, 5, 5, + 5, 67, 67, 67, 51, 51, 51, 51, 35, 35, 35, 19, 19, 19, 3, 3, + 3, 65, 65, 65, 49, 49, 49, 49, 33, 33, 33, 17, 17, 17, 1, 1 + }, + { + 0, 0, 32, 32, 64, 64, 64, 2, 2, 2, 34, 34, 66, 66, 66, 4, + 4, 4, 36, 36, 68, 68, 68, 6, 6, 6, 38, 38, 70, 70, 70, 8, + 8, 8, 40, 40, 40, 72, 72, 10, 10, 10, 42, 42, 42, 74, 74, 12, + 12, 12, 44, 44, 44, 76, 76, 14, 14, 14, 46, 46, 46, 78, 78, 16, + 16, 16, 48, 48, 48, 80, 80, 80, 18, 18, 50, 50, 50, 82, 82, 82, + 20, 20, 52, 52, 52, 84, 84, 84, 22, 22, 54, 54, 54, 86, 86, 86, + 24, 24, 56, 56, 56, 88, 88, 88, 26, 26, 58, 58, 58, 90, 90, 90, + 28, 28, 60, 60, 60, 92, 92, 92, 30, 30, 62, 62, 62, 94, 94, 94, + 95, 95, 95, 63, 63, 63, 31, 31, 93, 93, 93, 61, 61, 61, 29, 29, + 91, 91, 91, 59, 59, 59, 27, 27, 89, 89, 89, 57, 57, 57, 25, 25, + 87, 87, 87, 55, 55, 55, 23, 23, 85, 85, 85, 53, 53, 53, 21, 21, + 83, 83, 83, 51, 51, 51, 19, 19, 81, 81, 81, 49, 49, 49, 17, 17, + 17, 79, 79, 47, 47, 47, 15, 15, 15, 77, 77, 45, 45, 45, 13, 13, + 13, 75, 75, 43, 43, 43, 11, 11, 11, 73, 73, 41, 41, 41, 9, 9, + 9, 71, 71, 71, 39, 39, 7, 7, 7, 69, 69, 69, 37, 37, 5, 5, + 5, 67, 67, 67, 35, 35, 3, 3, 3, 65, 65, 65, 33, 33, 1, 1 + }, + { + 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, + 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, + 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, + 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30, 31, 31, + 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 39, + 40, 40, 41, 41, 42, 42, 43, 43, 44, 44, 45, 45, 46, 46, 47, 47, + 48, 48, 49, 49, 50, 50, 51, 51, 52, 52, 53, 53, 54, 54, 55, 55, + 56, 56, 57, 57, 58, 58, 59, 59, 60, 60, 61, 61, 62, 62, 63, 63, + 64, 64, 65, 65, 66, 66, 67, 67, 68, 68, 69, 69, 70, 70, 71, 71, + 72, 72, 73, 73, 74, 74, 75, 75, 76, 76, 77, 77, 78, 78, 79, 79, + 80, 80, 81, 81, 82, 82, 83, 83, 84, 84, 85, 85, 86, 86, 87, 87, + 88, 88, 89, 89, 90, 90, 91, 91, 92, 92, 93, 93, 94, 94, 95, 95, + 96, 96, 97, 97, 98, 98, 99, 99, 100, 100, 101, 101, 102, 102, 103, 103, + 104, 104, 105, 105, 106, 106, 107, 107, 108, 108, 109, 109, 110, 110, 111, 111, + 112, 112, 113, 113, 114, 114, 115, 115, 116, 116, 117, 117, 118, 118, 119, 119, + 120, 120, 121, 121, 122, 122, 123, 123, 124, 124, 125, 125, 126, 126, 127, 127 + }, + { + 0, 32, 32, 64, 
96, 96, 128, 128, 2, 34, 34, 66, 98, 98, 130, 130, + 4, 36, 36, 68, 100, 100, 132, 132, 6, 38, 38, 70, 102, 102, 134, 134, + 8, 40, 40, 72, 104, 104, 136, 136, 10, 42, 42, 74, 106, 106, 138, 138, + 12, 44, 44, 76, 108, 108, 140, 140, 14, 46, 46, 78, 110, 110, 142, 142, + 16, 48, 48, 80, 112, 112, 144, 144, 18, 50, 50, 82, 114, 114, 146, 146, + 20, 52, 52, 84, 116, 116, 148, 148, 22, 54, 54, 86, 118, 118, 150, 150, + 24, 56, 56, 88, 120, 120, 152, 152, 26, 58, 58, 90, 122, 122, 154, 154, + 28, 60, 60, 92, 124, 124, 156, 156, 30, 62, 62, 94, 126, 126, 158, 158, + 159, 159, 127, 127, 95, 63, 63, 31, 157, 157, 125, 125, 93, 61, 61, 29, + 155, 155, 123, 123, 91, 59, 59, 27, 153, 153, 121, 121, 89, 57, 57, 25, + 151, 151, 119, 119, 87, 55, 55, 23, 149, 149, 117, 117, 85, 53, 53, 21, + 147, 147, 115, 115, 83, 51, 51, 19, 145, 145, 113, 113, 81, 49, 49, 17, + 143, 143, 111, 111, 79, 47, 47, 15, 141, 141, 109, 109, 77, 45, 45, 13, + 139, 139, 107, 107, 75, 43, 43, 11, 137, 137, 105, 105, 73, 41, 41, 9, + 135, 135, 103, 103, 71, 39, 39, 7, 133, 133, 101, 101, 69, 37, 37, 5, + 131, 131, 99, 99, 67, 35, 35, 3, 129, 129, 97, 97, 65, 33, 33, 1 + }, + { + 0, 64, 128, 128, 2, 66, 130, 130, 4, 68, 132, 132, 6, 70, 134, 134, + 8, 72, 136, 136, 10, 74, 138, 138, 12, 76, 140, 140, 14, 78, 142, 142, + 16, 80, 144, 144, 18, 82, 146, 146, 20, 84, 148, 148, 22, 86, 150, 150, + 24, 88, 152, 152, 26, 90, 154, 154, 28, 92, 156, 156, 30, 94, 158, 158, + 32, 96, 160, 160, 34, 98, 162, 162, 36, 100, 164, 164, 38, 102, 166, 166, + 40, 104, 168, 168, 42, 106, 170, 170, 44, 108, 172, 172, 46, 110, 174, 174, + 48, 112, 176, 176, 50, 114, 178, 178, 52, 116, 180, 180, 54, 118, 182, 182, + 56, 120, 184, 184, 58, 122, 186, 186, 60, 124, 188, 188, 62, 126, 190, 190, + 191, 191, 127, 63, 189, 189, 125, 61, 187, 187, 123, 59, 185, 185, 121, 57, + 183, 183, 119, 55, 181, 181, 117, 53, 179, 179, 115, 51, 177, 177, 113, 49, + 175, 175, 111, 47, 173, 173, 109, 45, 171, 171, 107, 43, 169, 169, 105, 41, + 167, 167, 103, 39, 165, 165, 101, 37, 163, 163, 99, 35, 161, 161, 97, 33, + 159, 159, 95, 31, 157, 157, 93, 29, 155, 155, 91, 27, 153, 153, 89, 25, + 151, 151, 87, 23, 149, 149, 85, 21, 147, 147, 83, 19, 145, 145, 81, 17, + 143, 143, 79, 15, 141, 141, 77, 13, 139, 139, 75, 11, 137, 137, 73, 9, + 135, 135, 71, 7, 133, 133, 69, 5, 131, 131, 67, 3, 129, 129, 65, 1 + }, + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 
253, 254, 255 + } +}; + +#endif + +// Starts from QUANT_6 +// Scrambled +static const uint8_t color_scrambled_pquant_to_uquant_q6[6] { + 0, 255, 51, 204, 102, 153 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q8[8] { + 0, 36, 73, 109, 146, 182, 219, 255 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q10[10] { + 0, 255, 28, 227, 56, 199, 84, 171, 113, 142 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q12[12] { + 0, 255, 69, 186, 23, 232, 92, 163, 46, 209, 116, 139 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q16[16] { + 0, 17, 34, 51, 68, 85, 102, 119, 136, 153, 170, 187, 204, 221, 238, 255 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q20[20] { + 0, 255, 67, 188, 13, 242, 80, 175, 27, 228, 94, 161, 40, 215, 107, 148, + 54, 201, 121, 134 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q24[24] { + 0, 255, 33, 222, 66, 189, 99, 156, 11, 244, 44, 211, 77, 178, 110, 145, + 22, 233, 55, 200, 88, 167, 121, 134 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q32[32] { + 0, 8, 16, 24, 33, 41, 49, 57, 66, 74, 82, 90, 99, 107, 115, 123, + 132, 140, 148, 156, 165, 173, 181, 189, 198, 206, 214, 222, 231, 239, 247, 255 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q40[40] { + 0, 255, 32, 223, 65, 190, 97, 158, 6, 249, 39, 216, 71, 184, 104, 151, + 13, 242, 45, 210, 78, 177, 110, 145, 19, 236, 52, 203, 84, 171, 117, 138, + 26, 229, 58, 197, 91, 164, 123, 132 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q48[48] { + 0, 255, 16, 239, 32, 223, 48, 207, 65, 190, 81, 174, 97, 158, 113, 142, + 5, 250, 21, 234, 38, 217, 54, 201, 70, 185, 86, 169, 103, 152, 119, 136, + 11, 244, 27, 228, 43, 212, 59, 196, 76, 179, 92, 163, 108, 147, 124, 131 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q64[64] { + 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, + 65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125, + 130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190, + 195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255, +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q80[80] { + 0, 255, 16, 239, 32, 223, 48, 207, 64, 191, 80, 175, 96, 159, 112, 143, + 3, 252, 19, 236, 35, 220, 51, 204, 67, 188, 83, 172, 100, 155, 116, 139, + 6, 249, 22, 233, 38, 217, 54, 201, 71, 184, 87, 168, 103, 152, 119, 136, + 9, 246, 25, 230, 42, 213, 58, 197, 74, 181, 90, 165, 106, 149, 122, 133, + 13, 242, 29, 226, 45, 210, 61, 194, 77, 178, 93, 162, 109, 146, 125, 130 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q96[96] { + 0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199, + 64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135, + 2, 253, 10, 245, 18, 237, 26, 229, 35, 220, 43, 212, 51, 204, 59, 196, + 67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132, + 5, 250, 13, 242, 21, 234, 29, 226, 37, 218, 45, 210, 53, 202, 61, 194, + 70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q128[128] { + 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, + 129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159, + 161, 163, 
165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191, + 193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, + 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q160[160] { + 0, 255, 8, 247, 16, 239, 24, 231, 32, 223, 40, 215, 48, 207, 56, 199, + 64, 191, 72, 183, 80, 175, 88, 167, 96, 159, 104, 151, 112, 143, 120, 135, + 1, 254, 9, 246, 17, 238, 25, 230, 33, 222, 41, 214, 49, 206, 57, 198, + 65, 190, 73, 182, 81, 174, 89, 166, 97, 158, 105, 150, 113, 142, 121, 134, + 3, 252, 11, 244, 19, 236, 27, 228, 35, 220, 43, 212, 51, 204, 59, 196, + 67, 188, 75, 180, 83, 172, 91, 164, 99, 156, 107, 148, 115, 140, 123, 132, + 4, 251, 12, 243, 20, 235, 28, 227, 36, 219, 44, 211, 52, 203, 60, 195, + 68, 187, 76, 179, 84, 171, 92, 163, 100, 155, 108, 147, 116, 139, 124, 131, + 6, 249, 14, 241, 22, 233, 30, 225, 38, 217, 46, 209, 54, 201, 62, 193, + 70, 185, 78, 177, 86, 169, 94, 161, 102, 153, 110, 145, 118, 137, 126, 129 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q192[192] { + 0, 255, 4, 251, 8, 247, 12, 243, 16, 239, 20, 235, 24, 231, 28, 227, + 32, 223, 36, 219, 40, 215, 44, 211, 48, 207, 52, 203, 56, 199, 60, 195, + 64, 191, 68, 187, 72, 183, 76, 179, 80, 175, 84, 171, 88, 167, 92, 163, + 96, 159, 100, 155, 104, 151, 108, 147, 112, 143, 116, 139, 120, 135, 124, 131, + 1, 254, 5, 250, 9, 246, 13, 242, 17, 238, 21, 234, 25, 230, 29, 226, + 33, 222, 37, 218, 41, 214, 45, 210, 49, 206, 53, 202, 57, 198, 61, 194, + 65, 190, 69, 186, 73, 182, 77, 178, 81, 174, 85, 170, 89, 166, 93, 162, + 97, 158, 101, 154, 105, 150, 109, 146, 113, 142, 117, 138, 121, 134, 125, 130, + 2, 253, 6, 249, 10, 245, 14, 241, 18, 237, 22, 233, 26, 229, 30, 225, + 34, 221, 38, 217, 42, 213, 46, 209, 50, 205, 54, 201, 58, 197, 62, 193, + 66, 189, 70, 185, 74, 181, 78, 177, 82, 173, 86, 169, 90, 165, 94, 161, + 98, 157, 102, 153, 106, 149, 110, 145, 114, 141, 118, 137, 122, 133, 126, 129 +}; + +static const uint8_t color_scrambled_pquant_to_uquant_q256[256] { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, + 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, + 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, + 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, + 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, + 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, + 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, + 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255 +}; + +const uint8_t* color_scrambled_pquant_to_uquant_tables[17] { + color_scrambled_pquant_to_uquant_q6, + color_scrambled_pquant_to_uquant_q8, + color_scrambled_pquant_to_uquant_q10, + color_scrambled_pquant_to_uquant_q12, + 
color_scrambled_pquant_to_uquant_q16, + color_scrambled_pquant_to_uquant_q20, + color_scrambled_pquant_to_uquant_q24, + color_scrambled_pquant_to_uquant_q32, + color_scrambled_pquant_to_uquant_q40, + color_scrambled_pquant_to_uquant_q48, + color_scrambled_pquant_to_uquant_q64, + color_scrambled_pquant_to_uquant_q80, + color_scrambled_pquant_to_uquant_q96, + color_scrambled_pquant_to_uquant_q128, + color_scrambled_pquant_to_uquant_q160, + color_scrambled_pquant_to_uquant_q192, + color_scrambled_pquant_to_uquant_q256 +}; + +// The quant_mode_table[integer_count/2][bits] gives us the quantization level for a given integer +// count and number of bits that the integer may fit into. +const int8_t quant_mode_table[10][128] { + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }, + { + -1, -1, 0, 0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }, + { + -1, -1, -1, -1, 0, 0, 0, 1, 2, 2, 3, 4, 5, 5, 6, 7, + 8, 8, 9, 10, 11, 11, 12, 13, 14, 14, 15, 16, 17, 17, 18, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }, + { + -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, + 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }, + { + -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 1, 1, 1, + 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 7, + 8, 8, 8, 9, 9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 13, + 14, 14, 14, 15, 15, 16, 16, 16, 17, 17, 17, 18, 18, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }, + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 5, 5, + 5, 5, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10, + 10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 
14, 14, 14, + 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }, + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, + 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, + 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, + 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }, + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, + 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, + 6, 6, 6, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 9, 9, 9, + 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, + 13, 13, 13, 13, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, + 16, 16, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, + 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20 + }, + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, + 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 19, 19 + }, + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, + 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, + 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, + 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 9, 9, + 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, + 14, 14, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 17, 17 + } +}; diff --git a/thirdparty/astcenc/astcenc_symbolic_physical.cpp b/thirdparty/astcenc/astcenc_symbolic_physical.cpp new file mode 100644 index 0000000000..80221a6013 --- /dev/null +++ b/thirdparty/astcenc/astcenc_symbolic_physical.cpp @@ -0,0 +1,534 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions for converting between symbolic and physical encodings. + */ + +#include "astcenc_internal.h" + +#include <cassert> + +/** + * @brief Write up to 8 bits at an arbitrary bit offset. 
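+ *
+ * For example (illustrative only): writing a 3-bit value at bitoffset 6
+ * updates bits 6..7 of ptr[0] and bit 0 of ptr[1].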
+ * + * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so + * may span two separate bytes in memory. + * + * @param value The value to write. + * @param bitcount The number of bits to write, starting from LSB. + * @param bitoffset The bit offset to store at, between 0 and 7. + * @param[in,out] ptr The data pointer to write to. + */ +static inline void write_bits( + int value, + int bitcount, + int bitoffset, + uint8_t* ptr +) { + int mask = (1 << bitcount) - 1; + value &= mask; + ptr += bitoffset >> 3; + bitoffset &= 7; + value <<= bitoffset; + mask <<= bitoffset; + mask = ~mask; + + ptr[0] &= mask; + ptr[0] |= value; + ptr[1] &= mask >> 8; + ptr[1] |= value >> 8; +} + +/** + * @brief Read up to 8 bits at an arbitrary bit offset. + * + * The stored value is at most 8 bits, but can be stored at an offset of between 0 and 7 bits so may + * span two separate bytes in memory. + * + * @param bitcount The number of bits to read. + * @param bitoffset The bit offset to read from, between 0 and 7. + * @param[in,out] ptr The data pointer to read from. + * + * @return The read value. + */ +static inline int read_bits( + int bitcount, + int bitoffset, + const uint8_t* ptr +) { + int mask = (1 << bitcount) - 1; + ptr += bitoffset >> 3; + bitoffset &= 7; + int value = ptr[0] | (ptr[1] << 8); + value >>= bitoffset; + value &= mask; + return value; +} + +/** + * @brief Reverse bits in a byte. + * + * @param p The value to reverse. + * + * @return The reversed result. + */ +static inline int bitrev8(int p) +{ + p = ((p & 0x0F) << 4) | ((p >> 4) & 0x0F); + p = ((p & 0x33) << 2) | ((p >> 2) & 0x33); + p = ((p & 0x55) << 1) | ((p >> 1) & 0x55); + return p; +} + +/* See header for documentation. */ +void symbolic_to_physical( + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + physical_compressed_block& pcb +) { + assert(scb.block_type != SYM_BTYPE_ERROR); + + // Constant color block using UNORM16 colors + if (scb.block_type == SYM_BTYPE_CONST_U16) + { + // There is currently no attempt to coalesce larger void-extents + static const uint8_t cbytes[8] { 0xFC, 0xFD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + for (unsigned int i = 0; i < 8; i++) + { + pcb.data[i] = cbytes[i]; + } + + for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++) + { + pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF; + pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF; + } + + return; + } + + // Constant color block using FP16 colors + if (scb.block_type == SYM_BTYPE_CONST_F16) + { + // There is currently no attempt to coalesce larger void-extents + static const uint8_t cbytes[8] { 0xFC, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; + for (unsigned int i = 0; i < 8; i++) + { + pcb.data[i] = cbytes[i]; + } + + for (unsigned int i = 0; i < BLOCK_MAX_COMPONENTS; i++) + { + pcb.data[2 * i + 8] = scb.constant_color[i] & 0xFF; + pcb.data[2 * i + 9] = (scb.constant_color[i] >> 8) & 0xFF; + } + + return; + } + + unsigned int partition_count = scb.partition_count; + + // Compress the weights. 
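+	// (Illustrative example, not from the upstream sources: for QUANT_5, an
+	// unquantized weight of 32 maps to round((32 / 64) * (5 - 1)) = 2 before
+	// scrambling.)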
+ // They are encoded as an ordinary integer-sequence, then bit-reversed + uint8_t weightbuf[16] { 0 }; + + const auto& bm = bsd.get_block_mode(scb.block_mode); + const auto& di = bsd.get_decimation_info(bm.decimation_mode); + int weight_count = di.weight_count; + quant_method weight_quant_method = bm.get_weight_quant_mode(); + float weight_quant_levels = static_cast<float>(get_quant_level(weight_quant_method)); + int is_dual_plane = bm.is_dual_plane; + + const auto& qat = quant_and_xfer_tables[weight_quant_method]; + + int real_weight_count = is_dual_plane ? 2 * weight_count : weight_count; + + int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method); + + uint8_t weights[64]; + if (is_dual_plane) + { + for (int i = 0; i < weight_count; i++) + { + float uqw = static_cast<float>(scb.weights[i]); + float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f); + int qwi = static_cast<int>(qw + 0.5f); + weights[2 * i] = qat.scramble_map[qwi]; + + uqw = static_cast<float>(scb.weights[i + WEIGHTS_PLANE2_OFFSET]); + qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f); + qwi = static_cast<int>(qw + 0.5f); + weights[2 * i + 1] = qat.scramble_map[qwi]; + } + } + else + { + for (int i = 0; i < weight_count; i++) + { + float uqw = static_cast<float>(scb.weights[i]); + float qw = (uqw / 64.0f) * (weight_quant_levels - 1.0f); + int qwi = static_cast<int>(qw + 0.5f); + weights[i] = qat.scramble_map[qwi]; + } + } + + encode_ise(weight_quant_method, real_weight_count, weights, weightbuf, 0); + + for (int i = 0; i < 16; i++) + { + pcb.data[i] = static_cast<uint8_t>(bitrev8(weightbuf[15 - i])); + } + + write_bits(scb.block_mode, 11, 0, pcb.data); + write_bits(partition_count - 1, 2, 11, pcb.data); + + int below_weights_pos = 128 - bits_for_weights; + + // Encode partition index and color endpoint types for blocks with 2+ partitions + if (partition_count > 1) + { + write_bits(scb.partition_index, 6, 13, pcb.data); + write_bits(scb.partition_index >> 6, PARTITION_INDEX_BITS - 6, 19, pcb.data); + + if (scb.color_formats_matched) + { + write_bits(scb.color_formats[0] << 2, 6, 13 + PARTITION_INDEX_BITS, pcb.data); + } + else + { + // Check endpoint types for each partition to determine the lowest class present + int low_class = 4; + + for (unsigned int i = 0; i < partition_count; i++) + { + int class_of_format = scb.color_formats[i] >> 2; + low_class = astc::min(class_of_format, low_class); + } + + if (low_class == 3) + { + low_class = 2; + } + + int encoded_type = low_class + 1; + int bitpos = 2; + + for (unsigned int i = 0; i < partition_count; i++) + { + int classbit_of_format = (scb.color_formats[i] >> 2) - low_class; + encoded_type |= classbit_of_format << bitpos; + bitpos++; + } + + for (unsigned int i = 0; i < partition_count; i++) + { + int lowbits_of_format = scb.color_formats[i] & 3; + encoded_type |= lowbits_of_format << bitpos; + bitpos += 2; + } + + int encoded_type_lowpart = encoded_type & 0x3F; + int encoded_type_highpart = encoded_type >> 6; + int encoded_type_highpart_size = (3 * partition_count) - 4; + int encoded_type_highpart_pos = 128 - bits_for_weights - encoded_type_highpart_size; + write_bits(encoded_type_lowpart, 6, 13 + PARTITION_INDEX_BITS, pcb.data); + write_bits(encoded_type_highpart, encoded_type_highpart_size, encoded_type_highpart_pos, pcb.data); + below_weights_pos -= encoded_type_highpart_size; + } + } + else + { + write_bits(scb.color_formats[0], 4, 13, pcb.data); + } + + // In dual-plane mode, encode the color component of the second plane of weights + 
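+	// (The 2-bit component index is written immediately below the weight bits,
+	// after any high bits of the endpoint-type field have been placed.)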
if (is_dual_plane)
+	{
+		write_bits(scb.plane2_component, 2, below_weights_pos - 2, pcb.data);
+	}
+
+	// Encode the color components
+	uint8_t values_to_encode[32];
+	int valuecount_to_encode = 0;
+
+	const uint8_t* pack_table = color_uquant_to_scrambled_pquant_tables[scb.quant_mode - QUANT_6];
+	for (unsigned int i = 0; i < scb.partition_count; i++)
+	{
+		int vals = 2 * (scb.color_formats[i] >> 2) + 2;
+		assert(vals <= 8);
+		for (int j = 0; j < vals; j++)
+		{
+			values_to_encode[j + valuecount_to_encode] = pack_table[scb.color_values[i][j]];
+		}
+		valuecount_to_encode += vals;
+	}
+
+	encode_ise(scb.get_color_quant_mode(), valuecount_to_encode, values_to_encode, pcb.data,
+	           scb.partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS);
+}
+
+/* See header for documentation. */
+void physical_to_symbolic(
+	const block_size_descriptor& bsd,
+	const physical_compressed_block& pcb,
+	symbolic_compressed_block& scb
+) {
+	uint8_t bswapped[16];
+
+	scb.block_type = SYM_BTYPE_NONCONST;
+
+	// Extract header fields
+	int block_mode = read_bits(11, 0, pcb.data);
+	if ((block_mode & 0x1FF) == 0x1FC)
+	{
+		// Constant color block
+
+		// Check what format the data has
+		if (block_mode & 0x200)
+		{
+			scb.block_type = SYM_BTYPE_CONST_F16;
+		}
+		else
+		{
+			scb.block_type = SYM_BTYPE_CONST_U16;
+		}
+
+		scb.partition_count = 0;
+		for (int i = 0; i < 4; i++)
+		{
+			scb.constant_color[i] = pcb.data[2 * i + 8] | (pcb.data[2 * i + 9] << 8);
+		}
+
+		// Additionally, check that the void-extent coordinates are valid
+		if (bsd.zdim == 1)
+		{
+			// 2D void-extent
+			int rsvbits = read_bits(2, 10, pcb.data);
+			if (rsvbits != 3)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+
+			int vx_low_s = read_bits(8, 12, pcb.data) | (read_bits(5, 12 + 8, pcb.data) << 8);
+			int vx_high_s = read_bits(8, 25, pcb.data) | (read_bits(5, 25 + 8, pcb.data) << 8);
+			int vx_low_t = read_bits(8, 38, pcb.data) | (read_bits(5, 38 + 8, pcb.data) << 8);
+			int vx_high_t = read_bits(8, 51, pcb.data) | (read_bits(5, 51 + 8, pcb.data) << 8);
+
+			int all_ones = vx_low_s == 0x1FFF && vx_high_s == 0x1FFF && vx_low_t == 0x1FFF && vx_high_t == 0x1FFF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t) && !all_ones)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+		}
+		else
+		{
+			// 3D void-extent
+			int vx_low_s = read_bits(9, 10, pcb.data);
+			int vx_high_s = read_bits(9, 19, pcb.data);
+			int vx_low_t = read_bits(9, 28, pcb.data);
+			int vx_high_t = read_bits(9, 37, pcb.data);
+			int vx_low_p = read_bits(9, 46, pcb.data);
+			int vx_high_p = read_bits(9, 55, pcb.data);
+
+			int all_ones = vx_low_s == 0x1FF && vx_high_s == 0x1FF && vx_low_t == 0x1FF && vx_high_t == 0x1FF && vx_low_p == 0x1FF && vx_high_p == 0x1FF;
+
+			if ((vx_low_s >= vx_high_s || vx_low_t >= vx_high_t || vx_low_p >= vx_high_p) && !all_ones)
+			{
+				scb.block_type = SYM_BTYPE_ERROR;
+				return;
+			}
+		}
+
+		return;
+	}
+
+	unsigned int packed_index = bsd.block_mode_packed_index[block_mode];
+	if (packed_index == BLOCK_BAD_BLOCK_MODE)
+	{
+		scb.block_type = SYM_BTYPE_ERROR;
+		return;
+	}
+
+	const auto& bm = bsd.get_block_mode(block_mode);
+	const auto& di = bsd.get_decimation_info(bm.decimation_mode);
+
+	int weight_count = di.weight_count;
+	promise(weight_count > 0);
+
+	quant_method weight_quant_method = static_cast<quant_method>(bm.quant_mode);
+	int is_dual_plane = bm.is_dual_plane;
+
+	int real_weight_count = is_dual_plane ?
2 * weight_count : weight_count; + + int partition_count = read_bits(2, 11, pcb.data) + 1; + promise(partition_count > 0); + + scb.block_mode = static_cast<uint16_t>(block_mode); + scb.partition_count = static_cast<uint8_t>(partition_count); + + for (int i = 0; i < 16; i++) + { + bswapped[i] = static_cast<uint8_t>(bitrev8(pcb.data[15 - i])); + } + + int bits_for_weights = get_ise_sequence_bitcount(real_weight_count, weight_quant_method); + + int below_weights_pos = 128 - bits_for_weights; + + uint8_t indices[64]; + const auto& qat = quant_and_xfer_tables[weight_quant_method]; + + decode_ise(weight_quant_method, real_weight_count, bswapped, indices, 0); + + if (is_dual_plane) + { + for (int i = 0; i < weight_count; i++) + { + scb.weights[i] = qat.unscramble_and_unquant_map[indices[2 * i]]; + scb.weights[i + WEIGHTS_PLANE2_OFFSET] = qat.unscramble_and_unquant_map[indices[2 * i + 1]]; + } + } + else + { + for (int i = 0; i < weight_count; i++) + { + scb.weights[i] = qat.unscramble_and_unquant_map[indices[i]]; + } + } + + if (is_dual_plane && partition_count == 4) + { + scb.block_type = SYM_BTYPE_ERROR; + return; + } + + scb.color_formats_matched = 0; + + // Determine the format of each endpoint pair + int color_formats[BLOCK_MAX_PARTITIONS]; + int encoded_type_highpart_size = 0; + if (partition_count == 1) + { + color_formats[0] = read_bits(4, 13, pcb.data); + scb.partition_index = 0; + } + else + { + encoded_type_highpart_size = (3 * partition_count) - 4; + below_weights_pos -= encoded_type_highpart_size; + int encoded_type = read_bits(6, 13 + PARTITION_INDEX_BITS, pcb.data) | (read_bits(encoded_type_highpart_size, below_weights_pos, pcb.data) << 6); + int baseclass = encoded_type & 0x3; + if (baseclass == 0) + { + for (int i = 0; i < partition_count; i++) + { + color_formats[i] = (encoded_type >> 2) & 0xF; + } + + below_weights_pos += encoded_type_highpart_size; + scb.color_formats_matched = 1; + encoded_type_highpart_size = 0; + } + else + { + int bitpos = 2; + baseclass--; + + for (int i = 0; i < partition_count; i++) + { + color_formats[i] = (((encoded_type >> bitpos) & 1) + baseclass) << 2; + bitpos++; + } + + for (int i = 0; i < partition_count; i++) + { + color_formats[i] |= (encoded_type >> bitpos) & 3; + bitpos += 2; + } + } + scb.partition_index = static_cast<uint16_t>(read_bits(6, 13, pcb.data) | (read_bits(PARTITION_INDEX_BITS - 6, 19, pcb.data) << 6)); + } + + for (int i = 0; i < partition_count; i++) + { + scb.color_formats[i] = static_cast<uint8_t>(color_formats[i]); + } + + // Determine number of color endpoint integers + int color_integer_count = 0; + for (int i = 0; i < partition_count; i++) + { + int endpoint_class = color_formats[i] >> 2; + color_integer_count += (endpoint_class + 1) * 2; + } + + if (color_integer_count > 18) + { + scb.block_type = SYM_BTYPE_ERROR; + return; + } + + // Determine the color endpoint format to use + static const int color_bits_arr[5] { -1, 115 - 4, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS, 113 - 4 - PARTITION_INDEX_BITS }; + int color_bits = color_bits_arr[partition_count] - bits_for_weights - encoded_type_highpart_size; + if (is_dual_plane) + { + color_bits -= 2; + } + + if (color_bits < 0) + { + color_bits = 0; + } + + int color_quant_level = quant_mode_table[color_integer_count >> 1][color_bits]; + if (color_quant_level < QUANT_6) + { + scb.block_type = SYM_BTYPE_ERROR; + return; + } + + // Unpack the integer color values and assign to endpoints + scb.quant_mode = static_cast<quant_method>(color_quant_level); + + 
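+	// Decode the ISE-packed color values, then map them from scrambled
+	// quantized form back to unquantized 0..255 values via the lookup table.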
uint8_t values_to_decode[32];
+	decode_ise(static_cast<quant_method>(color_quant_level), color_integer_count, pcb.data,
+	           values_to_decode, (partition_count == 1 ? 17 : 19 + PARTITION_INDEX_BITS));
+
+	int valuecount_to_decode = 0;
+	const uint8_t* unpack_table = color_scrambled_pquant_to_uquant_tables[scb.quant_mode - QUANT_6];
+	for (int i = 0; i < partition_count; i++)
+	{
+		int vals = 2 * (color_formats[i] >> 2) + 2;
+		for (int j = 0; j < vals; j++)
+		{
+			scb.color_values[i][j] = unpack_table[values_to_decode[j + valuecount_to_decode]];
+		}
+		valuecount_to_decode += vals;
+	}
+
+	// Fetch the component used by the second plane of weights, if dual-plane
+	// mode is active.
+	scb.plane2_component = -1;
+	if (is_dual_plane)
+	{
+		scb.plane2_component = static_cast<int8_t>(read_bits(2, below_weights_pos - 2, pcb.data));
+	}
+}
diff --git a/thirdparty/astcenc/astcenc_vecmathlib.h b/thirdparty/astcenc/astcenc_vecmathlib.h new file mode 100644 index 0000000000..d48f1d73ea --- /dev/null +++ b/thirdparty/astcenc/astcenc_vecmathlib.h @@ -0,0 +1,570 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2022 Arm Limited
+// Copyright 2008 Jose Fonseca
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/*
+ * This module implements vector support for floats, ints, and vector lane
+ * control masks. It provides access to both explicit vector width types, and
+ * flexible N-wide types where N can be determined at compile time.
+ *
+ * The design of this module encourages use of vector length agnostic code, via
+ * the vint, vfloat, and vmask types. These will take on the widest SIMD vector
+ * width that is available at compile time. The current vector width is
+ * accessible for e.g. loop strides via the ASTCENC_SIMD_WIDTH constant.
+ *
+ * Explicit scalar types are accessible via the vint1, vfloat1, vmask1 types.
+ * These are provided primarily for prototyping and algorithm debug of VLA
+ * implementations.
+ *
+ * Explicit 4-wide types are accessible via the vint4, vfloat4, and vmask4
+ * types. These are provided for use by VLA code, but are also expected to be
+ * used as a fixed-width type and will support a reference C++ fallback for
+ * use on platforms without SIMD intrinsics.
+ *
+ * Explicit 8-wide types are accessible via the vint8, vfloat8, and vmask8
+ * types. These are provided for use by VLA code, and are not expected to be
+ * used as a fixed-width type in normal code. No reference C implementation is
+ * provided on platforms without underlying SIMD intrinsics.
+ *
+ * With the current implementation ISA support is provided for:
+ *
+ *     * 1-wide for scalar reference.
+ *     * 4-wide for Armv8-A NEON.
+ *     * 4-wide for x86-64 SSE2.
+ *     * 4-wide for x86-64 SSE4.1.
+ *     * 8-wide for x86-64 AVX2.
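+ *
+ * As an illustrative sketch (not part of the upstream documentation), a
+ * vector-length agnostic loop over n floats typically strides by the
+ * configured width; n and data are placeholder names:
+ *
+ *     unsigned int n4 = round_down_to_simd_multiple_vla(n);
+ *     for (unsigned int i = 0; i < n4; i += ASTCENC_SIMD_WIDTH)
+ *     {
+ *         vfloat v = loada(data + i);
+ *         // ... process v, then handle any 0..W-1 element tail separately
+ *     }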
+ */ + +#ifndef ASTC_VECMATHLIB_H_INCLUDED +#define ASTC_VECMATHLIB_H_INCLUDED + +#if ASTCENC_SSE != 0 || ASTCENC_AVX != 0 + #include <immintrin.h> +#elif ASTCENC_NEON != 0 + #include <arm_neon.h> +#endif + +#if !defined(__clang__) && defined(_MSC_VER) + #define ASTCENC_SIMD_INLINE __forceinline + #define ASTCENC_NO_INLINE +#elif defined(__GNUC__) && !defined(__clang__) + #define ASTCENC_SIMD_INLINE __attribute__((always_inline)) inline + #define ASTCENC_NO_INLINE __attribute__ ((noinline)) +#else + #define ASTCENC_SIMD_INLINE __attribute__((always_inline, nodebug)) inline + #define ASTCENC_NO_INLINE __attribute__ ((noinline)) +#endif + +#if ASTCENC_AVX >= 2 + /* If we have AVX2 expose 8-wide VLA. */ + #include "astcenc_vecmathlib_sse_4.h" + #include "astcenc_vecmathlib_common_4.h" + #include "astcenc_vecmathlib_avx2_8.h" + + #define ASTCENC_SIMD_WIDTH 8 + + using vfloat = vfloat8; + + #if defined(ASTCENC_NO_INVARIANCE) + using vfloatacc = vfloat8; + #else + using vfloatacc = vfloat4; + #endif + + using vint = vint8; + using vmask = vmask8; + + constexpr auto loada = vfloat8::loada; + constexpr auto load1 = vfloat8::load1; + +#elif ASTCENC_SSE >= 20 + /* If we have SSE expose 4-wide VLA, and 4-wide fixed width. */ + #include "astcenc_vecmathlib_sse_4.h" + #include "astcenc_vecmathlib_common_4.h" + + #define ASTCENC_SIMD_WIDTH 4 + + using vfloat = vfloat4; + using vfloatacc = vfloat4; + using vint = vint4; + using vmask = vmask4; + + constexpr auto loada = vfloat4::loada; + constexpr auto load1 = vfloat4::load1; + +#elif ASTCENC_NEON > 0 + /* If we have NEON expose 4-wide VLA. */ + #include "astcenc_vecmathlib_neon_4.h" + #include "astcenc_vecmathlib_common_4.h" + + #define ASTCENC_SIMD_WIDTH 4 + + using vfloat = vfloat4; + using vfloatacc = vfloat4; + using vint = vint4; + using vmask = vmask4; + + constexpr auto loada = vfloat4::loada; + constexpr auto load1 = vfloat4::load1; + +#else + // If we have nothing expose 4-wide VLA, and 4-wide fixed width. + + // Note: We no longer expose the 1-wide scalar fallback because it is not + // invariant with the 4-wide path due to algorithms that use horizontal + // operations that accumulate a local vector sum before accumulating into + // a running sum. + // + // For 4 items adding into an accumulator using 1-wide vectors the sum is: + // + // result = ((((sum + l0) + l1) + l2) + l3) + // + // ... whereas the accumulator for a 4-wide vector sum is: + // + // result = sum + ((l0 + l2) + (l1 + l3)) + // + // In "normal maths" this is the same, but the floating point reassociation + // differences mean that these will not produce the same result. + + #include "astcenc_vecmathlib_none_4.h" + #include "astcenc_vecmathlib_common_4.h" + + #define ASTCENC_SIMD_WIDTH 4 + + using vfloat = vfloat4; + using vfloatacc = vfloat4; + using vint = vint4; + using vmask = vmask4; + + constexpr auto loada = vfloat4::loada; + constexpr auto load1 = vfloat4::load1; +#endif + +/** + * @brief Round a count down to the largest multiple of 8. + * + * @param count The unrounded value. + * + * @return The rounded value. + */ +ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_8(unsigned int count) +{ + return count & static_cast<unsigned int>(~(8 - 1)); +} + +/** + * @brief Round a count down to the largest multiple of 4. + * + * @param count The unrounded value. + * + * @return The rounded value. 
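+ *
+ * For example, a count of 7 rounds down to 4.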
+ */
+ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_4(unsigned int count)
+{
+	return count & static_cast<unsigned int>(~(4 - 1));
+}
+
+/**
+ * @brief Round a count down to the largest multiple of the SIMD width.
+ *
+ * Assumes that the vector width is a power of two.
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_down_to_simd_multiple_vla(unsigned int count)
+{
+	return count & static_cast<unsigned int>(~(ASTCENC_SIMD_WIDTH - 1));
+}
+
+/**
+ * @brief Round a count up to the smallest multiple of the SIMD width.
+ *
+ * Assumes that the vector width is a power of two. For example, with a SIMD
+ * width of 4 a count of 5 rounds up to 8.
+ *
+ * @param count   The unrounded value.
+ *
+ * @return The rounded value.
+ */
+ASTCENC_SIMD_INLINE unsigned int round_up_to_simd_multiple_vla(unsigned int count)
+{
+	unsigned int multiples = (count + ASTCENC_SIMD_WIDTH - 1) / ASTCENC_SIMD_WIDTH;
+	return multiples * ASTCENC_SIMD_WIDTH;
+}
+
+/**
+ * @brief Return @c a with lanes negated if the @c b lane is negative.
+ */
+ASTCENC_SIMD_INLINE vfloat change_sign(vfloat a, vfloat b)
+{
+	vint ia = float_as_int(a);
+	vint ib = float_as_int(b);
+	vint sign_mask(static_cast<int>(0x80000000));
+	vint r = ia ^ (ib & sign_mask);
+	return int_as_float(r);
+}
+
+/**
+ * @brief Return fast, but approximate, vector atan(x).
+ *
+ * Max error of this implementation is 0.004883.
+ */
+ASTCENC_SIMD_INLINE vfloat atan(vfloat x)
+{
+	vmask c = abs(x) > vfloat(1.0f);
+	vfloat z = change_sign(vfloat(astc::PI_OVER_TWO), x);
+	vfloat y = select(x, vfloat(1.0f) / x, c);
+	y = y / (y * y * vfloat(0.28f) + vfloat(1.0f));
+	return select(y, z - y, c);
+}
+
+/**
+ * @brief Return fast, but approximate, vector atan2(y, x).
+ */
+ASTCENC_SIMD_INLINE vfloat atan2(vfloat y, vfloat x)
+{
+	vfloat z = atan(abs(y / x));
+	vmask xmask = vmask(float_as_int(x).m);
+	return change_sign(select_msb(z, vfloat(astc::PI) - z, xmask), y);
+}
+
+/**
+ * @brief Factory that returns a unit length 4 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit4()
+{
+	return vfloat4(0.5f);
+}
+
+/**
+ * @brief Factory that returns a unit length 3 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit3()
+{
+	float val = 0.577350258827209473f;
+	return vfloat4(val, val, val, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a unit length 2 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 unit2()
+{
+	float val = 0.707106769084930420f;
+	return vfloat4(val, val, 0.0f, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a 3 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 vfloat3(float a, float b, float c)
+{
+	return vfloat4(a, b, c, 0.0f);
+}
+
+/**
+ * @brief Factory that returns a 2 component vfloat4.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 vfloat2(float a, float b)
+{
+	return vfloat4(a, b, 0.0f, 0.0f);
+}
+
+/**
+ * @brief Normalize a non-zero length vector to unit length.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 normalize(vfloat4 a)
+{
+	vfloat4 length = dot(a, a);
+	return a / sqrt(length);
+}
+
+/**
+ * @brief Normalize a vector, returning @c safe if the length is zero.
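+ *
+ * Note that only lane 0 of the squared length is tested against zero.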
+ */
+static ASTCENC_SIMD_INLINE vfloat4 normalize_safe(vfloat4 a, vfloat4 safe)
+{
+	vfloat4 length = dot(a, a);
+	if (length.lane<0>() != 0.0f)
+	{
+		return a / sqrt(length);
+	}
+
+	return safe;
+}
+
+
+
+#define POLY0(x, c0) (c0)
+#define POLY1(x, c0, c1) ((POLY0(x, c1) * x) + c0)
+#define POLY2(x, c0, c1, c2) ((POLY1(x, c1, c2) * x) + c0)
+#define POLY3(x, c0, c1, c2, c3) ((POLY2(x, c1, c2, c3) * x) + c0)
+#define POLY4(x, c0, c1, c2, c3, c4) ((POLY3(x, c1, c2, c3, c4) * x) + c0)
+#define POLY5(x, c0, c1, c2, c3, c4, c5) ((POLY4(x, c1, c2, c3, c4, c5) * x) + c0)
+
+/**
+ * @brief Compute an approximate exp2(x) for each lane in the vector.
+ *
+ * Based on 5th degree minimax polynomials, ported from this blog
+ * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+ */
+static ASTCENC_SIMD_INLINE vfloat4 exp2(vfloat4 x)
+{
+	x = clamp(-126.99999f, 129.0f, x);
+
+	vint4 ipart = float_to_int(x - 0.5f);
+	vfloat4 fpart = x - int_to_float(ipart);
+
+	// Integer contrib, using 1 << ipart
+	vfloat4 iexp = int_as_float(lsl<23>(ipart + 127));
+
+	// Fractional contrib, using polynomial fit of 2^x in range [-0.5, 0.5)
+	vfloat4 fexp = POLY5(fpart,
+	                     9.9999994e-1f,
+	                     6.9315308e-1f,
+	                     2.4015361e-1f,
+	                     5.5826318e-2f,
+	                     8.9893397e-3f,
+	                     1.8775767e-3f);
+
+	return iexp * fexp;
+}
+
+/**
+ * @brief Compute an approximate log2(x) for each lane in the vector.
+ *
+ * Based on 5th degree minimax polynomials, ported from this blog
+ * https://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html
+ */
+static ASTCENC_SIMD_INLINE vfloat4 log2(vfloat4 x)
+{
+	vint4 exp(0x7F800000);
+	vint4 mant(0x007FFFFF);
+	vint4 one(0x3F800000);
+
+	vint4 i = float_as_int(x);
+
+	vfloat4 e = int_to_float(lsr<23>(i & exp) - 127);
+
+	vfloat4 m = int_as_float((i & mant) | one);
+
+	// Polynomial fit of log2(x)/(x - 1), for x in range [1, 2)
+	vfloat4 p = POLY4(m,
+	                  2.8882704548164776201f,
+	                  -2.52074962577807006663f,
+	                  1.48116647521213171641f,
+	                  -0.465725644288844778798f,
+	                  0.0596515482674574969533f);
+
+	// Increases the polynomial degree, but ensures that log2(1) == 0
+	p = p * (m - 1.0f);
+
+	return p + e;
+}
+
+/**
+ * @brief Compute an approximate pow(x, y) for each lane in the vector.
+ *
+ * Power function based on the exp2(log2(x) * y) transform.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 pow(vfloat4 x, vfloat4 y)
+{
+	vmask4 zero_mask = y == vfloat4(0.0f);
+	vfloat4 estimate = exp2(log2(x) * y);
+
+	// Guarantee that y == 0 returns exactly 1.0f
+	return select(estimate, vfloat4(1.0f), zero_mask);
+}
+
+/**
+ * @brief Count the leading zeros for each lane in @c a.
+ *
+ * Valid for all data values of @c a; will return a per-lane value [0, 32].
+ */
+static ASTCENC_SIMD_INLINE vint4 clz(vint4 a)
+{
+	// This function is a horrible abuse of floating point exponents to convert
+	// the original integer value into a 2^N encoding we can recover easily.
+
+	// Convert to float without risk of rounding up by keeping only top 8 bits.
+	// This trick is guaranteed to keep top 8 bits and clear the 9th.
+	a = (~lsr<8>(a)) & a;
+	a = float_as_int(int_to_float(a));
+
+	// Extract and unbias exponent
+	a = vint4(127 + 31) - lsr<23>(a);
+
+	// Clamp result to a valid 32-bit range
+	return clamp(0, 32, a);
+}
+
+/**
+ * @brief Return lanewise 2^a for each lane in @c a.
+ *
+ * Use of a signed int means that this is only valid for values in range [0, 30].
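+ *
+ * For example, an input lane value of 3 yields 8 in that lane.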
+ */
+static ASTCENC_SIMD_INLINE vint4 two_to_the_n(vint4 a)
+{
+	// 2^30 is the largest power of two that can be represented in a signed int
+	assert(all(a < vint4(31)));
+
+	// This function is a horrible abuse of floating point to use the exponent
+	// and float conversion to generate a 2^N multiple.
+
+	// Bias the exponent
+	vint4 exp = a + 127;
+	exp = lsl<23>(exp);
+
+	// Reinterpret the bits as a float, and then convert to an int
+	vfloat4 f = int_as_float(exp);
+	return float_to_int(f);
+}
+
+/**
+ * @brief Convert unorm16 [0, 65535] to float16 in range [0, 1].
+ */
+static ASTCENC_SIMD_INLINE vint4 unorm16_to_sf16(vint4 p)
+{
+	vint4 fp16_one = vint4(0x3C00);
+	vint4 fp16_small = lsl<8>(p);
+
+	vmask4 is_one = p == vint4(0xFFFF);
+	vmask4 is_small = p < vint4(4);
+
+	// Manually inline clz() on Visual Studio to avoid release build codegen bug
+	// see https://github.com/ARM-software/astc-encoder/issues/259
+#if !defined(__clang__) && defined(_MSC_VER)
+	vint4 a = (~lsr<8>(p)) & p;
+	a = float_as_int(int_to_float(a));
+	a = vint4(127 + 31) - lsr<23>(a);
+	vint4 lz = clamp(0, 32, a) - 16;
+#else
+	vint4 lz = clz(p) - 16;
+#endif
+
+	p = p * two_to_the_n(lz + 1);
+	p = p & vint4(0xFFFF);
+
+	p = lsr<6>(p);
+
+	p = p | lsl<10>(vint4(14) - lz);
+
+	vint4 r = select(p, fp16_one, is_one);
+	r = select(r, fp16_small, is_small);
+	return r;
+}
+
+/**
+ * @brief Convert 16-bit LNS to float16.
+ */
+static ASTCENC_SIMD_INLINE vint4 lns_to_sf16(vint4 p)
+{
+	vint4 mc = p & 0x7FF;
+	vint4 ec = lsr<11>(p);
+
+	vint4 mc_512 = mc * 3;
+	vmask4 mask_512 = mc < vint4(512);
+
+	vint4 mc_1536 = mc * 4 - 512;
+	vmask4 mask_1536 = mc < vint4(1536);
+
+	vint4 mc_else = mc * 5 - 2048;
+
+	vint4 mt = mc_else;
+	mt = select(mt, mc_1536, mask_1536);
+	mt = select(mt, mc_512, mask_512);
+
+	vint4 res = lsl<10>(ec) | lsr<3>(mt);
+	return min(res, vint4(0x7BFF));
+}
+
+/**
+ * @brief Extract mantissa and exponent of a float value.
+ *
+ * @param      a      The input value.
+ * @param[out] exp    The output exponent.
+ *
+ * @return The mantissa.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 frexp(vfloat4 a, vint4& exp)
+{
+	// Interpret the bits as an integer
+	vint4 ai = float_as_int(a);
+
+	// Extract and unbias the exponent
+	exp = (lsr<23>(ai) & 0xFF) - 126;
+
+	// Extract and unbias the mantissa
+	vint4 manti = (ai & static_cast<int>(0x807FFFFF)) | 0x3F000000;
+	return int_as_float(manti);
+}
+
+/**
+ * @brief Convert float to 16-bit LNS.
+ */
+static ASTCENC_SIMD_INLINE vfloat4 float_to_lns(vfloat4 a)
+{
+	vint4 exp;
+	vfloat4 mant = frexp(a, exp);
+
+	// Do these early before we start messing about ...
+	vmask4 mask_underflow_nan = ~(a > vfloat4(1.0f / 67108864.0f));
+	vmask4 mask_infinity = a >= vfloat4(65536.0f);
+
+	// If input is smaller than 2^-14, multiply by 2^25 and don't bias.
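+	// (frexp returns a mantissa in [0.5, 1), so exp < -13 means the input
+	// value itself is below 2^-14; the 33554432.0f factor below is 2^25.)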
+	vmask4 exp_lt_m13 = exp < vint4(-13);
+
+	vfloat4 a1a = a * 33554432.0f;
+	vint4 expa = vint4::zero();
+
+	vfloat4 a1b = (mant - 0.5f) * 4096;
+	vint4 expb = exp + 14;
+
+	a = select(a1b, a1a, exp_lt_m13);
+	exp = select(expb, expa, exp_lt_m13);
+
+	vmask4 a_lt_384 = a < vfloat4(384.0f);
+	vmask4 a_lt_1408 = a <= vfloat4(1408.0f);
+
+	vfloat4 a2a = a * (4.0f / 3.0f);
+	vfloat4 a2b = a + 128.0f;
+	vfloat4 a2c = (a + 512.0f) * (4.0f / 5.0f);
+
+	a = a2c;
+	a = select(a, a2b, a_lt_1408);
+	a = select(a, a2a, a_lt_384);
+
+	a = a + (int_to_float(exp) * 2048.0f) + 1.0f;
+
+	a = select(a, vfloat4(65535.0f), mask_infinity);
+	a = select(a, vfloat4::zero(), mask_underflow_nan);
+
+	return a;
+}
+
+namespace astc
+{
+
+static ASTCENC_SIMD_INLINE float pow(float x, float y)
+{
+	return pow(vfloat4(x), vfloat4(y)).lane<0>();
+}
+
+}
+
+#endif // #ifndef ASTC_VECMATHLIB_H_INCLUDED
diff --git a/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h b/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h new file mode 100644 index 0000000000..a785aca75b --- /dev/null +++ b/thirdparty/astcenc/astcenc_vecmathlib_avx2_8.h @@ -0,0 +1,1204 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2019-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+/**
+ * @brief 8x32-bit vectors, implemented using AVX2.
+ *
+ * This module implements 8-wide 32-bit float, int, and mask vectors for x86
+ * AVX2.
+ *
+ * There is a baseline level of functionality provided by all vector widths and
+ * implementations. This is implemented using identical function signatures,
+ * modulo data type, so we can use them as substitutable implementations in VLA
+ * code.
+ */
+
+#ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
+#define ASTC_VECMATHLIB_AVX2_8_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <cstdio>
+
+// Define convenience intrinsics that are missing on older compilers
+#define astcenc_mm256_set_m128i(m, n) _mm256_insertf128_si256(_mm256_castsi128_si256((n)), (m), 1)
+
+// ============================================================================
+// vfloat8 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 8-wide floats.
+ */
+struct vfloat8
+{
+	/**
+	 * @brief Construct an uninitialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat8() = default;
+
+	/**
+	 * @brief Construct from 8 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() instead, which is better with vectors if the
+	 * data is aligned to the vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat8(const float *p)
+	{
+		m = _mm256_loadu_ps(p);
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
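+	 *
+	 * For example, vfloat8(1.0f) sets all eight lanes to 1.0f.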
+ */ + ASTCENC_SIMD_INLINE explicit vfloat8(float a) + { + m = _mm256_set1_ps(a); + } + + /** + * @brief Construct from 8 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat8( + float a, float b, float c, float d, + float e, float f, float g, float h) + { + m = _mm256_set_ps(h, g, f, e, d, c, b, a); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat8(__m256 a) + { + m = a; + } + + /** + * @brief Get the scalar value of a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE float lane() const + { + #if !defined(__clang__) && defined(_MSC_VER) + return m.m256_f32[l]; + #else + union { __m256 m; float f[8]; } cvt; + cvt.m = m; + return cvt.f[l]; + #endif + } + + /** + * @brief Factory that returns a vector of zeros. + */ + static ASTCENC_SIMD_INLINE vfloat8 zero() + { + return vfloat8(_mm256_setzero_ps()); + } + + /** + * @brief Factory that returns a replicated scalar loaded from memory. + */ + static ASTCENC_SIMD_INLINE vfloat8 load1(const float* p) + { + return vfloat8(_mm256_broadcast_ss(p)); + } + + /** + * @brief Factory that returns a vector loaded from 32B aligned memory. + */ + static ASTCENC_SIMD_INLINE vfloat8 loada(const float* p) + { + return vfloat8(_mm256_load_ps(p)); + } + + /** + * @brief Factory that returns a vector containing the lane IDs. + */ + static ASTCENC_SIMD_INLINE vfloat8 lane_id() + { + return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0)); + } + + /** + * @brief The vector ... + */ + __m256 m; +}; + +// ============================================================================ +// vint8 data type +// ============================================================================ + +/** + * @brief Data type for 8-wide ints. + */ +struct vint8 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vint8() = default; + + /** + * @brief Construct from 8 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vint8(const int *p) + { + m = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p)); + } + + /** + * @brief Construct from 8 uint8_t loaded from an unaligned address. + */ + ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p) + { + // _mm_loadu_si64 would be nicer syntax, but missing on older GCC + m = _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(*reinterpret_cast<const long long*>(p))); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using vfloat4::zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vint8(int a) + { + m = _mm256_set1_epi32(a); + } + + /** + * @brief Construct from 8 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vint8( + int a, int b, int c, int d, + int e, int f, int g, int h) + { + m = _mm256_set_epi32(h, g, f, e, d, c, b, a); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vint8(__m256i a) + { + m = a; + } + + /** + * @brief Get the scalar from a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE int lane() const + { + #if !defined(__clang__) && defined(_MSC_VER) + return m.m256i_i32[l]; + #else + union { __m256i m; int f[8]; } cvt; + cvt.m = m; + return cvt.f[l]; + #endif + } + + /** + * @brief Factory that returns a vector of zeros. 
+ */ + static ASTCENC_SIMD_INLINE vint8 zero() + { + return vint8(_mm256_setzero_si256()); + } + + /** + * @brief Factory that returns a replicated scalar loaded from memory. + */ + static ASTCENC_SIMD_INLINE vint8 load1(const int* p) + { + __m128i a = _mm_set1_epi32(*p); + return vint8(_mm256_broadcastd_epi32(a)); + } + + /** + * @brief Factory that returns a vector loaded from 32B aligned memory. + */ + static ASTCENC_SIMD_INLINE vint8 loada(const int* p) + { + return vint8(_mm256_load_si256(reinterpret_cast<const __m256i*>(p))); + } + + /** + * @brief Factory that returns a vector containing the lane IDs. + */ + static ASTCENC_SIMD_INLINE vint8 lane_id() + { + return vint8(_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0)); + } + + /** + * @brief The vector ... + */ + __m256i m; +}; + +// ============================================================================ +// vmask8 data type +// ============================================================================ + +/** + * @brief Data type for 8-wide control plane masks. + */ +struct vmask8 +{ + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask8(__m256 a) + { + m = a; + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask8(__m256i a) + { + m = _mm256_castsi256_ps(a); + } + + /** + * @brief Construct from 1 scalar value. + */ + ASTCENC_SIMD_INLINE explicit vmask8(bool a) + { + vint8 mask(a == false ? 0 : -1); + m = _mm256_castsi256_ps(mask.m); + } + + /** + * @brief The vector ... + */ + __m256 m; +}; + +// ============================================================================ +// vmask8 operators and functions +// ============================================================================ + +/** + * @brief Overload: mask union (or). + */ +ASTCENC_SIMD_INLINE vmask8 operator|(vmask8 a, vmask8 b) +{ + return vmask8(_mm256_or_ps(a.m, b.m)); +} + +/** + * @brief Overload: mask intersect (and). + */ +ASTCENC_SIMD_INLINE vmask8 operator&(vmask8 a, vmask8 b) +{ + return vmask8(_mm256_and_ps(a.m, b.m)); +} + +/** + * @brief Overload: mask difference (xor). + */ +ASTCENC_SIMD_INLINE vmask8 operator^(vmask8 a, vmask8 b) +{ + return vmask8(_mm256_xor_ps(a.m, b.m)); +} + +/** + * @brief Overload: mask invert (not). + */ +ASTCENC_SIMD_INLINE vmask8 operator~(vmask8 a) +{ + return vmask8(_mm256_xor_si256(_mm256_castps_si256(a.m), _mm256_set1_epi32(-1))); +} + +/** + * @brief Return a 8-bit mask code indicating mask status. + * + * bit0 = lane 0 + */ +ASTCENC_SIMD_INLINE unsigned int mask(vmask8 a) +{ + return static_cast<unsigned int>(_mm256_movemask_ps(a.m)); +} + +/** + * @brief True if any lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool any(vmask8 a) +{ + return mask(a) != 0; +} + +/** + * @brief True if all lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool all(vmask8 a) +{ + return mask(a) == 0xFF; +} + +// ============================================================================ +// vint8 operators and functions +// ============================================================================ +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vint8 operator+(vint8 a, vint8 b) +{ + return vint8(_mm256_add_epi32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector incremental addition. + */ +ASTCENC_SIMD_INLINE vint8& operator+=(vint8& a, const vint8& b) +{ + a = a + b; + return a; +} + +/** + * @brief Overload: vector by vector subtraction. 
+ */
+ASTCENC_SIMD_INLINE vint8 operator-(vint8 a, vint8 b)
+{
+ return vint8(_mm256_sub_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint8 operator*(vint8 a, vint8 b)
+{
+ return vint8(_mm256_mullo_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint8 operator~(vint8 a)
+{
+ return vint8(_mm256_xor_si256(a.m, _mm256_set1_epi32(-1)));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint8 operator|(vint8 a, vint8 b)
+{
+ return vint8(_mm256_or_si256(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint8 operator&(vint8 a, vint8 b)
+{
+ return vint8(_mm256_and_si256(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint8 operator^(vint8 a, vint8 b)
+{
+ return vint8(_mm256_xor_si256(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator==(vint8 a, vint8 b)
+{
+ return vmask8(_mm256_cmpeq_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator!=(vint8 a, vint8 b)
+{
+ return ~vmask8(_mm256_cmpeq_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator<(vint8 a, vint8 b)
+{
+ return vmask8(_mm256_cmpgt_epi32(b.m, a.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask8 operator>(vint8 a, vint8 b)
+{
+ return vmask8(_mm256_cmpgt_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsl(vint8 a)
+{
+ return vint8(_mm256_slli_epi32(a.m, s));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 asr(vint8 a)
+{
+ return vint8(_mm256_srai_epi32(a.m, s));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint8 lsr(vint8 a)
+{
+ return vint8(_mm256_srli_epi32(a.m, s));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 min(vint8 a, vint8 b)
+{
+ return vint8(_mm256_min_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint8 max(vint8 a, vint8 b)
+{
+ return vint8(_mm256_max_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
+{
+ __m128i m = _mm_min_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
+ m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
+ m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
+ m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+
+ __m256i r = astcenc_mm256_set_m128i(m, m);
+ vint8 vmin(r);
+ return vmin;
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
+{
+ __m128i m = _mm_max_epi32(_mm256_extracti128_si256(a.m, 0), _mm256_extracti128_si256(a.m, 1));
+ m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,3,2)));
+ m = _mm_max_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,1)));
+ m = _mm_shuffle_epi32(m, _MM_SHUFFLE(0,0,0,0));
+
+ __m256i r = astcenc_mm256_set_m128i(m, m);
+ vint8 vmax(r);
+ return vmax;
+}
+
+/**
+ * @brief Store a vector to a 32B aligned memory address.
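+ *
+ * An illustrative usage sketch; the aligned store maps to
+ * _mm256_store_si256, so the full 32B vector alignment is required:
+ *
+ * @code
+ *   alignas(ASTCENC_VECALIGN) int buf[8];
+ *   storea(vint8::lane_id(), buf); // buf = { 0, 1, 2, 3, 4, 5, 6, 7 }
+ * @endcode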
+ */
+ASTCENC_SIMD_INLINE void storea(vint8 a, int* p)
+{
+ _mm256_store_si256(reinterpret_cast<__m256i*>(p), a.m);
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint8 a, int* p)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.m);
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
+{
+ // This is the most logical implementation, but the convenience intrinsic
+ // is missing on older compilers (supported in g++ 9 and clang++ 9).
+ // _mm_storeu_si64(ptr, _mm256_extracti128_si256(v.m, 0))
+ _mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
+}
+
+/**
+ * @brief Gather N (vector width) indices from the array.
+ */
+ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
+{
+ return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
+}
+
+/**
+ * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ */
+ASTCENC_SIMD_INLINE vint8 pack_low_bytes(vint8 v)
+{
+ __m256i shuf = _mm256_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 28, 24, 20, 16,
+ 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 12, 8, 4, 0);
+ __m256i a = _mm256_shuffle_epi8(v.m, shuf);
+ __m128i a0 = _mm256_extracti128_si256(a, 0);
+ __m128i a1 = _mm256_extracti128_si256(a, 1);
+ __m128i b = _mm_unpacklo_epi32(a0, a1);
+
+ __m256i r = astcenc_mm256_set_m128i(b, b);
+ return vint8(r);
+}
+
+/**
+ * @brief Return lanes from @c b if @c cond is set, else @c a.
+ */
+ASTCENC_SIMD_INLINE vint8 select(vint8 a, vint8 b, vmask8 cond)
+{
+ __m256i condi = _mm256_castps_si256(cond.m);
+ return vint8(_mm256_blendv_epi8(a.m, b.m, condi));
+}
+
+// ============================================================================
+// vfloat8 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator+(vfloat8 a, vfloat8 b)
+{
+ return vfloat8(_mm256_add_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector incremental addition.
+ */
+ASTCENC_SIMD_INLINE vfloat8& operator+=(vfloat8& a, const vfloat8& b)
+{
+ a = a + b;
+ return a;
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator-(vfloat8 a, vfloat8 b)
+{
+ return vfloat8(_mm256_sub_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, vfloat8 b)
+{
+ return vfloat8(_mm256_mul_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(vfloat8 a, float b)
+{
+ return vfloat8(_mm256_mul_ps(a.m, _mm256_set1_ps(b)));
+}
+
+/**
+ * @brief Overload: scalar by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator*(float a, vfloat8 b)
+{
+ return vfloat8(_mm256_mul_ps(_mm256_set1_ps(a), b.m));
+}
+
+/**
+ * @brief Overload: vector by vector division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, vfloat8 b)
+{
+ return vfloat8(_mm256_div_ps(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by scalar division.
+ */
+ASTCENC_SIMD_INLINE vfloat8 operator/(vfloat8 a, float b)
+{
+ return vfloat8(_mm256_div_ps(a.m, _mm256_set1_ps(b)));
+}
+
+
+/**
+ * @brief Overload: scalar by vector division.
+ */ +ASTCENC_SIMD_INLINE vfloat8 operator/(float a, vfloat8 b) +{ + return vfloat8(_mm256_div_ps(_mm256_set1_ps(a), b.m)); +} + + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask8 operator==(vfloat8 a, vfloat8 b) +{ + return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_EQ_OQ)); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask8 operator!=(vfloat8 a, vfloat8 b) +{ + return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_NEQ_OQ)); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask8 operator<(vfloat8 a, vfloat8 b) +{ + return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LT_OQ)); +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask8 operator>(vfloat8 a, vfloat8 b) +{ + return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GT_OQ)); +} + +/** + * @brief Overload: vector by vector less than or equal. + */ +ASTCENC_SIMD_INLINE vmask8 operator<=(vfloat8 a, vfloat8 b) +{ + return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_LE_OQ)); +} + +/** + * @brief Overload: vector by vector greater than or equal. + */ +ASTCENC_SIMD_INLINE vmask8 operator>=(vfloat8 a, vfloat8 b) +{ + return vmask8(_mm256_cmp_ps(a.m, b.m, _CMP_GE_OQ)); +} + +/** + * @brief Return the min vector of two vectors. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, vfloat8 b) +{ + return vfloat8(_mm256_min_ps(a.m, b.m)); +} + +/** + * @brief Return the min vector of a vector and a scalar. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 min(vfloat8 a, float b) +{ + return min(a, vfloat8(b)); +} + +/** + * @brief Return the max vector of two vectors. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, vfloat8 b) +{ + return vfloat8(_mm256_max_ps(a.m, b.m)); +} + +/** + * @brief Return the max vector of a vector and a scalar. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 max(vfloat8 a, float b) +{ + return max(a, vfloat8(b)); +} + +/** + * @brief Return the clamped value between min and max. + * + * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN + * then @c min will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a) +{ + // Do not reorder - second operand will return if either is NaN + a.m = _mm256_max_ps(a.m, _mm256_set1_ps(min)); + a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max)); + return a; +} + +/** + * @brief Return a clamped value between 0.0f and max. + * + * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will + * be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a) +{ + a.m = _mm256_max_ps(a.m, _mm256_setzero_ps()); + a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max)); + return a; +} + +/** + * @brief Return a clamped value between 0.0f and 1.0f. + * + * If @c a is NaN then zero will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat8 clampzo(vfloat8 a) +{ + a.m = _mm256_max_ps(a.m, _mm256_setzero_ps()); + a.m = _mm256_min_ps(a.m, _mm256_set1_ps(1.0f)); + return a; +} + +/** + * @brief Return the absolute value of the float vector. 
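+ *
+ * Implemented by masking off the IEEE 754 sign bit with a bitwise AND, so
+ * the operation is branch-free and maps -0.0f to +0.0f.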
+ */ +ASTCENC_SIMD_INLINE vfloat8 abs(vfloat8 a) +{ + __m256 msk = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)); + return vfloat8(_mm256_and_ps(a.m, msk)); +} + +/** + * @brief Return a float rounded to the nearest integer value. + */ +ASTCENC_SIMD_INLINE vfloat8 round(vfloat8 a) +{ + constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + return vfloat8(_mm256_round_ps(a.m, flags)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a) +{ + __m128 vlow = _mm256_castps256_ps128(a.m); + __m128 vhigh = _mm256_extractf128_ps(a.m, 1); + vlow = _mm_min_ps(vlow, vhigh); + + // First do an horizontal reduction. + __m128 shuf = _mm_shuffle_ps(vlow, vlow, _MM_SHUFFLE(2, 3, 0, 1)); + __m128 mins = _mm_min_ps(vlow, shuf); + shuf = _mm_movehl_ps(shuf, mins); + mins = _mm_min_ss(mins, shuf); + + // This is the most logical implementation, but the convenience intrinsic + // is missing on older compilers (supported in g++ 9 and clang++ 9). + //__m256i r = _mm256_set_m128(m, m) + __m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(mins), mins, 1); + + return vfloat8(_mm256_permute_ps(r, 0)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a) +{ + return hmin(a).lane<0>(); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a) +{ + __m128 vlow = _mm256_castps256_ps128(a.m); + __m128 vhigh = _mm256_extractf128_ps(a.m, 1); + vhigh = _mm_max_ps(vlow, vhigh); + + // First do an horizontal reduction. + __m128 shuf = _mm_shuffle_ps(vhigh, vhigh, _MM_SHUFFLE(2, 3, 0, 1)); + __m128 maxs = _mm_max_ps(vhigh, shuf); + shuf = _mm_movehl_ps(shuf,maxs); + maxs = _mm_max_ss(maxs, shuf); + + // This is the most logical implementation, but the convenience intrinsic + // is missing on older compilers (supported in g++ 9 and clang++ 9). + //__m256i r = _mm256_set_m128(m, m) + __m256 r = _mm256_insertf128_ps(_mm256_castps128_ps256(maxs), maxs, 1); + return vfloat8(_mm256_permute_ps(r, 0)); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a) +{ + return hmax(a).lane<0>(); +} + +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE float hadd_s(vfloat8 a) +{ + // Two sequential 4-wide adds gives invariance with 4-wide code + vfloat4 lo(_mm256_extractf128_ps(a.m, 0)); + vfloat4 hi(_mm256_extractf128_ps(a.m, 1)); + return hadd_s(lo) + hadd_s(hi); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat8 select(vfloat8 a, vfloat8 b, vmask8 cond) +{ + return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat8 select_msb(vfloat8 a, vfloat8 b, vmask8 cond) +{ + return vfloat8(_mm256_blendv_ps(a.m, b.m, cond.m)); +} + +/** + * @brief Accumulate lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a) +{ + vfloat4 lo(_mm256_extractf128_ps(a.m, 0)); + haccumulate(accum, lo); + + vfloat4 hi(_mm256_extractf128_ps(a.m, 1)); + haccumulate(accum, hi); +} + +/** + * @brief Accumulate lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. 
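+ *
+ * "Invariant" here means bit-exact equality with the reduction order used
+ * by 4-wide builds. This overload keeps eight partial sums, so the final
+ * float result may round differently; prefer the folded 4-wide overload
+ * above when cross-ISA reproducibility matters.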
+ */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a) +{ + accum += a; +} + +/** + * @brief Accumulate masked lane-wise sums for a vector, folded 4-wide. + * + * This is invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat8 a, vmask8 m) +{ + a = select(vfloat8::zero(), a, m); + haccumulate(accum, a); +} + +/** + * @brief Accumulate masked lane-wise sums for a vector. + * + * This is NOT invariant with 4-wide implementations. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat8& accum, vfloat8 a, vmask8 m) +{ + a = select(vfloat8::zero(), a, m); + haccumulate(accum, a); +} + +/** + * @brief Return the sqrt of the lanes in the vector. + */ +ASTCENC_SIMD_INLINE vfloat8 sqrt(vfloat8 a) +{ + return vfloat8(_mm256_sqrt_ps(a.m)); +} + +/** + * @brief Load a vector of gathered results from an array; + */ +ASTCENC_SIMD_INLINE vfloat8 gatherf(const float* base, vint8 indices) +{ + return vfloat8(_mm256_i32gather_ps(base, indices.m, 4)); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vfloat8 a, float* p) +{ + _mm256_storeu_ps(p, a.m); +} + +/** + * @brief Store a vector to a 32B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vfloat8 a, float* p) +{ + _mm256_store_ps(p, a.m); +} + +/** + * @brief Return a integer value for a float vector, using truncation. + */ +ASTCENC_SIMD_INLINE vint8 float_to_int(vfloat8 a) +{ + return vint8(_mm256_cvttps_epi32(a.m)); +} + +/** + * @brief Return a integer value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint8 float_to_int_rtn(vfloat8 a) +{ + a = round(a); + return vint8(_mm256_cvttps_epi32(a.m)); +} + + +/** + * @brief Return a float value for an integer vector. + */ +ASTCENC_SIMD_INLINE vfloat8 int_to_float(vint8 a) +{ + return vfloat8(_mm256_cvtepi32_ps(a.m)); +} + +/** + * @brief Return a float value as an integer bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the first half of that flip. + */ +ASTCENC_SIMD_INLINE vint8 float_as_int(vfloat8 a) +{ + return vint8(_mm256_castps_si256(a.m)); +} + +/** + * @brief Return a integer value as a float bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the second half of that flip. + */ +ASTCENC_SIMD_INLINE vfloat8 int_as_float(vint8 a) +{ + return vfloat8(_mm256_castsi256_ps(a.m)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint8& t0p) +{ + // AVX2 duplicates the table within each 128-bit lane + __m128i t0n = t0.m; + t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint8& t0p, vint8& t1p) +{ + // AVX2 duplicates the table within each 128-bit lane + __m128i t0n = t0.m; + t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); + + __m128i t1n = _mm_xor_si128(t0.m, t1.m); + t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. 
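+ *
+ * The multi-table variants store t1 onwards as XOR deltas against the
+ * previous table. _mm256_shuffle_epi8 only honors the low four index bits
+ * within each 128-bit lane, so an index of 16 + i aliases back to entry i
+ * of the earlier table; XOR-ing in the delta table's result cancels the
+ * aliased value and reconstructs the intended entry (see the
+ * vtable_8bt_32bi() overloads below).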
+ */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vint4 t0, vint4 t1, vint4 t2, vint4 t3, + vint8& t0p, vint8& t1p, vint8& t2p, vint8& t3p) +{ + // AVX2 duplicates the table within each 128-bit lane + __m128i t0n = t0.m; + t0p = vint8(astcenc_mm256_set_m128i(t0n, t0n)); + + __m128i t1n = _mm_xor_si128(t0.m, t1.m); + t1p = vint8(astcenc_mm256_set_m128i(t1n, t1n)); + + __m128i t2n = _mm_xor_si128(t1.m, t2.m); + t2p = vint8(astcenc_mm256_set_m128i(t2n, t2n)); + + __m128i t3n = _mm_xor_si128(t2.m, t3.m); + t3p = vint8(astcenc_mm256_set_m128i(t3n, t3n)); +} + +/** + * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + */ +ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 idx) +{ + // Set index byte MSB to 1 for unused bytes so shuffle returns zero + __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00))); + + __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + return vint8(result); +} + +/** + * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + */ +ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 idx) +{ + // Set index byte MSB to 1 for unused bytes so shuffle returns zero + __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00))); + + __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); + + __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx); + result = _mm256_xor_si256(result, result2); + return vint8(result); +} + +/** + * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + */ +ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3, vint8 idx) +{ + // Set index byte MSB to 1 for unused bytes so shuffle returns zero + __m256i idxx = _mm256_or_si256(idx.m, _mm256_set1_epi32(static_cast<int>(0xFFFFFF00))); + + __m256i result = _mm256_shuffle_epi8(t0.m, idxx); + idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); + + __m256i result2 = _mm256_shuffle_epi8(t1.m, idxx); + result = _mm256_xor_si256(result, result2); + idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); + + result2 = _mm256_shuffle_epi8(t2.m, idxx); + result = _mm256_xor_si256(result, result2); + idxx = _mm256_sub_epi8(idxx, _mm256_set1_epi8(16)); + + result2 = _mm256_shuffle_epi8(t3.m, idxx); + result = _mm256_xor_si256(result, result2); + + return vint8(result); +} + +/** + * @brief Return a vector of interleaved RGBA data. + * + * Input vectors have the value stored in the bottom 8 bits of each lane, + * with high bits set to zero. + * + * Output vector stores a single RGBA texel packed in each lane. + */ +ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a) +{ + return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a); +} + +/** + * @brief Store a vector, skipping masked lanes. + * + * All masked lanes must be at the end of vector, after all non-masked lanes. + */ +ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint8 data, vmask8 mask) +{ + _mm256_maskstore_epi32(base, _mm256_castps_si256(mask.m), data.m); +} + +/** + * @brief Debug function to print a vector of ints. + */ +ASTCENC_SIMD_INLINE void print(vint8 a) +{ + alignas(ASTCENC_VECALIGN) int v[8]; + storea(a, v); + printf("v8_i32:\n %8d %8d %8d %8d %8d %8d %8d %8d\n", + v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); +} + +/** + * @brief Debug function to print a vector of ints. 
+ */ +ASTCENC_SIMD_INLINE void printx(vint8 a) +{ + alignas(ASTCENC_VECALIGN) int v[8]; + storea(a, v); + printf("v8_i32:\n %08x %08x %08x %08x %08x %08x %08x %08x\n", + v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]); +} + +/** + * @brief Debug function to print a vector of floats. + */ +ASTCENC_SIMD_INLINE void print(vfloat8 a) +{ + alignas(ASTCENC_VECALIGN) float v[8]; + storea(a, v); + printf("v8_f32:\n %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f %0.4f\n", + static_cast<double>(v[0]), static_cast<double>(v[1]), + static_cast<double>(v[2]), static_cast<double>(v[3]), + static_cast<double>(v[4]), static_cast<double>(v[5]), + static_cast<double>(v[6]), static_cast<double>(v[7])); +} + +/** + * @brief Debug function to print a vector of masks. + */ +ASTCENC_SIMD_INLINE void print(vmask8 a) +{ + print(select(vint8(0), vint8(1), a)); +} + +#endif // #ifndef ASTC_VECMATHLIB_AVX2_8_H_INCLUDED diff --git a/thirdparty/astcenc/astcenc_vecmathlib_common_4.h b/thirdparty/astcenc/astcenc_vecmathlib_common_4.h new file mode 100644 index 0000000000..86ee4fd3e1 --- /dev/null +++ b/thirdparty/astcenc/astcenc_vecmathlib_common_4.h @@ -0,0 +1,423 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2020-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Generic 4x32-bit vector functions. + * + * This module implements generic 4-wide vector functions that are valid for + * all instruction sets, typically implemented using lower level 4-wide + * operations that are ISA-specific. + */ + +#ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED +#define ASTC_VECMATHLIB_COMMON_4_H_INCLUDED + +#ifndef ASTCENC_SIMD_INLINE + #error "Include astcenc_vecmathlib.h, do not include directly" +#endif + +#include <cstdio> + +// ============================================================================ +// vmask4 operators and functions +// ============================================================================ + +/** + * @brief True if any lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool any(vmask4 a) +{ + return mask(a) != 0; +} + +/** + * @brief True if all lanes are enabled, false otherwise. + */ +ASTCENC_SIMD_INLINE bool all(vmask4 a) +{ + return mask(a) == 0xF; +} + +// ============================================================================ +// vint4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by scalar addition. + */ +ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, int b) +{ + return a + vint4(b); +} + +/** + * @brief Overload: vector by vector incremental addition. + */ +ASTCENC_SIMD_INLINE vint4& operator+=(vint4& a, const vint4& b) +{ + a = a + b; + return a; +} + +/** + * @brief Overload: vector by scalar subtraction. 
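+ *
+ * The scalar operand is broadcast to all four lanes; an illustrative
+ * example:
+ *
+ * @code
+ *   vint4 v(10, 20, 30, 40);
+ *   vint4 r = v - 5; // { 5, 15, 25, 35 }
+ * @endcode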
+ */ +ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, int b) +{ + return a - vint4(b); +} + +/** + * @brief Overload: vector by scalar multiplication. + */ +ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, int b) +{ + return a * vint4(b); +} + +/** + * @brief Overload: vector by scalar bitwise or. + */ +ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, int b) +{ + return a | vint4(b); +} + +/** + * @brief Overload: vector by scalar bitwise and. + */ +ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, int b) +{ + return a & vint4(b); +} + +/** + * @brief Overload: vector by scalar bitwise xor. + */ +ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, int b) +{ + return a ^ vint4(b); +} + +/** + * @brief Return the clamped value between min and max. + */ +ASTCENC_SIMD_INLINE vint4 clamp(int minv, int maxv, vint4 a) +{ + return min(max(a, vint4(minv)), vint4(maxv)); +} + +/** + * @brief Return the horizontal sum of RGB vector lanes as a scalar. + */ +ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a) +{ + return a.lane<0>() + a.lane<1>() + a.lane<2>(); +} + +// ============================================================================ +// vfloat4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector incremental addition. + */ +ASTCENC_SIMD_INLINE vfloat4& operator+=(vfloat4& a, const vfloat4& b) +{ + a = a + b; + return a; +} + +/** + * @brief Overload: vector by scalar addition. + */ +ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, float b) +{ + return a + vfloat4(b); +} + +/** + * @brief Overload: vector by scalar subtraction. + */ +ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, float b) +{ + return a - vfloat4(b); +} + +/** + * @brief Overload: vector by scalar multiplication. + */ +ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, float b) +{ + return a * vfloat4(b); +} + +/** + * @brief Overload: scalar by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat4 operator*(float a, vfloat4 b) +{ + return vfloat4(a) * b; +} + +/** + * @brief Overload: vector by scalar division. + */ +ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, float b) +{ + return a / vfloat4(b); +} + +/** + * @brief Overload: scalar by vector division. + */ +ASTCENC_SIMD_INLINE vfloat4 operator/(float a, vfloat4 b) +{ + return vfloat4(a) / b; +} + +/** + * @brief Return the min vector of a vector and a scalar. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, float b) +{ + return min(a, vfloat4(b)); +} + +/** + * @brief Return the max vector of a vector and a scalar. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, float b) +{ + return max(a, vfloat4(b)); +} + +/** + * @brief Return the clamped value between min and max. + * + * It is assumed that neither @c min nor @c max are NaN values. If @c a is NaN + * then @c min will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a) +{ + // Do not reorder - second operand will return if either is NaN + return min(max(a, minv), maxv); +} + +/** + * @brief Return the clamped value between 0.0f and max. + * + * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will + * be returned for that lane. 
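+ *
+ * The flush-to-zero behavior falls out of the operand order: max() returns
+ * its second operand when the first is NaN, so a NaN lane is replaced by
+ * 0.0f before min() is applied.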
+ */ +ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a) +{ + // Do not reorder - second operand will return if either is NaN + return min(max(a, vfloat4::zero()), maxv); +} + +/** + * @brief Return the clamped value between 0.0f and 1.0f. + * + * If @c a is NaN then zero will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 clampzo(vfloat4 a) +{ + // Do not reorder - second operand will return if either is NaN + return min(max(a, vfloat4::zero()), 1.0f); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE float hmin_s(vfloat4 a) +{ + return hmin(a).lane<0>(); +} + +/** + * @brief Return the horizontal min of RGB vector lanes as a scalar. + */ +ASTCENC_SIMD_INLINE float hmin_rgb_s(vfloat4 a) +{ + a.set_lane<3>(a.lane<0>()); + return hmin_s(a); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE float hmax_s(vfloat4 a) +{ + return hmax(a).lane<0>(); +} + +/** + * @brief Accumulate lane-wise sums for a vector. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a) +{ + accum = accum + a; +} + +/** + * @brief Accumulate lane-wise sums for a masked vector. + */ +ASTCENC_SIMD_INLINE void haccumulate(vfloat4& accum, vfloat4 a, vmask4 m) +{ + a = select(vfloat4::zero(), a, m); + haccumulate(accum, a); +} + +/** + * @brief Return the horizontal sum of RGB vector lanes as a scalar. + */ +ASTCENC_SIMD_INLINE float hadd_rgb_s(vfloat4 a) +{ + return a.lane<0>() + a.lane<1>() + a.lane<2>(); +} + +#if !defined(ASTCENC_USE_NATIVE_DOT_PRODUCT) + +/** + * @brief Return the dot product for the full 4 lanes, returning scalar. + */ +ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b) +{ + vfloat4 m = a * b; + return hadd_s(m); +} + +/** + * @brief Return the dot product for the full 4 lanes, returning vector. + */ +ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b) +{ + vfloat4 m = a * b; + return vfloat4(hadd_s(m)); +} + +/** + * @brief Return the dot product for the bottom 3 lanes, returning scalar. + */ +ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b) +{ + vfloat4 m = a * b; + return hadd_rgb_s(m); +} + +/** + * @brief Return the dot product for the bottom 3 lanes, returning vector. + */ +ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) +{ + vfloat4 m = a * b; + float d3 = hadd_rgb_s(m); + return vfloat4(d3, d3, d3, 0.0f); +} + +#endif + +#if !defined(ASTCENC_USE_NATIVE_POPCOUNT) + +/** + * @brief Population bit count. + * + * @param v The value to population count. + * + * @return The number of 1 bits. + */ +static inline int popcount(uint64_t v) +{ + uint64_t mask1 = 0x5555555555555555ULL; + uint64_t mask2 = 0x3333333333333333ULL; + uint64_t mask3 = 0x0F0F0F0F0F0F0F0FULL; + v -= (v >> 1) & mask1; + v = (v & mask2) + ((v >> 2) & mask2); + v += v >> 4; + v &= mask3; + v *= 0x0101010101010101ULL; + v >>= 56; + return static_cast<int>(v); +} + +#endif + +/** + * @brief Apply signed bit transfer. + * + * @param input0 The first encoded endpoint. + * @param input1 The second encoded endpoint. + */ +static ASTCENC_SIMD_INLINE void bit_transfer_signed( + vint4& input0, + vint4& input1 +) { + input1 = lsr<1>(input1) | (input0 & 0x80); + input0 = lsr<1>(input0) & 0x3F; + + vmask4 mask = (input0 & 0x20) != vint4::zero(); + input0 = select(input0, input0 - 0x40, mask); +} + +/** + * @brief Debug function to print a vector of ints. 
+ */ +ASTCENC_SIMD_INLINE void print(vint4 a) +{ + alignas(16) int v[4]; + storea(a, v); + printf("v4_i32:\n %8d %8d %8d %8d\n", + v[0], v[1], v[2], v[3]); +} + +/** + * @brief Debug function to print a vector of ints. + */ +ASTCENC_SIMD_INLINE void printx(vint4 a) +{ + alignas(16) int v[4]; + storea(a, v); + printf("v4_i32:\n %08x %08x %08x %08x\n", + v[0], v[1], v[2], v[3]); +} + +/** + * @brief Debug function to print a vector of floats. + */ +ASTCENC_SIMD_INLINE void print(vfloat4 a) +{ + alignas(16) float v[4]; + storea(a, v); + printf("v4_f32:\n %0.4f %0.4f %0.4f %0.4f\n", + static_cast<double>(v[0]), static_cast<double>(v[1]), + static_cast<double>(v[2]), static_cast<double>(v[3])); +} + +/** + * @brief Debug function to print a vector of masks. + */ +ASTCENC_SIMD_INLINE void print(vmask4 a) +{ + print(select(vint4(0), vint4(1), a)); +} + +#endif // #ifndef ASTC_VECMATHLIB_COMMON_4_H_INCLUDED diff --git a/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h b/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h new file mode 100644 index 0000000000..e742eae6cb --- /dev/null +++ b/thirdparty/astcenc/astcenc_vecmathlib_neon_4.h @@ -0,0 +1,1072 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2019-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief 4x32-bit vectors, implemented using Armv8-A NEON. + * + * This module implements 4-wide 32-bit float, int, and mask vectors for + * Armv8-A NEON. + * + * There is a baseline level of functionality provided by all vector widths and + * implementations. This is implemented using identical function signatures, + * modulo data type, so we can use them as substitutable implementations in VLA + * code. + * + * The 4-wide vectors are also used as a fixed-width type, and significantly + * extend the functionality above that available to VLA code. + */ + +#ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED +#define ASTC_VECMATHLIB_NEON_4_H_INCLUDED + +#ifndef ASTCENC_SIMD_INLINE + #error "Include astcenc_vecmathlib.h, do not include directly" +#endif + +#include <cstdio> + +// ============================================================================ +// vfloat4 data type +// ============================================================================ + +/** + * @brief Data type for 4-wide floats. + */ +struct vfloat4 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vfloat4() = default; + + /** + * @brief Construct from 4 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(const float *p) + { + m = vld1q_f32(p); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using zero() for constexpr zeros. 
+ */ + ASTCENC_SIMD_INLINE explicit vfloat4(float a) + { + m = vdupq_n_f32(a); + } + + /** + * @brief Construct from 4 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d) + { + float v[4] { a, b, c, d }; + m = vld1q_f32(v); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(float32x4_t a) + { + m = a; + } + + /** + * @brief Get the scalar value of a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE float lane() const + { + return vgetq_lane_f32(m, l); + } + + /** + * @brief Set the scalar value of a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE void set_lane(float a) + { + m = vsetq_lane_f32(a, m, l); + } + + /** + * @brief Factory that returns a vector of zeros. + */ + static ASTCENC_SIMD_INLINE vfloat4 zero() + { + return vfloat4(vdupq_n_f32(0.0f)); + } + + /** + * @brief Factory that returns a replicated scalar loaded from memory. + */ + static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p) + { + return vfloat4(vld1q_dup_f32(p)); + } + + /** + * @brief Factory that returns a vector loaded from 16B aligned memory. + */ + static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p) + { + return vfloat4(vld1q_f32(p)); + } + + /** + * @brief Factory that returns a vector containing the lane IDs. + */ + static ASTCENC_SIMD_INLINE vfloat4 lane_id() + { + alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f }; + return vfloat4(vld1q_f32(data)); + } + + /** + * @brief Return a swizzled float 2. + */ + template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const + { + return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f); + } + + /** + * @brief Return a swizzled float 3. + */ + template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const + { + return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f); + } + + /** + * @brief Return a swizzled float 4. + */ + template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const + { + return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>()); + } + + /** + * @brief The vector ... + */ + float32x4_t m; +}; + +// ============================================================================ +// vint4 data type +// ============================================================================ + +/** + * @brief Data type for 4-wide ints. + */ +struct vint4 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vint4() = default; + + /** + * @brief Construct from 4 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vint4(const int *p) + { + m = vld1q_s32(p); + } + + /** + * @brief Construct from 4 uint8_t loaded from an unaligned address. + */ + ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p) + { + // Cast is safe - NEON loads are allowed to be unaligned + uint32x2_t t8 = vld1_dup_u32(reinterpret_cast<const uint32_t*>(p)); + uint16x4_t t16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(t8))); + m = vreinterpretq_s32_u32(vmovl_u16(t16)); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using vfloat4::zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vint4(int a) + { + m = vdupq_n_s32(a); + } + + /** + * @brief Construct from 4 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. 
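+ *
+ * For example (illustrative), vint4(1, 2, 3, 4).lane<0>() returns 1 and
+ * lane<3>() returns 4.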
+ */ + ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d) + { + int v[4] { a, b, c, d }; + m = vld1q_s32(v); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vint4(int32x4_t a) + { + m = a; + } + + /** + * @brief Get the scalar from a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE int lane() const + { + return vgetq_lane_s32(m, l); + } + + /** + * @brief Set the scalar value of a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE void set_lane(int a) + { + m = vsetq_lane_s32(a, m, l); + } + + /** + * @brief Factory that returns a vector of zeros. + */ + static ASTCENC_SIMD_INLINE vint4 zero() + { + return vint4(0); + } + + /** + * @brief Factory that returns a replicated scalar loaded from memory. + */ + static ASTCENC_SIMD_INLINE vint4 load1(const int* p) + { + return vint4(*p); + } + + /** + * @brief Factory that returns a vector loaded from 16B aligned memory. + */ + static ASTCENC_SIMD_INLINE vint4 loada(const int* p) + { + return vint4(p); + } + + /** + * @brief Factory that returns a vector containing the lane IDs. + */ + static ASTCENC_SIMD_INLINE vint4 lane_id() + { + alignas(16) static const int data[4] { 0, 1, 2, 3 }; + return vint4(vld1q_s32(data)); + } + + /** + * @brief The vector ... + */ + int32x4_t m; +}; + +// ============================================================================ +// vmask4 data type +// ============================================================================ + +/** + * @brief Data type for 4-wide control plane masks. + */ +struct vmask4 +{ + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask4(uint32x4_t a) + { + m = a; + } + +#if !defined(_MSC_VER) + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask4(int32x4_t a) + { + m = vreinterpretq_u32_s32(a); + } +#endif + + /** + * @brief Construct from 1 scalar value. + */ + ASTCENC_SIMD_INLINE explicit vmask4(bool a) + { + m = vreinterpretq_u32_s32(vdupq_n_s32(a == true ? -1 : 0)); + } + + /** + * @brief Construct from 4 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d) + { + int v[4] { + a == true ? -1 : 0, + b == true ? -1 : 0, + c == true ? -1 : 0, + d == true ? -1 : 0 + }; + + int32x4_t ms = vld1q_s32(v); + m = vreinterpretq_u32_s32(ms); + } + + /** + * @brief Get the scalar from a single lane. + */ + template <int32_t l> ASTCENC_SIMD_INLINE uint32_t lane() const + { + return vgetq_lane_u32(m, l); + } + + /** + * @brief The vector ... + */ + uint32x4_t m; +}; + +// ============================================================================ +// vmask4 operators and functions +// ============================================================================ + +/** + * @brief Overload: mask union (or). + */ +ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b) +{ + return vmask4(vorrq_u32(a.m, b.m)); +} + +/** + * @brief Overload: mask intersect (and). + */ +ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b) +{ + return vmask4(vandq_u32(a.m, b.m)); +} + +/** + * @brief Overload: mask difference (xor). + */ +ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b) +{ + return vmask4(veorq_u32(a.m, b.m)); +} + +/** + * @brief Overload: mask invert (not). + */ +ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a) +{ + return vmask4(vmvnq_u32(a.m)); +} + +/** + * @brief Return a 4-bit mask code indicating mask status. 
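+ *
+ * Each enabled lane contributes one bit; for example (illustrative), lanes
+ * { true, false, true, false } produce 0b0101 == 5.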
+ * + * bit0 = lane 0 + */ +ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a) +{ + static const int shifta[4] { 0, 1, 2, 3 }; + static const int32x4_t shift = vld1q_s32(shifta); + + uint32x4_t tmp = vshrq_n_u32(a.m, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); +} + +// ============================================================================ +// vint4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b) +{ + return vint4(vaddq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b) +{ + return vint4(vsubq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b) +{ + return vint4(vmulq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector bit invert. + */ +ASTCENC_SIMD_INLINE vint4 operator~(vint4 a) +{ + return vint4(vmvnq_s32(a.m)); +} + +/** + * @brief Overload: vector by vector bitwise or. + */ +ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b) +{ + return vint4(vorrq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector bitwise and. + */ +ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b) +{ + return vint4(vandq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector bitwise xor. + */ +ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b) +{ + return vint4(veorq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b) +{ + return vmask4(vceqq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b) +{ + return ~vmask4(vceqq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b) +{ + return vmask4(vcltq_s32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b) +{ + return vmask4(vcgtq_s32(a.m, b.m)); +} + +/** + * @brief Logical shift left. + */ +template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a) +{ + return vint4(vshlq_s32(a.m, vdupq_n_s32(s))); +} + +/** + * @brief Logical shift right. + */ +template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a) +{ + uint32x4_t ua = vreinterpretq_u32_s32(a.m); + ua = vshlq_u32(ua, vdupq_n_s32(-s)); + return vint4(vreinterpretq_s32_u32(ua)); +} + +/** + * @brief Arithmetic shift right. + */ +template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a) +{ + return vint4(vshlq_s32(a.m, vdupq_n_s32(-s))); +} + +/** + * @brief Return the min vector of two vectors. + */ +ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b) +{ + return vint4(vminq_s32(a.m, b.m)); +} + +/** + * @brief Return the max vector of two vectors. + */ +ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b) +{ + return vint4(vmaxq_s32(a.m, b.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vint4 hmin(vint4 a) +{ + return vint4(vminvq_s32(a.m)); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vint4 hmax(vint4 a) +{ + return vint4(vmaxvq_s32(a.m)); +} + +/** + * @brief Return the horizontal sum of a vector. 
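+ *
+ * Uses a pairwise reduction, (a0 + a2) + (a1 + a3). Integer addition is
+ * exact, so the pairing only matters for symmetry with the float variant
+ * below.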
+ */ +ASTCENC_SIMD_INLINE int hadd_s(vint4 a) +{ + int32x2_t t = vadd_s32(vget_high_s32(a.m), vget_low_s32(a.m)); + return vget_lane_s32(vpadd_s32(t, t), 0); +} + +/** + * @brief Store a vector to a 16B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vint4 a, int* p) +{ + vst1q_s32(p, a.m); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vint4 a, int* p) +{ + vst1q_s32(p, a.m); +} + +/** + * @brief Store lowest N (vector width) bytes into an unaligned address. + */ +ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) +{ + vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0); +} + +/** + * @brief Gather N (vector width) indices from the array. + */ +ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices) +{ + alignas(16) int idx[4]; + storea(indices, idx); + alignas(16) int vals[4]; + vals[0] = base[idx[0]]; + vals[1] = base[idx[1]]; + vals[2] = base[idx[2]]; + vals[3] = base[idx[3]]; + return vint4(vals); +} + +/** + * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. + */ +ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +{ + alignas(16) uint8_t shuf[16] { + 0, 4, 8, 12, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + uint8x16_t idx = vld1q_u8(shuf); + int8x16_t av = vreinterpretq_s8_s32(a.m); + return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(av, idx))); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond) +{ + return vint4(vbslq_s32(cond.m, b.m, a.m)); +} + +// ============================================================================ +// vfloat4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b) +{ + return vfloat4(vaddq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b) +{ + return vfloat4(vsubq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b) +{ + return vfloat4(vmulq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector division. + */ +ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b) +{ + return vfloat4(vdivq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b) +{ + return vmask4(vceqq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b) +{ + return vmask4(vmvnq_u32(vceqq_f32(a.m, b.m))); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b) +{ + return vmask4(vcltq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b) +{ + return vmask4(vcgtq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than or equal. + */ +ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b) +{ + return vmask4(vcleq_f32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than or equal. + */ +ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b) +{ + return vmask4(vcgeq_f32(a.m, b.m)); +} + +/** + * @brief Return the min vector of two vectors. 
+ * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b) +{ + // Do not reorder - second operand will return if either is NaN + return vfloat4(vminnmq_f32(a.m, b.m)); +} + +/** + * @brief Return the max vector of two vectors. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b) +{ + // Do not reorder - second operand will return if either is NaN + return vfloat4(vmaxnmq_f32(a.m, b.m)); +} + +/** + * @brief Return the absolute value of the float vector. + */ +ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a) +{ + float32x4_t zero = vdupq_n_f32(0.0f); + float32x4_t inv = vsubq_f32(zero, a.m); + return vfloat4(vmaxq_f32(a.m, inv)); +} + +/** + * @brief Return a float rounded to the nearest integer value. + */ +ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a) +{ + return vfloat4(vrndnq_f32(a.m)); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a) +{ + return vfloat4(vminvq_f32(a.m)); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a) +{ + return vfloat4(vmaxvq_f32(a.m)); +} + +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a) +{ + // Perform halving add to ensure invariance; we cannot use vaddqv as this + // does (0 + 1 + 2 + 3) which is not invariant with x86 (0 + 2) + (1 + 3). + float32x2_t t = vadd_f32(vget_high_f32(a.m), vget_low_f32(a.m)); + return vget_lane_f32(vpadd_f32(t, t), 0); +} + +/** + * @brief Return the sqrt of the lanes in the vector. + */ +ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a) +{ + return vfloat4(vsqrtq_f32(a.m)); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) +{ + return vfloat4(vbslq_f32(cond.m, b.m, a.m)); +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) +{ + static const uint32x4_t msb = vdupq_n_u32(0x80000000u); + uint32x4_t mask = vcgeq_u32(cond.m, msb); + return vfloat4(vbslq_f32(mask, b.m, a.m)); +} + +/** + * @brief Load a vector of gathered results from an array; + */ +ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) +{ + alignas(16) int idx[4]; + storea(indices, idx); + alignas(16) float vals[4]; + vals[0] = base[idx[0]]; + vals[1] = base[idx[1]]; + vals[2] = base[idx[2]]; + vals[3] = base[idx[3]]; + return vfloat4(vals); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p) +{ + vst1q_f32(p, a.m); +} + +/** + * @brief Store a vector to a 16B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p) +{ + vst1q_f32(p, a.m); +} + +/** + * @brief Return a integer value for a float vector, using truncation. + */ +ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a) +{ + return vint4(vcvtq_s32_f32(a.m)); +} + +/** + * @brief Return a integer value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) +{ + a = round(a); + return vint4(vcvtq_s32_f32(a.m)); +} + +/** + * @brief Return a float value for an integer vector. 
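+ *
+ * Note that integer values of magnitude above 2^24 may round, as a float
+ * carries only 24 significand bits.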
+ */ +ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a) +{ + return vfloat4(vcvtq_f32_s32(a.m)); +} + +/** + * @brief Return a float16 value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a) +{ + // Generate float16 value + float16x4_t f16 = vcvt_f16_f32(a.m); + + // Convert each 16-bit float pattern to a 32-bit pattern + uint16x4_t u16 = vreinterpret_u16_f16(f16); + uint32x4_t u32 = vmovl_u16(u16); + return vint4(vreinterpretq_s32_u32(u32)); +} + +/** + * @brief Return a float16 value for a float scalar, using round-to-nearest. + */ +static inline uint16_t float_to_float16(float a) +{ + vfloat4 av(a); + return static_cast<uint16_t>(float_to_float16(av).lane<0>()); +} + +/** + * @brief Return a float value for a float16 vector. + */ +ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a) +{ + // Convert each 32-bit float pattern to a 16-bit pattern + uint32x4_t u32 = vreinterpretq_u32_s32(a.m); + uint16x4_t u16 = vmovn_u32(u32); + float16x4_t f16 = vreinterpret_f16_u16(u16); + + // Generate float16 value + return vfloat4(vcvt_f32_f16(f16)); +} + +/** + * @brief Return a float value for a float16 scalar. + */ +ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a) +{ + vint4 av(a); + return float16_to_float(av).lane<0>(); +} + +/** + * @brief Return a float value as an integer bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the first half of that flip. + */ +ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a) +{ + return vint4(vreinterpretq_s32_f32(a.m)); +} + +/** + * @brief Return a integer value as a float bit pattern (i.e. no conversion). + * + * It is a common trick to convert floats into integer bit patterns, perform + * some bit hackery based on knowledge they are IEEE 754 layout, and then + * convert them back again. This is the second half of that flip. + */ +ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v) +{ + return vfloat4(vreinterpretq_f32_s32(v.m)); +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p) +{ + t0p = t0; +} + + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p) +{ + t0p = t0; + t1p = t1; +} + +/** + * @brief Prepare a vtable lookup table for use with the native SIMD size. + */ +ASTCENC_SIMD_INLINE void vtable_prepare( + vint4 t0, vint4 t1, vint4 t2, vint4 t3, + vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p) +{ + t0p = t0; + t1p = t1; + t2p = t2; + t3p = t3; +} + +/** + * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes. + */ +ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) +{ + int8x16_t table { + vreinterpretq_s8_s32(t0.m) + }; + + // Set index byte above max index for unused bytes so table lookup returns zero + int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); + uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); + + return vint4(vreinterpretq_s32_s8(vqtbl1q_s8(table, idx_bytes))); +} + +/** + * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. 
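
The float16 helpers above keep each 16-bit pattern widened into one 32-bit vint4 lane rather than packing halves, so the lane count stays consistent with the rest of the library. A small usage sketch of the scalar overloads, assuming the vecmathlib headers are included (1.5f is exactly representable in binary16, so the round trip is lossless):

    uint16_t h = float_to_float16(1.5f);   // 0x3E00: sign 0, exponent 15, mantissa 0x200
    float    f = float16_to_float(h);      // exactly 1.5f again
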
+ */ +ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) +{ + int8x16x2_t table { + vreinterpretq_s8_s32(t0.m), + vreinterpretq_s8_s32(t1.m) + }; + + // Set index byte above max index for unused bytes so table lookup returns zero + int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); + uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); + + return vint4(vreinterpretq_s32_s8(vqtbl2q_s8(table, idx_bytes))); +} + +/** + * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + */ +ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) +{ + int8x16x4_t table { + vreinterpretq_s8_s32(t0.m), + vreinterpretq_s8_s32(t1.m), + vreinterpretq_s8_s32(t2.m), + vreinterpretq_s8_s32(t3.m) + }; + + // Set index byte above max index for unused bytes so table lookup returns zero + int32x4_t idx_masked = vorrq_s32(idx.m, vdupq_n_s32(0xFFFFFF00)); + uint8x16_t idx_bytes = vreinterpretq_u8_s32(idx_masked); + + return vint4(vreinterpretq_s32_s8(vqtbl4q_s8(table, idx_bytes))); +} + +/** + * @brief Return a vector of interleaved RGBA data. + * + * Input vectors have the value stored in the bottom 8 bits of each lane, + * with high bits set to zero. + * + * Output vector stores a single RGBA texel packed in each lane. + */ +ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a) +{ + return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a); +} + +/** + * @brief Store a vector, skipping masked lanes. + * + * All masked lanes must be at the end of vector, after all non-masked lanes. + */ +ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) +{ + if (mask.lane<3>()) + { + store(data, base); + } + else if (mask.lane<2>()) + { + base[0] = data.lane<0>(); + base[1] = data.lane<1>(); + base[2] = data.lane<2>(); + } + else if (mask.lane<1>()) + { + base[0] = data.lane<0>(); + base[1] = data.lane<1>(); + } + else if (mask.lane<0>()) + { + base[0] = data.lane<0>(); + } +} + +#define ASTCENC_USE_NATIVE_POPCOUNT 1 + +/** + * @brief Population bit count. + * + * @param v The value to population count. + * + * @return The number of 1 bits. + */ +ASTCENC_SIMD_INLINE int popcount(uint64_t v) +{ + return static_cast<int>(vaddlv_u8(vcnt_u8(vcreate_u8(v)))); +} + +#endif // #ifndef ASTC_VECMATHLIB_NEON_4_H_INCLUDED diff --git a/thirdparty/astcenc/astcenc_vecmathlib_none_4.h b/thirdparty/astcenc/astcenc_vecmathlib_none_4.h new file mode 100644 index 0000000000..d9b52be3e4 --- /dev/null +++ b/thirdparty/astcenc/astcenc_vecmathlib_none_4.h @@ -0,0 +1,1169 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2019-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief 4x32-bit vectors, implemented using plain C++. + * + * This module implements 4-wide 32-bit float, int, and mask vectors. 
This
+ * module provides a scalar fallback for VLA code, primarily useful for
+ * debugging VLA algorithms without the complexity of handling SIMD. Only the
+ * baseline level of functionality needed to support VLA is provided.
+ *
+ * Note that the vector conditional operators implemented by this module are
+ * designed to behave like SIMD conditional operators that generate lane masks.
+ * Rather than returning 0/1 booleans like normal C++ code they will return
+ * 0/-1 to give a full lane-width bitmask.
+ *
+ * Note that the documentation for this module still talks about "vectors" to
+ * help developers think about the implied VLA behavior when writing optimized
+ * paths.
+ */

+#ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED
+#define ASTC_VECMATHLIB_NONE_4_H_INCLUDED
+
+#ifndef ASTCENC_SIMD_INLINE
+	#error "Include astcenc_vecmathlib.h, do not include directly"
+#endif
+
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstring>
+#include <cfenv>
+
+// ============================================================================
+// vfloat4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide floats.
+ */
+struct vfloat4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vfloat4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with wider VLA vectors if data is
+	 * aligned to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(const float* p)
+	{
+		m[0] = p[0];
+		m[1] = p[1];
+		m[2] = p[2];
+		m[3] = p[3];
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a)
+	{
+		m[0] = a;
+		m[1] = a;
+		m[2] = a;
+		m[3] = a;
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d)
+	{
+		m[0] = a;
+		m[1] = b;
+		m[2] = c;
+		m[3] = d;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	{
+		return m[l];
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(float a)
+	{
+		m[l] = a;
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 zero()
+	{
+		return vfloat4(0.0f);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p)
+	{
+		return vfloat4(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p)
+	{
+		return vfloat4(p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
+	{
+		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
+	}
+
+	/**
+	 * @brief Return a swizzled float 2.
+	 */
+	template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(lane<l0>(), lane<l1>(), 0.0f, 0.0f);
+	}
+
+	/**
+	 * @brief Return a swizzled float 3.
+	 */
+	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), 0.0f);
+	}
+
+	/**
+	 * @brief Return a swizzled float 4.
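
A short sketch of the swizzle templates, assuming the vecmathlib headers are included: the 2-wide and 3-wide forms zero the unused upper lanes, which matters when the result feeds a horizontal reduction.

    vfloat4 v(1.0f, 2.0f, 3.0f, 4.0f);
    vfloat4 a = v.swz<2, 1, 0>();   // (3, 2, 1, 0) - lane 3 zeroed
    vfloat4 b = v.swz<3, 3>();      // (4, 4, 0, 0) - lanes 2 and 3 zeroed
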
+ */
+	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(lane<l0>(), lane<l1>(), lane<l2>(), lane<l3>());
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	float m[4];
+};
+
+// ============================================================================
+// vint4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide ints.
+ */
+struct vint4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using vint4::loada() which is better with wider VLA vectors
+	 * if data is aligned.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const int* p)
+	{
+		m[0] = p[0];
+		m[1] = p[1];
+		m[2] = p[2];
+		m[3] = p[3];
+	}
+
+	/**
+	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
+	{
+		m[0] = p[0];
+		m[1] = p[1];
+		m[2] = p[2];
+		m[3] = p[3];
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
+	{
+		m[0] = a;
+		m[1] = b;
+		m[2] = c;
+		m[3] = d;
+	}
+
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using vint4::zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a)
+	{
+		m[0] = a;
+		m[1] = a;
+		m[2] = a;
+		m[3] = a;
+	}
+
+	/**
+	 * @brief Get the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE int lane() const
+	{
+		return m[l];
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
+	{
+		m[l] = a;
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 zero()
+	{
+		return vint4(0);
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
+	{
+		return vint4(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
+	{
+		return vint4(p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 lane_id()
+	{
+		return vint4(0, 1, 2, 3);
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	int m[4];
+};
+
+// ============================================================================
+// vmask4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide control plane masks.
+ */
+struct vmask4
+{
+	/**
+	 * @brief Construct from an existing mask value.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(int* p)
+	{
+		m[0] = p[0];
+		m[1] = p[1];
+		m[2] = p[2];
+		m[3] = p[3];
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(bool a)
+	{
+		m[0] = a == false ? 0 : -1;
+		m[1] = a == false ? 0 : -1;
+		m[2] = a == false ? 0 : -1;
+		m[3] = a == false ? 0 : -1;
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d)
+	{
+		m[0] = a == false ? 0 : -1;
+		m[1] = b == false ? 0 : -1;
+		m[2] = c == false ? 0 : -1;
+		m[3] = d == false ?
0 : -1;
+	}
+
+
+	/**
+	 * @brief The vector ...
+	 */
+	int m[4];
+};
+
+// ============================================================================
+// vmask4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: mask union (or).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b)
+{
+	return vmask4(a.m[0] | b.m[0],
+	              a.m[1] | b.m[1],
+	              a.m[2] | b.m[2],
+	              a.m[3] | b.m[3]);
+}
+
+/**
+ * @brief Overload: mask intersect (and).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b)
+{
+	return vmask4(a.m[0] & b.m[0],
+	              a.m[1] & b.m[1],
+	              a.m[2] & b.m[2],
+	              a.m[3] & b.m[3]);
+}
+
+/**
+ * @brief Overload: mask difference (xor).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b)
+{
+	return vmask4(a.m[0] ^ b.m[0],
+	              a.m[1] ^ b.m[1],
+	              a.m[2] ^ b.m[2],
+	              a.m[3] ^ b.m[3]);
+}
+
+/**
+ * @brief Overload: mask invert (not).
+ */
+ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a)
+{
+	return vmask4(~a.m[0],
+	              ~a.m[1],
+	              ~a.m[2],
+	              ~a.m[3]);
+}
+
+/**
+ * @brief Return a 4-bit mask code indicating mask status.
+ *
+ * bit0 = lane 0
+ */
+ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a)
+{
+	return ((a.m[0] >> 31) & 0x1) |
+	       ((a.m[1] >> 30) & 0x2) |
+	       ((a.m[2] >> 29) & 0x4) |
+	       ((a.m[3] >> 28) & 0x8);
+}
+
+// ============================================================================
+// vint4 operators and functions
+// ============================================================================
+
+/**
+ * @brief Overload: vector by vector addition.
+ */
+ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] + b.m[0],
+	             a.m[1] + b.m[1],
+	             a.m[2] + b.m[2],
+	             a.m[3] + b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector subtraction.
+ */
+ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] - b.m[0],
+	             a.m[1] - b.m[1],
+	             a.m[2] - b.m[2],
+	             a.m[3] - b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector multiplication.
+ */
+ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] * b.m[0],
+	             a.m[1] * b.m[1],
+	             a.m[2] * b.m[2],
+	             a.m[3] * b.m[3]);
+}
+
+/**
+ * @brief Overload: vector bit invert.
+ */
+ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
+{
+	return vint4(~a.m[0],
+	             ~a.m[1],
+	             ~a.m[2],
+	             ~a.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] | b.m[0],
+	             a.m[1] | b.m[1],
+	             a.m[2] | b.m[2],
+	             a.m[3] | b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] & b.m[0],
+	             a.m[1] & b.m[1],
+	             a.m[2] & b.m[2],
+	             a.m[3] & b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] ^ b.m[0],
+	             a.m[1] ^ b.m[1],
+	             a.m[2] ^ b.m[2],
+	             a.m[3] ^ b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
+{
+	return vmask4(a.m[0] == b.m[0],
+	              a.m[1] == b.m[1],
+	              a.m[2] == b.m[2],
+	              a.m[3] == b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
+{
+	return vmask4(a.m[0] != b.m[0],
+	              a.m[1] != b.m[1],
+	              a.m[2] != b.m[2],
+	              a.m[3] != b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector less than.
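
Since mask() above packs one bit per lane (bit 0 = lane 0), whole-vector tests reduce to plain integer compares. A minimal usage sketch, assuming the vecmathlib headers are included:

    vint4 a(1, 5, 3, 7);
    vint4 b(2, 2, 2, 2);

    unsigned int bits = mask(a > b);  // 0b1110: lanes 1..3 pass, lane 0 fails

    bool any_set = bits != 0u;   // at least one lane passed
    bool all_set = bits == 0xFu; // every lane passed
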
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
+{
+	return vmask4(a.m[0] < b.m[0],
+	              a.m[1] < b.m[1],
+	              a.m[2] < b.m[2],
+	              a.m[3] < b.m[3]);
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
+{
+	return vmask4(a.m[0] > b.m[0],
+	              a.m[1] > b.m[1],
+	              a.m[2] > b.m[2],
+	              a.m[3] > b.m[3]);
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
+{
+	return vint4(a.m[0] << s,
+	             a.m[1] << s,
+	             a.m[2] << s,
+	             a.m[3] << s);
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
+{
+	unsigned int as0 = static_cast<unsigned int>(a.m[0]) >> s;
+	unsigned int as1 = static_cast<unsigned int>(a.m[1]) >> s;
+	unsigned int as2 = static_cast<unsigned int>(a.m[2]) >> s;
+	unsigned int as3 = static_cast<unsigned int>(a.m[3]) >> s;
+
+	return vint4(static_cast<int>(as0),
+	             static_cast<int>(as1),
+	             static_cast<int>(as2),
+	             static_cast<int>(as3));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
+{
+	return vint4(a.m[0] >> s,
+	             a.m[1] >> s,
+	             a.m[2] >> s,
+	             a.m[3] >> s);
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] < b.m[0] ? a.m[0] : b.m[0],
+	             a.m[1] < b.m[1] ? a.m[1] : b.m[1],
+	             a.m[2] < b.m[2] ? a.m[2] : b.m[2],
+	             a.m[3] < b.m[3] ? a.m[3] : b.m[3]);
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
+{
+	return vint4(a.m[0] > b.m[0] ? a.m[0] : b.m[0],
+	             a.m[1] > b.m[1] ? a.m[1] : b.m[1],
+	             a.m[2] > b.m[2] ? a.m[2] : b.m[2],
+	             a.m[3] > b.m[3] ? a.m[3] : b.m[3]);
+}
+
+/**
+ * @brief Return the horizontal minimum of a single vector.
+ */
+ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
+{
+	int b = std::min(a.m[0], a.m[1]);
+	int c = std::min(a.m[2], a.m[3]);
+	return vint4(std::min(b, c));
+}
+
+/**
+ * @brief Return the horizontal maximum of a single vector.
+ */
+ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
+{
+	int b = std::max(a.m[0], a.m[1]);
+	int c = std::max(a.m[2], a.m[3]);
+	return vint4(std::max(b, c));
+}
+
+/**
+ * @brief Return the horizontal sum of vector lanes as a scalar.
+ */
+ASTCENC_SIMD_INLINE int hadd_s(vint4 a)
+{
+	return a.m[0] + a.m[1] + a.m[2] + a.m[3];
+}
+
+/**
+ * @brief Store a vector to an aligned memory address.
+ */
+ASTCENC_SIMD_INLINE void storea(vint4 a, int* p)
+{
+	p[0] = a.m[0];
+	p[1] = a.m[1];
+	p[2] = a.m[2];
+	p[3] = a.m[3];
+}
+
+/**
+ * @brief Store a vector to an unaligned memory address.
+ */
+ASTCENC_SIMD_INLINE void store(vint4 a, int* p)
+{
+	p[0] = a.m[0];
+	p[1] = a.m[1];
+	p[2] = a.m[2];
+	p[3] = a.m[3];
+}
+
+/**
+ * @brief Store lowest N (vector width) bytes into an unaligned address.
+ */
+ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
+{
+	int* pi = reinterpret_cast<int*>(p);
+	*pi = a.m[0];
+}
+
+/**
+ * @brief Gather N (vector width) indices from the array.
+ */
+ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
+{
+	return vint4(base[indices.m[0]],
+	             base[indices.m[1]],
+	             base[indices.m[2]],
+	             base[indices.m[3]]);
+}
+
+/**
+ * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
+ */ +ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +{ + int b0 = a.m[0] & 0xFF; + int b1 = a.m[1] & 0xFF; + int b2 = a.m[2] & 0xFF; + int b3 = a.m[3] & 0xFF; + + int b = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24); + return vint4(b, 0, 0, 0); +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond) +{ + return vint4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0], + (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1], + (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2], + (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]); +} + +// ============================================================================ +// vfloat4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b) +{ + return vfloat4(a.m[0] + b.m[0], + a.m[1] + b.m[1], + a.m[2] + b.m[2], + a.m[3] + b.m[3]); +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b) +{ + return vfloat4(a.m[0] - b.m[0], + a.m[1] - b.m[1], + a.m[2] - b.m[2], + a.m[3] - b.m[3]); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b) +{ + return vfloat4(a.m[0] * b.m[0], + a.m[1] * b.m[1], + a.m[2] * b.m[2], + a.m[3] * b.m[3]); +} + +/** + * @brief Overload: vector by vector division. + */ +ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b) +{ + return vfloat4(a.m[0] / b.m[0], + a.m[1] / b.m[1], + a.m[2] / b.m[2], + a.m[3] / b.m[3]); +} + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b) +{ + return vmask4(a.m[0] == b.m[0], + a.m[1] == b.m[1], + a.m[2] == b.m[2], + a.m[3] == b.m[3]); +} + +/** + * @brief Overload: vector by vector inequality. + */ +ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b) +{ + return vmask4(a.m[0] != b.m[0], + a.m[1] != b.m[1], + a.m[2] != b.m[2], + a.m[3] != b.m[3]); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b) +{ + return vmask4(a.m[0] < b.m[0], + a.m[1] < b.m[1], + a.m[2] < b.m[2], + a.m[3] < b.m[3]); +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b) +{ + return vmask4(a.m[0] > b.m[0], + a.m[1] > b.m[1], + a.m[2] > b.m[2], + a.m[3] > b.m[3]); +} + +/** + * @brief Overload: vector by vector less than or equal. + */ +ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b) +{ + return vmask4(a.m[0] <= b.m[0], + a.m[1] <= b.m[1], + a.m[2] <= b.m[2], + a.m[3] <= b.m[3]); +} + +/** + * @brief Overload: vector by vector greater than or equal. + */ +ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b) +{ + return vmask4(a.m[0] >= b.m[0], + a.m[1] >= b.m[1], + a.m[2] >= b.m[2], + a.m[3] >= b.m[3]); +} + +/** + * @brief Return the min vector of two vectors. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b) +{ + return vfloat4(a.m[0] < b.m[0] ? a.m[0] : b.m[0], + a.m[1] < b.m[1] ? a.m[1] : b.m[1], + a.m[2] < b.m[2] ? a.m[2] : b.m[2], + a.m[3] < b.m[3] ? a.m[3] : b.m[3]); +} + +/** + * @brief Return the max vector of two vectors. 
+ * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b) +{ + return vfloat4(a.m[0] > b.m[0] ? a.m[0] : b.m[0], + a.m[1] > b.m[1] ? a.m[1] : b.m[1], + a.m[2] > b.m[2] ? a.m[2] : b.m[2], + a.m[3] > b.m[3] ? a.m[3] : b.m[3]); +} + +/** + * @brief Return the absolute value of the float vector. + */ +ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a) +{ + return vfloat4(std::abs(a.m[0]), + std::abs(a.m[1]), + std::abs(a.m[2]), + std::abs(a.m[3])); +} + +/** + * @brief Return a float rounded to the nearest integer value. + */ +ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a) +{ + assert(std::fegetround() == FE_TONEAREST); + return vfloat4(std::nearbyint(a.m[0]), + std::nearbyint(a.m[1]), + std::nearbyint(a.m[2]), + std::nearbyint(a.m[3])); +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a) +{ + float tmp1 = std::min(a.m[0], a.m[1]); + float tmp2 = std::min(a.m[2], a.m[3]); + return vfloat4(std::min(tmp1, tmp2)); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a) +{ + float tmp1 = std::max(a.m[0], a.m[1]); + float tmp2 = std::max(a.m[2], a.m[3]); + return vfloat4(std::max(tmp1, tmp2)); +} + +/** + * @brief Return the horizontal sum of a vector. + */ +ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a) +{ + // Use halving add, gives invariance with SIMD versions + return (a.m[0] + a.m[2]) + (a.m[1] + a.m[3]); +} + +/** + * @brief Return the sqrt of the lanes in the vector. + */ +ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a) +{ + return vfloat4(std::sqrt(a.m[0]), + std::sqrt(a.m[1]), + std::sqrt(a.m[2]), + std::sqrt(a.m[3])); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) +{ + return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0], + (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1], + (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2], + (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]); +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) +{ + return vfloat4((cond.m[0] & static_cast<int>(0x80000000)) ? b.m[0] : a.m[0], + (cond.m[1] & static_cast<int>(0x80000000)) ? b.m[1] : a.m[1], + (cond.m[2] & static_cast<int>(0x80000000)) ? b.m[2] : a.m[2], + (cond.m[3] & static_cast<int>(0x80000000)) ? b.m[3] : a.m[3]); +} + +/** + * @brief Load a vector of gathered results from an array; + */ +ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) +{ + return vfloat4(base[indices.m[0]], + base[indices.m[1]], + base[indices.m[2]], + base[indices.m[3]]); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vfloat4 a, float* ptr) +{ + ptr[0] = a.m[0]; + ptr[1] = a.m[1]; + ptr[2] = a.m[2]; + ptr[3] = a.m[3]; +} + +/** + * @brief Store a vector to an aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* ptr) +{ + ptr[0] = a.m[0]; + ptr[1] = a.m[1]; + ptr[2] = a.m[2]; + ptr[3] = a.m[3]; +} + +/** + * @brief Return a integer value for a float vector, using truncation. 
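
The select() family above enables branchless lane-wise choices driven by the comparison operators. A typical sketch, assuming the vecmathlib headers are included, clamping negative lanes to zero:

    vfloat4 v(-1.0f, 2.0f, -3.0f, 4.0f);
    vmask4 is_neg = v < vfloat4::zero();

    // Lanes come from the second operand where the mask is set
    vfloat4 clamped = select(v, vfloat4::zero(), is_neg);  // (0, 2, 0, 4)

select_msb() exists for masks derived from sign bits rather than full-width compare results; in this scalar backend the two implementations happen to behave identically, since both test the MSB.
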
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a)
+{
+	return vint4(static_cast<int>(a.m[0]),
+	             static_cast<int>(a.m[1]),
+	             static_cast<int>(a.m[2]),
+	             static_cast<int>(a.m[3]));
+}
+
+/**
+ * @brief Return an integer value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a)
+{
+	return vint4(static_cast<int>(a.m[0] + 0.5f),
+	             static_cast<int>(a.m[1] + 0.5f),
+	             static_cast<int>(a.m[2] + 0.5f),
+	             static_cast<int>(a.m[3] + 0.5f));
+}
+
+/**
+ * @brief Return a float value for an integer vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a)
+{
+	return vfloat4(static_cast<float>(a.m[0]),
+	               static_cast<float>(a.m[1]),
+	               static_cast<float>(a.m[2]),
+	               static_cast<float>(a.m[3]));
+}
+
+/**
+ * @brief Return a float16 value for a float vector, using round-to-nearest.
+ */
+ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a)
+{
+	return vint4(
+		float_to_sf16(a.lane<0>()),
+		float_to_sf16(a.lane<1>()),
+		float_to_sf16(a.lane<2>()),
+		float_to_sf16(a.lane<3>()));
+}
+
+/**
+ * @brief Return a float16 value for a float scalar, using round-to-nearest.
+ */
+static inline uint16_t float_to_float16(float a)
+{
+	return float_to_sf16(a);
+}
+
+/**
+ * @brief Return a float value for a float16 vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a)
+{
+	return vfloat4(
+		sf16_to_float(static_cast<uint16_t>(a.lane<0>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<1>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<2>())),
+		sf16_to_float(static_cast<uint16_t>(a.lane<3>())));
+}
+
+/**
+ * @brief Return a float value for a float16 scalar.
+ */
+ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
+{
+	return sf16_to_float(a);
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
+{
+	vint4 r;
+	memcpy(r.m, a.m, 4 * 4);
+	return r;
+}
+
+/**
+ * @brief Return an integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 a)
+{
+	vfloat4 r;
+	memcpy(r.m, a.m, 4 * 4);
+	return r;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
+{
+	t0p = t0;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
+{
+	t0p = t0;
+	t1p = t1;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+	vint4 t0, vint4 t1, vint4 t2, vint4 t3,
+	vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
+{
+	t0p = t0;
+	t1p = t1;
+	t2p = t2;
+	t3p = t3;
+}
+
+/**
+ * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ */ +ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx) +{ + uint8_t table[16]; + storea(t0, reinterpret_cast<int*>(table + 0)); + + return vint4(table[idx.lane<0>()], + table[idx.lane<1>()], + table[idx.lane<2>()], + table[idx.lane<3>()]); +} + + +/** + * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes. + */ +ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx) +{ + uint8_t table[32]; + storea(t0, reinterpret_cast<int*>(table + 0)); + storea(t1, reinterpret_cast<int*>(table + 16)); + + return vint4(table[idx.lane<0>()], + table[idx.lane<1>()], + table[idx.lane<2>()], + table[idx.lane<3>()]); +} + +/** + * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes. + */ +ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx) +{ + uint8_t table[64]; + storea(t0, reinterpret_cast<int*>(table + 0)); + storea(t1, reinterpret_cast<int*>(table + 16)); + storea(t2, reinterpret_cast<int*>(table + 32)); + storea(t3, reinterpret_cast<int*>(table + 48)); + + return vint4(table[idx.lane<0>()], + table[idx.lane<1>()], + table[idx.lane<2>()], + table[idx.lane<3>()]); +} + +/** + * @brief Return a vector of interleaved RGBA data. + * + * Input vectors have the value stored in the bottom 8 bits of each lane, + * with high bits set to zero. + * + * Output vector stores a single RGBA texel packed in each lane. + */ +ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a) +{ + return r + lsl<8>(g) + lsl<16>(b) + lsl<24>(a); +} + +/** + * @brief Store a vector, skipping masked lanes. + * + * All masked lanes must be at the end of vector, after all non-masked lanes. + */ +ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask) +{ + if (mask.m[3]) + { + store(data, base); + } + else if (mask.m[2]) + { + base[0] = data.lane<0>(); + base[1] = data.lane<1>(); + base[2] = data.lane<2>(); + } + else if (mask.m[1]) + { + base[0] = data.lane<0>(); + base[1] = data.lane<1>(); + } + else if (mask.m[0]) + { + base[0] = data.lane<0>(); + } +} + +#endif // #ifndef ASTC_VECMATHLIB_NONE_4_H_INCLUDED diff --git a/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h b/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h new file mode 100644 index 0000000000..26dcc4a891 --- /dev/null +++ b/thirdparty/astcenc/astcenc_vecmathlib_sse_4.h @@ -0,0 +1,1283 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2019-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief 4x32-bit vectors, implemented using SSE. + * + * This module implements 4-wide 32-bit float, int, and mask vectors for x86 + * SSE. The implementation requires at least SSE2, but higher levels of SSE can + * be selected at compile time to improve performance. 
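
store_lanes_masked() above is shaped for writing the ragged tail of an output row without touching memory past the end of the buffer, which is why the set lanes must be contiguous from lane 0. A short sketch storing three valid lanes, assuming the vecmathlib headers are included:

    int out[3];
    vint4 data(10, 20, 30, 99);            // lane 3 is padding
    vmask4 keep(true, true, true, false);  // set lanes first, as required

    store_lanes_masked(out, data, keep);   // writes exactly out[0..2]
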
+ * + * There is a baseline level of functionality provided by all vector widths and + * implementations. This is implemented using identical function signatures, + * modulo data type, so we can use them as substitutable implementations in VLA + * code. + * + * The 4-wide vectors are also used as a fixed-width type, and significantly + * extend the functionality above that available to VLA code. + */ + +#ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED +#define ASTC_VECMATHLIB_SSE_4_H_INCLUDED + +#ifndef ASTCENC_SIMD_INLINE + #error "Include astcenc_vecmathlib.h, do not include directly" +#endif + +#include <cstdio> + +// ============================================================================ +// vfloat4 data type +// ============================================================================ + +/** + * @brief Data type for 4-wide floats. + */ +struct vfloat4 +{ + /** + * @brief Construct from zero-initialized value. + */ + ASTCENC_SIMD_INLINE vfloat4() = default; + + /** + * @brief Construct from 4 values loaded from an unaligned address. + * + * Consider using loada() which is better with vectors if data is aligned + * to vector length. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(const float *p) + { + m = _mm_loadu_ps(p); + } + + /** + * @brief Construct from 1 scalar value replicated across all lanes. + * + * Consider using zero() for constexpr zeros. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(float a) + { + m = _mm_set1_ps(a); + } + + /** + * @brief Construct from 4 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(float a, float b, float c, float d) + { + m = _mm_set_ps(d, c, b, a); + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vfloat4(__m128 a) + { + m = a; + } + + /** + * @brief Get the scalar value of a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE float lane() const + { + return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)); + } + + /** + * @brief Set the scalar value of a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE void set_lane(float a) + { +#if ASTCENC_SSE >= 41 + __m128 v = _mm_set1_ps(a); + m = _mm_insert_ps(m, v, l << 6 | l << 4); +#else + alignas(16) float idx[4]; + _mm_store_ps(idx, m); + idx[l] = a; + m = _mm_load_ps(idx); +#endif + } + + /** + * @brief Factory that returns a vector of zeros. + */ + static ASTCENC_SIMD_INLINE vfloat4 zero() + { + return vfloat4(_mm_setzero_ps()); + } + + /** + * @brief Factory that returns a replicated scalar loaded from memory. + */ + static ASTCENC_SIMD_INLINE vfloat4 load1(const float* p) + { + return vfloat4(_mm_load_ps1(p)); + } + + /** + * @brief Factory that returns a vector loaded from 16B aligned memory. + */ + static ASTCENC_SIMD_INLINE vfloat4 loada(const float* p) + { + return vfloat4(_mm_load_ps(p)); + } + + /** + * @brief Factory that returns a vector containing the lane IDs. + */ + static ASTCENC_SIMD_INLINE vfloat4 lane_id() + { + return vfloat4(_mm_set_ps(3, 2, 1, 0)); + } + + /** + * @brief Return a swizzled float 2. + */ + template <int l0, int l1> ASTCENC_SIMD_INLINE vfloat4 swz() const + { + vfloat4 result(_mm_shuffle_ps(m, m, l0 | l1 << 2)); + result.set_lane<2>(0.0f); + result.set_lane<3>(0.0f); + return result; + } + + /** + * @brief Return a swizzled float 3. 
+ */
+	template <int l0, int l1, int l2> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		vfloat4 result(_mm_shuffle_ps(m, m, l0 | l1 << 2 | l2 << 4));
+		result.set_lane<3>(0.0f);
+		return result;
+	}
+
+	/**
+	 * @brief Return a swizzled float 4.
+	 */
+	template <int l0, int l1, int l2, int l3> ASTCENC_SIMD_INLINE vfloat4 swz() const
+	{
+		return vfloat4(_mm_shuffle_ps(m, m, l0 | l1 << 2 | l2 << 4 | l3 << 6));
+	}
+
+	/**
+	 * @brief The vector ...
+	 */
+	__m128 m;
+};
+
+// ============================================================================
+// vint4 data type
+// ============================================================================
+
+/**
+ * @brief Data type for 4-wide ints.
+ */
+struct vint4
+{
+	/**
+	 * @brief Construct from zero-initialized value.
+	 */
+	ASTCENC_SIMD_INLINE vint4() = default;
+
+	/**
+	 * @brief Construct from 4 values loaded from an unaligned address.
+	 *
+	 * Consider using loada() which is better with vectors if data is aligned
+	 * to vector length.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const int *p)
+	{
+		m = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
+	}
+
+	/**
+	 * @brief Construct from 4 uint8_t loaded from an unaligned address.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(const uint8_t *p)
+	{
+		// _mm_loadu_si32 would be nicer syntax, but missing on older GCC
+		__m128i t = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(p));
+
+#if ASTCENC_SSE >= 41
+		m = _mm_cvtepu8_epi32(t);
+#else
+		t = _mm_unpacklo_epi8(t, _mm_setzero_si128());
+		m = _mm_unpacklo_epi16(t, _mm_setzero_si128());
+#endif
+	}
+
+	/**
+	 * @brief Construct from 1 scalar value replicated across all lanes.
+	 *
+	 * Consider using vint4::zero() for constexpr zeros.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a)
+	{
+		m = _mm_set1_epi32(a);
+	}
+
+	/**
+	 * @brief Construct from 4 scalar values.
+	 *
+	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(int a, int b, int c, int d)
+	{
+		m = _mm_set_epi32(d, c, b, a);
+	}
+
+	/**
+	 * @brief Construct from an existing SIMD register.
+	 */
+	ASTCENC_SIMD_INLINE explicit vint4(__m128i a)
+	{
+		m = a;
+	}
+
+	/**
+	 * @brief Get the scalar from a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE int lane() const
+	{
+		return _mm_cvtsi128_si32(_mm_shuffle_epi32(m, l));
+	}
+
+	/**
+	 * @brief Set the scalar value of a single lane.
+	 */
+	template <int l> ASTCENC_SIMD_INLINE void set_lane(int a)
+	{
+#if ASTCENC_SSE >= 41
+		m = _mm_insert_epi32(m, a, l);
+#else
+		alignas(16) int idx[4];
+		_mm_store_si128(reinterpret_cast<__m128i*>(idx), m);
+		idx[l] = a;
+		m = _mm_load_si128(reinterpret_cast<const __m128i*>(idx));
+#endif
+	}
+
+	/**
+	 * @brief Factory that returns a vector of zeros.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 zero()
+	{
+		return vint4(_mm_setzero_si128());
+	}
+
+	/**
+	 * @brief Factory that returns a replicated scalar loaded from memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 load1(const int* p)
+	{
+		return vint4(*p);
+	}
+
+	/**
+	 * @brief Factory that returns a vector loaded from 16B aligned memory.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 loada(const int* p)
+	{
+		return vint4(_mm_load_si128(reinterpret_cast<const __m128i*>(p)));
+	}
+
+	/**
+	 * @brief Factory that returns a vector containing the lane IDs.
+	 */
+	static ASTCENC_SIMD_INLINE vint4 lane_id()
+	{
+		return vint4(_mm_set_epi32(3, 2, 1, 0));
+	}
+
+	/**
+	 * @brief The vector ...
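
The uint8_t constructor above widens four packed bytes into one 32-bit lane each, so a single RGBA8 texel can be split into per-channel lanes; interleave_rgba8() later in this header performs the inverse packing. A small sketch, assuming the vecmathlib headers are included:

    const uint8_t texel[4] { 255, 128, 64, 255 };
    vint4 rgba(texel);   // (255, 128, 64, 255): R, G, B, A, one channel per lane
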
+ */ + __m128i m; +}; + +// ============================================================================ +// vmask4 data type +// ============================================================================ + +/** + * @brief Data type for 4-wide control plane masks. + */ +struct vmask4 +{ + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask4(__m128 a) + { + m = a; + } + + /** + * @brief Construct from an existing SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask4(__m128i a) + { + m = _mm_castsi128_ps(a); + } + + /** + * @brief Construct from 1 scalar value. + */ + ASTCENC_SIMD_INLINE explicit vmask4(bool a) + { + vint4 mask(a == false ? 0 : -1); + m = _mm_castsi128_ps(mask.m); + } + + /** + * @brief Construct from 4 scalar values. + * + * The value of @c a is stored to lane 0 (LSB) in the SIMD register. + */ + ASTCENC_SIMD_INLINE explicit vmask4(bool a, bool b, bool c, bool d) + { + vint4 mask(a == false ? 0 : -1, + b == false ? 0 : -1, + c == false ? 0 : -1, + d == false ? 0 : -1); + + m = _mm_castsi128_ps(mask.m); + } + + /** + * @brief Get the scalar value of a single lane. + */ + template <int l> ASTCENC_SIMD_INLINE float lane() const + { + return _mm_cvtss_f32(_mm_shuffle_ps(m, m, l)); + } + + /** + * @brief The vector ... + */ + __m128 m; +}; + +// ============================================================================ +// vmask4 operators and functions +// ============================================================================ + +/** + * @brief Overload: mask union (or). + */ +ASTCENC_SIMD_INLINE vmask4 operator|(vmask4 a, vmask4 b) +{ + return vmask4(_mm_or_ps(a.m, b.m)); +} + +/** + * @brief Overload: mask intersect (and). + */ +ASTCENC_SIMD_INLINE vmask4 operator&(vmask4 a, vmask4 b) +{ + return vmask4(_mm_and_ps(a.m, b.m)); +} + +/** + * @brief Overload: mask difference (xor). + */ +ASTCENC_SIMD_INLINE vmask4 operator^(vmask4 a, vmask4 b) +{ + return vmask4(_mm_xor_ps(a.m, b.m)); +} + +/** + * @brief Overload: mask invert (not). + */ +ASTCENC_SIMD_INLINE vmask4 operator~(vmask4 a) +{ + return vmask4(_mm_xor_si128(_mm_castps_si128(a.m), _mm_set1_epi32(-1))); +} + +/** + * @brief Return a 4-bit mask code indicating mask status. + * + * bit0 = lane 0 + */ +ASTCENC_SIMD_INLINE unsigned int mask(vmask4 a) +{ + return static_cast<unsigned int>(_mm_movemask_ps(a.m)); +} + +// ============================================================================ +// vint4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vint4 operator+(vint4 a, vint4 b) +{ + return vint4(_mm_add_epi32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vint4 operator-(vint4 a, vint4 b) +{ + return vint4(_mm_sub_epi32(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vint4 operator*(vint4 a, vint4 b) +{ +#if ASTCENC_SSE >= 41 + return vint4(_mm_mullo_epi32 (a.m, b.m)); +#else + __m128i t1 = _mm_mul_epu32(a.m, b.m); + __m128i t2 = _mm_mul_epu32( + _mm_srli_si128(a.m, 4), + _mm_srli_si128(b.m, 4)); + __m128i r = _mm_unpacklo_epi32( + _mm_shuffle_epi32(t1, _MM_SHUFFLE (0, 0, 2, 0)), + _mm_shuffle_epi32(t2, _MM_SHUFFLE (0, 0, 2, 0))); + return vint4(r); +#endif +} + +/** + * @brief Overload: vector bit invert. 
+ */
+ASTCENC_SIMD_INLINE vint4 operator~(vint4 a)
+{
+	return vint4(_mm_xor_si128(a.m, _mm_set1_epi32(-1)));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise or.
+ */
+ASTCENC_SIMD_INLINE vint4 operator|(vint4 a, vint4 b)
+{
+	return vint4(_mm_or_si128(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise and.
+ */
+ASTCENC_SIMD_INLINE vint4 operator&(vint4 a, vint4 b)
+{
+	return vint4(_mm_and_si128(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector bitwise xor.
+ */
+ASTCENC_SIMD_INLINE vint4 operator^(vint4 a, vint4 b)
+{
+	return vint4(_mm_xor_si128(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector equality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator==(vint4 a, vint4 b)
+{
+	return vmask4(_mm_cmpeq_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector inequality.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator!=(vint4 a, vint4 b)
+{
+	return ~vmask4(_mm_cmpeq_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector less than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator<(vint4 a, vint4 b)
+{
+	return vmask4(_mm_cmplt_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Overload: vector by vector greater than.
+ */
+ASTCENC_SIMD_INLINE vmask4 operator>(vint4 a, vint4 b)
+{
+	return vmask4(_mm_cmpgt_epi32(a.m, b.m));
+}
+
+/**
+ * @brief Logical shift left.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsl(vint4 a)
+{
+	return vint4(_mm_slli_epi32(a.m, s));
+}
+
+/**
+ * @brief Logical shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 lsr(vint4 a)
+{
+	return vint4(_mm_srli_epi32(a.m, s));
+}
+
+/**
+ * @brief Arithmetic shift right.
+ */
+template <int s> ASTCENC_SIMD_INLINE vint4 asr(vint4 a)
+{
+	return vint4(_mm_srai_epi32(a.m, s));
+}
+
+/**
+ * @brief Return the min vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 min(vint4 a, vint4 b)
+{
+#if ASTCENC_SSE >= 41
+	return vint4(_mm_min_epi32(a.m, b.m));
+#else
+	vmask4 d = a < b;
+	__m128i ap = _mm_and_si128(_mm_castps_si128(d.m), a.m);
+	__m128i bp = _mm_andnot_si128(_mm_castps_si128(d.m), b.m);
+	return vint4(_mm_or_si128(ap,bp));
+#endif
+}
+
+/**
+ * @brief Return the max vector of two vectors.
+ */
+ASTCENC_SIMD_INLINE vint4 max(vint4 a, vint4 b)
+{
+#if ASTCENC_SSE >= 41
+	return vint4(_mm_max_epi32(a.m, b.m));
+#else
+	vmask4 d = a > b;
+	__m128i ap = _mm_and_si128(_mm_castps_si128(d.m), a.m);
+	__m128i bp = _mm_andnot_si128(_mm_castps_si128(d.m), b.m);
+	return vint4(_mm_or_si128(ap,bp));
+#endif
+}
+
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint4 hmin(vint4 a)
+{
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
+	a = min(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
+	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE vint4 hmax(vint4 a)
+{
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 3, 2))));
+	a = max(a, vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 1))));
+	return vint4(_mm_shuffle_epi32(a.m, _MM_SHUFFLE(0, 0, 0, 0)));
+}
+
+/**
+ * @brief Return the horizontal sum of a vector as a scalar.
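
The hmin()/hmax() reductions above use a log2-step fold: each shuffle lines up the upper half against the lower half so one min/max halves the active lane count, and a final broadcast splats lane 0 to all lanes. A scalar model of the same dataflow (the helper name is ours, for illustration):

    #include <algorithm>

    // Scalar model of the two-shuffle reduction used by hmin() above
    static int hmin_model(const int a[4])
    {
        int t0 = std::min(a[0], a[2]);  // fold lanes 2,3 onto lanes 0,1
        int t1 = std::min(a[1], a[3]);
        return std::min(t0, t1);        // fold lane 1 onto lane 0
    }
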
+ */ +ASTCENC_SIMD_INLINE int hadd_s(vint4 a) +{ + // Add top and bottom halves, lane 1/0 + __m128i fold = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(a.m), + _mm_castsi128_ps(a.m))); + __m128i t = _mm_add_epi32(a.m, fold); + + // Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow) + t = _mm_add_epi32(t, _mm_shuffle_epi32(t, 0x55)); + + return _mm_cvtsi128_si32(t); +} + +/** + * @brief Store a vector to a 16B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vint4 a, int* p) +{ + _mm_store_si128(reinterpret_cast<__m128i*>(p), a.m); +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vint4 a, int* p) +{ + // Cast due to missing intrinsics + _mm_storeu_ps(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m)); +} + +/** + * @brief Store lowest N (vector width) bytes into an unaligned address. + */ +ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p) +{ + // Cast due to missing intrinsics + _mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m)); +} + +/** + * @brief Gather N (vector width) indices from the array. + */ +ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices) +{ +#if ASTCENC_AVX >= 2 + return vint4(_mm_i32gather_epi32(base, indices.m, 4)); +#else + alignas(16) int idx[4]; + storea(indices, idx); + return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]); +#endif +} + +/** + * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector. + */ +ASTCENC_SIMD_INLINE vint4 pack_low_bytes(vint4 a) +{ +#if ASTCENC_SSE >= 41 + __m128i shuf = _mm_set_epi8(0,0,0,0, 0,0,0,0, 0,0,0,0, 12,8,4,0); + return vint4(_mm_shuffle_epi8(a.m, shuf)); +#else + __m128i va = _mm_unpacklo_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(1,1,1,1))); + __m128i vb = _mm_unpackhi_epi8(a.m, _mm_shuffle_epi32(a.m, _MM_SHUFFLE(3,3,3,3))); + return vint4(_mm_unpacklo_epi16(va, vb)); +#endif +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vint4 select(vint4 a, vint4 b, vmask4 cond) +{ + __m128i condi = _mm_castps_si128(cond.m); + +#if ASTCENC_SSE >= 41 + return vint4(_mm_blendv_epi8(a.m, b.m, condi)); +#else + return vint4(_mm_or_si128(_mm_and_si128(condi, b.m), _mm_andnot_si128(condi, a.m))); +#endif +} + +// ============================================================================ +// vfloat4 operators and functions +// ============================================================================ + +/** + * @brief Overload: vector by vector addition. + */ +ASTCENC_SIMD_INLINE vfloat4 operator+(vfloat4 a, vfloat4 b) +{ + return vfloat4(_mm_add_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector subtraction. + */ +ASTCENC_SIMD_INLINE vfloat4 operator-(vfloat4 a, vfloat4 b) +{ + return vfloat4(_mm_sub_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector multiplication. + */ +ASTCENC_SIMD_INLINE vfloat4 operator*(vfloat4 a, vfloat4 b) +{ + return vfloat4(_mm_mul_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector division. + */ +ASTCENC_SIMD_INLINE vfloat4 operator/(vfloat4 a, vfloat4 b) +{ + return vfloat4(_mm_div_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector equality. + */ +ASTCENC_SIMD_INLINE vmask4 operator==(vfloat4 a, vfloat4 b) +{ + return vmask4(_mm_cmpeq_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector inequality. 
+ */ +ASTCENC_SIMD_INLINE vmask4 operator!=(vfloat4 a, vfloat4 b) +{ + return vmask4(_mm_cmpneq_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than. + */ +ASTCENC_SIMD_INLINE vmask4 operator<(vfloat4 a, vfloat4 b) +{ + return vmask4(_mm_cmplt_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than. + */ +ASTCENC_SIMD_INLINE vmask4 operator>(vfloat4 a, vfloat4 b) +{ + return vmask4(_mm_cmpgt_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector less than or equal. + */ +ASTCENC_SIMD_INLINE vmask4 operator<=(vfloat4 a, vfloat4 b) +{ + return vmask4(_mm_cmple_ps(a.m, b.m)); +} + +/** + * @brief Overload: vector by vector greater than or equal. + */ +ASTCENC_SIMD_INLINE vmask4 operator>=(vfloat4 a, vfloat4 b) +{ + return vmask4(_mm_cmpge_ps(a.m, b.m)); +} + +/** + * @brief Return the min vector of two vectors. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 min(vfloat4 a, vfloat4 b) +{ + // Do not reorder - second operand will return if either is NaN + return vfloat4(_mm_min_ps(a.m, b.m)); +} + +/** + * @brief Return the max vector of two vectors. + * + * If either lane value is NaN, @c b will be returned for that lane. + */ +ASTCENC_SIMD_INLINE vfloat4 max(vfloat4 a, vfloat4 b) +{ + // Do not reorder - second operand will return if either is NaN + return vfloat4(_mm_max_ps(a.m, b.m)); +} + +/** + * @brief Return the absolute value of the float vector. + */ +ASTCENC_SIMD_INLINE vfloat4 abs(vfloat4 a) +{ + return vfloat4(_mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a.m), a.m)); +} + +/** + * @brief Return a float rounded to the nearest integer value. + */ +ASTCENC_SIMD_INLINE vfloat4 round(vfloat4 a) +{ +#if ASTCENC_SSE >= 41 + constexpr int flags = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC; + return vfloat4(_mm_round_ps(a.m, flags)); +#else + __m128 v = a.m; + __m128 neg_zero = _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(0x80000000))); + __m128 no_fraction = _mm_set1_ps(8388608.0f); + __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); + __m128 sign = _mm_and_ps(v, neg_zero); + __m128 s_magic = _mm_or_ps(no_fraction, sign); + __m128 r1 = _mm_add_ps(v, s_magic); + r1 = _mm_sub_ps(r1, s_magic); + __m128 r2 = _mm_and_ps(v, abs_mask); + __m128 mask = _mm_cmple_ps(r2, no_fraction); + r2 = _mm_andnot_ps(mask, v); + r1 = _mm_and_ps(r1, mask); + return vfloat4(_mm_xor_ps(r1, r2)); +#endif +} + +/** + * @brief Return the horizontal minimum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat4 hmin(vfloat4 a) +{ + a = min(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 3, 2)))); + a = min(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 1)))); + return vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 0))); +} + +/** + * @brief Return the horizontal maximum of a vector. + */ +ASTCENC_SIMD_INLINE vfloat4 hmax(vfloat4 a) +{ + a = max(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 3, 2)))); + a = max(a, vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 1)))); + return vfloat4(_mm_shuffle_ps(a.m, a.m, _MM_SHUFFLE(0, 0, 0, 0))); +} + +/** + * @brief Return the horizontal sum of a vector as a scalar. + */ +ASTCENC_SIMD_INLINE float hadd_s(vfloat4 a) +{ + // Add top and bottom halves, lane 1/0 + __m128 t = _mm_add_ps(a.m, _mm_movehl_ps(a.m, a.m)); + + // Add top and bottom halves, lane 0 (_mm_hadd_ps exists but slow) + t = _mm_add_ss(t, _mm_shuffle_ps(t, t, 0x55)); + + return _mm_cvtss_f32(t); +} + +/** + * @brief Return the sqrt of the lanes in the vector. 
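
The SSE2 fallback in round() above leans on the float format itself: 8388608.0f is 2^23, and once a magnitude reaches 2^23 a float has no fraction bits left, so adding and then subtracting it forces the hardware to round the fraction away under the current (round-to-nearest) mode. Inputs already at or beyond 2^23 are passed through via the compare mask. A scalar model under those assumptions (the helper name is ours):

    #include <cmath>

    // Scalar model of the add/subtract-2^23 rounding trick
    static float round_model(float v)
    {
        const float no_fraction = 8388608.0f;  // 2^23
        if (std::fabs(v) >= no_fraction)
        {
            return v;  // already integral; the trick is skipped, as the mask does
        }
        float magic = std::copysign(no_fraction, v);  // signed, like s_magic above
        return (v + magic) - magic;  // e.g. 5.3f -> 8388613.0f -> 5.0f
    }
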
+ */ +ASTCENC_SIMD_INLINE vfloat4 sqrt(vfloat4 a) +{ + return vfloat4(_mm_sqrt_ps(a.m)); +} + +/** + * @brief Return lanes from @c b if @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select(vfloat4 a, vfloat4 b, vmask4 cond) +{ +#if ASTCENC_SSE >= 41 + return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m)); +#else + return vfloat4(_mm_or_ps(_mm_and_ps(cond.m, b.m), _mm_andnot_ps(cond.m, a.m))); +#endif +} + +/** + * @brief Return lanes from @c b if MSB of @c cond is set, else @c a. + */ +ASTCENC_SIMD_INLINE vfloat4 select_msb(vfloat4 a, vfloat4 b, vmask4 cond) +{ +#if ASTCENC_SSE >= 41 + return vfloat4(_mm_blendv_ps(a.m, b.m, cond.m)); +#else + __m128 d = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(cond.m), 31)); + return vfloat4(_mm_or_ps(_mm_and_ps(d, b.m), _mm_andnot_ps(d, a.m))); +#endif +} + +/** + * @brief Load a vector of gathered results from an array; + */ +ASTCENC_SIMD_INLINE vfloat4 gatherf(const float* base, vint4 indices) +{ +#if ASTCENC_AVX >= 2 + return vfloat4(_mm_i32gather_ps(base, indices.m, 4)); +#else + alignas(16) int idx[4]; + storea(indices, idx); + return vfloat4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]); +#endif +} + +/** + * @brief Store a vector to an unaligned memory address. + */ +ASTCENC_SIMD_INLINE void store(vfloat4 a, float* p) +{ + _mm_storeu_ps(p, a.m); +} + +/** + * @brief Store a vector to a 16B aligned memory address. + */ +ASTCENC_SIMD_INLINE void storea(vfloat4 a, float* p) +{ + _mm_store_ps(p, a.m); +} + +/** + * @brief Return a integer value for a float vector, using truncation. + */ +ASTCENC_SIMD_INLINE vint4 float_to_int(vfloat4 a) +{ + return vint4(_mm_cvttps_epi32(a.m)); +} + +/** + * @brief Return a integer value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint4 float_to_int_rtn(vfloat4 a) +{ + a = round(a); + return vint4(_mm_cvttps_epi32(a.m)); +} + +/** + * @brief Return a float value for an integer vector. + */ +ASTCENC_SIMD_INLINE vfloat4 int_to_float(vint4 a) +{ + return vfloat4(_mm_cvtepi32_ps(a.m)); +} + +/** + * @brief Return a float16 value for a float vector, using round-to-nearest. + */ +ASTCENC_SIMD_INLINE vint4 float_to_float16(vfloat4 a) +{ +#if ASTCENC_F16C >= 1 + __m128i packedf16 = _mm_cvtps_ph(a.m, 0); + __m128i f16 = _mm_cvtepu16_epi32(packedf16); + return vint4(f16); +#else + return vint4( + float_to_sf16(a.lane<0>()), + float_to_sf16(a.lane<1>()), + float_to_sf16(a.lane<2>()), + float_to_sf16(a.lane<3>())); +#endif +} + +/** + * @brief Return a float16 value for a float scalar, using round-to-nearest. + */ +static inline uint16_t float_to_float16(float a) +{ +#if ASTCENC_F16C >= 1 + __m128i f16 = _mm_cvtps_ph(_mm_set1_ps(a), 0); + return static_cast<uint16_t>(_mm_cvtsi128_si32(f16)); +#else + return float_to_sf16(a); +#endif +} + +/** + * @brief Return a float value for a float16 vector. + */ +ASTCENC_SIMD_INLINE vfloat4 float16_to_float(vint4 a) +{ +#if ASTCENC_F16C >= 1 + __m128i packed = _mm_packs_epi32(a.m, a.m); + __m128 f32 = _mm_cvtph_ps(packed); + return vfloat4(f32); +#else + return vfloat4( + sf16_to_float(static_cast<uint16_t>(a.lane<0>())), + sf16_to_float(static_cast<uint16_t>(a.lane<1>())), + sf16_to_float(static_cast<uint16_t>(a.lane<2>())), + sf16_to_float(static_cast<uint16_t>(a.lane<3>()))); +#endif +} + +/** + * @brief Return a float value for a float16 scalar. 
+ */
+ASTCENC_SIMD_INLINE float float16_to_float(uint16_t a)
+{
+#if ASTCENC_F16C >= 1
+ __m128i packed = _mm_set1_epi16(static_cast<short>(a));
+ __m128 f32 = _mm_cvtph_ps(packed);
+ return _mm_cvtss_f32(f32);
+#else
+ return sf16_to_float(a);
+#endif
+}
+
+/**
+ * @brief Return a float value as an integer bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the first half of that flip.
+ */
+ASTCENC_SIMD_INLINE vint4 float_as_int(vfloat4 a)
+{
+ return vint4(_mm_castps_si128(a.m));
+}
+
+/**
+ * @brief Return an integer value as a float bit pattern (i.e. no conversion).
+ *
+ * It is a common trick to convert floats into integer bit patterns, perform
+ * some bit hackery based on knowledge they are IEEE 754 layout, and then
+ * convert them back again. This is the second half of that flip.
+ */
+ASTCENC_SIMD_INLINE vfloat4 int_as_float(vint4 v)
+{
+ return vfloat4(_mm_castsi128_ps(v.m));
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
+{
+ t0p = t0;
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
+{
+#if ASTCENC_SSE >= 41
+ // Store later tables as XOR deltas so lookups can XOR partial results together
+ t0p = t0;
+ t1p = t0 ^ t1;
+#else
+ t0p = t0;
+ t1p = t1;
+#endif
+}
+
+/**
+ * @brief Prepare a vtable lookup table for use with the native SIMD size.
+ */
+ASTCENC_SIMD_INLINE void vtable_prepare(
+ vint4 t0, vint4 t1, vint4 t2, vint4 t3,
+ vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
+{
+#if ASTCENC_SSE >= 41
+ // Store later tables as XOR deltas so lookups can XOR partial results together
+ t0p = t0;
+ t1p = t0 ^ t1;
+ t2p = t1 ^ t2;
+ t3p = t2 ^ t3;
+#else
+ t0p = t0;
+ t1p = t1;
+ t2p = t2;
+ t3p = t3;
+#endif
+}
+
+/**
+ * @brief Perform an 8-bit 16-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
+{
+#if ASTCENC_SSE >= 41
+ // Set index byte MSB to 1 for unused bytes so shuffle returns zero
+ __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+ __m128i result = _mm_shuffle_epi8(t0.m, idxx);
+ return vint4(result);
+#else
+ alignas(ASTCENC_VECALIGN) uint8_t table[16];
+ storea(t0, reinterpret_cast<int*>(table + 0));
+
+ return vint4(table[idx.lane<0>()],
+ table[idx.lane<1>()],
+ table[idx.lane<2>()],
+ table[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Perform an 8-bit 32-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
+{
+#if ASTCENC_SSE >= 41
+ // Set index byte MSB to 1 for unused bytes so shuffle returns zero
+ __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+ __m128i result = _mm_shuffle_epi8(t0.m, idxx);
+ idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+ __m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+ result = _mm_xor_si128(result, result2);
+
+ return vint4(result);
+#else
+ alignas(ASTCENC_VECALIGN) uint8_t table[32];
+ storea(t0, reinterpret_cast<int*>(table + 0));
+ storea(t1, reinterpret_cast<int*>(table + 16));
+
+ return vint4(table[idx.lane<0>()],
+ table[idx.lane<1>()],
+ table[idx.lane<2>()],
+ table[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Perform an 8-bit 64-entry table lookup, with 32-bit indexes.
+ */
+ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
+{
+#if ASTCENC_SSE >= 41
+ // Set index byte MSB to 1 for unused bytes so shuffle returns zero
+ __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+ __m128i result = _mm_shuffle_epi8(t0.m, idxx);
+ idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+ __m128i result2 = _mm_shuffle_epi8(t1.m, idxx);
+ result = _mm_xor_si128(result, result2);
+ idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+ result2 = _mm_shuffle_epi8(t2.m, idxx);
+ result = _mm_xor_si128(result, result2);
+ idxx = _mm_sub_epi8(idxx, _mm_set1_epi8(16));
+
+ result2 = _mm_shuffle_epi8(t3.m, idxx);
+ result = _mm_xor_si128(result, result2);
+
+ return vint4(result);
+#else
+ alignas(ASTCENC_VECALIGN) uint8_t table[64];
+ storea(t0, reinterpret_cast<int*>(table + 0));
+ storea(t1, reinterpret_cast<int*>(table + 16));
+ storea(t2, reinterpret_cast<int*>(table + 32));
+ storea(t3, reinterpret_cast<int*>(table + 48));
+
+ return vint4(table[idx.lane<0>()],
+ table[idx.lane<1>()],
+ table[idx.lane<2>()],
+ table[idx.lane<3>()]);
+#endif
+}
+
+/**
+ * @brief Return a vector of interleaved RGBA data.
+ *
+ * Input vectors have the value stored in the bottom 8 bits of each lane,
+ * with high bits set to zero.
+ *
+ * Output vector stores a single RGBA texel packed in each lane.
+ */
+ASTCENC_SIMD_INLINE vint4 interleave_rgba8(vint4 r, vint4 g, vint4 b, vint4 a)
+{
+// Workaround an XCode compiler internal fault; note this is slower than slli_epi32
+// so we should revert this when we get the opportunity
+#if defined(__APPLE__)
+ __m128i value = r.m;
+ value = _mm_add_epi32(value, _mm_bslli_si128(g.m, 1));
+ value = _mm_add_epi32(value, _mm_bslli_si128(b.m, 2));
+ value = _mm_add_epi32(value, _mm_bslli_si128(a.m, 3));
+ return vint4(value);
+#else
+ __m128i value = r.m;
+ value = _mm_add_epi32(value, _mm_slli_epi32(g.m, 8));
+ value = _mm_add_epi32(value, _mm_slli_epi32(b.m, 16));
+ value = _mm_add_epi32(value, _mm_slli_epi32(a.m, 24));
+ return vint4(value);
+#endif
+}
+
+/**
+ * @brief Store a vector, skipping masked lanes.
+ *
+ * All masked lanes must be at the end of the vector, after all non-masked lanes.
+ */
+ASTCENC_SIMD_INLINE void store_lanes_masked(int* base, vint4 data, vmask4 mask)
+{
+#if ASTCENC_AVX >= 2
+ _mm_maskstore_epi32(base, _mm_castps_si128(mask.m), data.m);
+#else
+ // Note - we cannot use _mm_maskmoveu_si128 as the underlying hardware doesn't guarantee
+ // fault suppression on masked lanes so we can get page faults at the end of an image.
+ if (mask.lane<3>() != 0.0f)
+ {
+ store(data, base);
+ }
+ else if (mask.lane<2>() != 0.0f)
+ {
+ base[0] = data.lane<0>();
+ base[1] = data.lane<1>();
+ base[2] = data.lane<2>();
+ }
+ else if (mask.lane<1>() != 0.0f)
+ {
+ base[0] = data.lane<0>();
+ base[1] = data.lane<1>();
+ }
+ else if (mask.lane<0>() != 0.0f)
+ {
+ base[0] = data.lane<0>();
+ }
+#endif
+}
+
+#if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41)
+
+#define ASTCENC_USE_NATIVE_DOT_PRODUCT 1
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning scalar.
+ */
+ASTCENC_SIMD_INLINE float dot_s(vfloat4 a, vfloat4 b)
+{
+ return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0xFF));
+}
+
+/**
+ * @brief Return the dot product for the full 4 lanes, returning vector.
+ */
+ASTCENC_SIMD_INLINE vfloat4 dot(vfloat4 a, vfloat4 b)
+{
+ return vfloat4(_mm_dp_ps(a.m, b.m, 0xFF));
+}
+
+/**
+ * @brief Return the dot product for the bottom 3 lanes, returning scalar.
+ */ +ASTCENC_SIMD_INLINE float dot3_s(vfloat4 a, vfloat4 b) +{ + return _mm_cvtss_f32(_mm_dp_ps(a.m, b.m, 0x77)); +} + +/** + * @brief Return the dot product for the bottom 3 lanes, returning vector. + */ +ASTCENC_SIMD_INLINE vfloat4 dot3(vfloat4 a, vfloat4 b) +{ + return vfloat4(_mm_dp_ps(a.m, b.m, 0x77)); +} + +#endif // #if defined(ASTCENC_NO_INVARIANCE) && (ASTCENC_SSE >= 41) + +#if ASTCENC_POPCNT >= 1 + +#define ASTCENC_USE_NATIVE_POPCOUNT 1 + +/** + * @brief Population bit count. + * + * @param v The value to population count. + * + * @return The number of 1 bits. + */ +ASTCENC_SIMD_INLINE int popcount(uint64_t v) +{ + return static_cast<int>(_mm_popcnt_u64(v)); +} + +#endif // ASTCENC_POPCNT >= 1 + +#endif // #ifndef ASTC_VECMATHLIB_SSE_4_H_INCLUDED diff --git a/thirdparty/astcenc/astcenc_weight_align.cpp b/thirdparty/astcenc/astcenc_weight_align.cpp new file mode 100644 index 0000000000..e40a318cf5 --- /dev/null +++ b/thirdparty/astcenc/astcenc_weight_align.cpp @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2022 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/** + * @brief Functions for angular-sum algorithm for weight alignment. + * + * This algorithm works as follows: + * - we compute a complex number P as (cos s*i, sin s*i) for each weight, + * where i is the input value and s is a scaling factor based on the spacing between the weights. + * - we then add together complex numbers for all the weights. + * - we then compute the length and angle of the resulting sum. + * + * This should produce the following results: + * - perfect alignment results in a vector whose length is equal to the sum of lengths of all inputs + * - even distribution results in a vector of length 0. + * - all samples identical results in perfect alignment for every scaling. + * + * For each scaling factor within a given set, we compute an alignment factor from 0 to 1. This + * should then result in some scalings standing out as having particularly good alignment factors; + * we can use this to produce a set of candidate scale/shift values for various quantization levels; + * we should then actually try them and see what happens. + */ + +#include "astcenc_internal.h" +#include "astcenc_vecmathlib.h" + +#include <stdio.h> +#include <cassert> +#include <cstring> + +static constexpr unsigned int ANGULAR_STEPS { 32 }; + +static_assert((ANGULAR_STEPS % ASTCENC_SIMD_WIDTH) == 0, + "ANGULAR_STEPS must be multiple of ASTCENC_SIMD_WIDTH"); + +static_assert(ANGULAR_STEPS >= 32, + "ANGULAR_STEPS must be at least max(steps_for_quant_level)"); + +// Store a reduced sin/cos table for 64 possible weight values; this causes +// slight quality loss compared to using sin() and cos() directly. Must be 2^N. 
+static constexpr unsigned int SINCOS_STEPS { 64 }; + +static const uint8_t steps_for_quant_level[12] { + 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32 +}; + +alignas(ASTCENC_VECALIGN) static float sin_table[SINCOS_STEPS][ANGULAR_STEPS]; +alignas(ASTCENC_VECALIGN) static float cos_table[SINCOS_STEPS][ANGULAR_STEPS]; + +#if defined(ASTCENC_DIAGNOSTICS) + static bool print_once { true }; +#endif + +/* See header for documentation. */ +void prepare_angular_tables() +{ + for (unsigned int i = 0; i < ANGULAR_STEPS; i++) + { + float angle_step = static_cast<float>(i + 1); + + for (unsigned int j = 0; j < SINCOS_STEPS; j++) + { + sin_table[j][i] = static_cast<float>(sinf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j))); + cos_table[j][i] = static_cast<float>(cosf((2.0f * astc::PI / (SINCOS_STEPS - 1.0f)) * angle_step * static_cast<float>(j))); + } + } +} + +/** + * @brief Compute the angular alignment factors and offsets. + * + * @param weight_count The number of (decimated) weights. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. + * @param max_angular_steps The maximum number of steps to be tested. + * @param[out] offsets The output angular offsets array. + */ +static void compute_angular_offsets( + unsigned int weight_count, + const float* dec_weight_ideal_value, + unsigned int max_angular_steps, + float* offsets +) { + promise(weight_count > 0); + promise(max_angular_steps > 0); + + alignas(ASTCENC_VECALIGN) int isamplev[BLOCK_MAX_WEIGHTS]; + + // Precompute isample; arrays are always allocated 64 elements long + for (unsigned int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH) + { + // Add 2^23 and interpreting bits extracts round-to-nearest int + vfloat sample = loada(dec_weight_ideal_value + i) * (SINCOS_STEPS - 1.0f) + vfloat(12582912.0f); + vint isample = float_as_int(sample) & vint((SINCOS_STEPS - 1)); + storea(isample, isamplev + i); + } + + // Arrays are multiple of SIMD width (ANGULAR_STEPS), safe to overshoot max + vfloat mult = vfloat(1.0f / (2.0f * astc::PI)); + + for (unsigned int i = 0; i < max_angular_steps; i += ASTCENC_SIMD_WIDTH) + { + vfloat anglesum_x = vfloat::zero(); + vfloat anglesum_y = vfloat::zero(); + + for (unsigned int j = 0; j < weight_count; j++) + { + int isample = isamplev[j]; + anglesum_x += loada(cos_table[isample] + i); + anglesum_y += loada(sin_table[isample] + i); + } + + vfloat angle = atan2(anglesum_y, anglesum_x); + vfloat ofs = angle * mult; + storea(ofs, offsets + i); + } +} + +/** + * @brief For a given step size compute the lowest and highest weight. + * + * Compute the lowest and highest weight that results from quantizing using the given stepsize and + * offset, and then compute the resulting error. The cut errors indicate the error that results from + * forcing samples that should have had one weight value one step up or down. + * + * @param weight_count The number of (decimated) weights. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. + * @param max_angular_steps The maximum number of steps to be tested. + * @param max_quant_steps The maximum quantization level to be tested. + * @param offsets The angular offsets array. + * @param[out] lowest_weight Per angular step, the lowest weight. + * @param[out] weight_span Per angular step, the span between lowest and highest weight. + * @param[out] error Per angular step, the error. + * @param[out] cut_low_weight_error Per angular step, the low weight cut error. 
+ * @param[out] cut_high_weight_error Per angular step, the high weight cut error. + */ +static void compute_lowest_and_highest_weight( + unsigned int weight_count, + const float* dec_weight_ideal_value, + unsigned int max_angular_steps, + unsigned int max_quant_steps, + const float* offsets, + float* lowest_weight, + int* weight_span, + float* error, + float* cut_low_weight_error, + float* cut_high_weight_error +) { + promise(weight_count > 0); + promise(max_angular_steps > 0); + + vfloat rcp_stepsize = vfloat::lane_id() + vfloat(1.0f); + + // Arrays are ANGULAR_STEPS long, so always safe to run full vectors + for (unsigned int sp = 0; sp < max_angular_steps; sp += ASTCENC_SIMD_WIDTH) + { + vfloat minidx(128.0f); + vfloat maxidx(-128.0f); + vfloat errval = vfloat::zero(); + vfloat cut_low_weight_err = vfloat::zero(); + vfloat cut_high_weight_err = vfloat::zero(); + vfloat offset = loada(offsets + sp); + + for (unsigned int j = 0; j < weight_count; j++) + { + vfloat sval = load1(dec_weight_ideal_value + j) * rcp_stepsize - offset; + vfloat svalrte = round(sval); + vfloat diff = sval - svalrte; + errval += diff * diff; + + // Reset tracker on min hit + vmask mask = svalrte < minidx; + minidx = select(minidx, svalrte, mask); + cut_low_weight_err = select(cut_low_weight_err, vfloat::zero(), mask); + + // Accumulate on min hit + mask = svalrte == minidx; + vfloat accum = cut_low_weight_err + vfloat(1.0f) - vfloat(2.0f) * diff; + cut_low_weight_err = select(cut_low_weight_err, accum, mask); + + // Reset tracker on max hit + mask = svalrte > maxidx; + maxidx = select(maxidx, svalrte, mask); + cut_high_weight_err = select(cut_high_weight_err, vfloat::zero(), mask); + + // Accumulate on max hit + mask = svalrte == maxidx; + accum = cut_high_weight_err + vfloat(1.0f) + vfloat(2.0f) * diff; + cut_high_weight_err = select(cut_high_weight_err, accum, mask); + } + + // Write out min weight and weight span; clamp span to a usable range + vint span = float_to_int(maxidx - minidx + vfloat(1)); + span = min(span, vint(max_quant_steps + 3)); + span = max(span, vint(2)); + storea(minidx, lowest_weight + sp); + storea(span, weight_span + sp); + + // The cut_(lowest/highest)_weight_error indicate the error that results from forcing + // samples that should have had the weight value one step (up/down). + vfloat ssize = 1.0f / rcp_stepsize; + vfloat errscale = ssize * ssize; + storea(errval * errscale, error + sp); + storea(cut_low_weight_err * errscale, cut_low_weight_error + sp); + storea(cut_high_weight_err * errscale, cut_high_weight_error + sp); + + rcp_stepsize = rcp_stepsize + vfloat(ASTCENC_SIMD_WIDTH); + } +} + +/** + * @brief The main function for the angular algorithm. + * + * @param weight_count The number of (decimated) weights. + * @param dec_weight_ideal_value The ideal decimated unquantized weight values. + * @param max_quant_level The maximum quantization level to be tested. + * @param[out] low_value Per angular step, the lowest weight value. + * @param[out] high_value Per angular step, the highest weight value. 
+ */ +static void compute_angular_endpoints_for_quant_levels( + unsigned int weight_count, + const float* dec_weight_ideal_value, + unsigned int max_quant_level, + float low_value[TUNE_MAX_ANGULAR_QUANT + 1], + float high_value[TUNE_MAX_ANGULAR_QUANT + 1] +) { + unsigned int max_quant_steps = steps_for_quant_level[max_quant_level]; + unsigned int max_angular_steps = steps_for_quant_level[max_quant_level]; + + alignas(ASTCENC_VECALIGN) float angular_offsets[ANGULAR_STEPS]; + + compute_angular_offsets(weight_count, dec_weight_ideal_value, + max_angular_steps, angular_offsets); + + alignas(ASTCENC_VECALIGN) float lowest_weight[ANGULAR_STEPS]; + alignas(ASTCENC_VECALIGN) int32_t weight_span[ANGULAR_STEPS]; + alignas(ASTCENC_VECALIGN) float error[ANGULAR_STEPS]; + alignas(ASTCENC_VECALIGN) float cut_low_weight_error[ANGULAR_STEPS]; + alignas(ASTCENC_VECALIGN) float cut_high_weight_error[ANGULAR_STEPS]; + + compute_lowest_and_highest_weight(weight_count, dec_weight_ideal_value, + max_angular_steps, max_quant_steps, + angular_offsets, lowest_weight, weight_span, error, + cut_low_weight_error, cut_high_weight_error); + + // For each quantization level, find the best error terms. Use packed vectors so data-dependent + // branches can become selects. This involves some integer to float casts, but the values are + // small enough so they never round the wrong way. + vfloat4 best_results[36]; + + // Initialize the array to some safe defaults + promise(max_quant_steps > 0); + for (unsigned int i = 0; i < (max_quant_steps + 4); i++) + { + // Lane<0> = Best error + // Lane<1> = Best scale; -1 indicates no solution found + // Lane<2> = Cut low weight + best_results[i] = vfloat4(ERROR_CALC_DEFAULT, -1.0f, 0.0f, 0.0f); + } + + promise(max_angular_steps > 0); + for (unsigned int i = 0; i < max_angular_steps; i++) + { + float i_flt = static_cast<float>(i); + + int idx_span = weight_span[i]; + + float error_cut_low = error[i] + cut_low_weight_error[i]; + float error_cut_high = error[i] + cut_high_weight_error[i]; + float error_cut_low_high = error[i] + cut_low_weight_error[i] + cut_high_weight_error[i]; + + // Check best error against record N + vfloat4 best_result = best_results[idx_span]; + vfloat4 new_result = vfloat4(error[i], i_flt, 0.0f, 0.0f); + vmask4 mask = vfloat4(best_result.lane<0>()) > vfloat4(error[i]); + best_results[idx_span] = select(best_result, new_result, mask); + + // Check best error against record N-1 with either cut low or cut high + best_result = best_results[idx_span - 1]; + + new_result = vfloat4(error_cut_low, i_flt, 1.0f, 0.0f); + mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low); + best_result = select(best_result, new_result, mask); + + new_result = vfloat4(error_cut_high, i_flt, 0.0f, 0.0f); + mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_high); + best_results[idx_span - 1] = select(best_result, new_result, mask); + + // Check best error against record N-2 with both cut low and high + best_result = best_results[idx_span - 2]; + new_result = vfloat4(error_cut_low_high, i_flt, 1.0f, 0.0f); + mask = vfloat4(best_result.lane<0>()) > vfloat4(error_cut_low_high); + best_results[idx_span - 2] = select(best_result, new_result, mask); + } + + for (unsigned int i = 0; i <= max_quant_level; i++) + { + unsigned int q = steps_for_quant_level[i]; + int bsi = static_cast<int>(best_results[q].lane<1>()); + + // Did we find anything? 
+#if defined(ASTCENC_DIAGNOSTICS) + if ((bsi < 0) && print_once) + { + print_once = false; + printf("INFO: Unable to find full encoding within search error limit.\n\n"); + } +#endif + + bsi = astc::max(0, bsi); + + float lwi = lowest_weight[bsi] + best_results[q].lane<2>(); + float hwi = lwi + static_cast<float>(q) - 1.0f; + + float stepsize = 1.0f / (1.0f + static_cast<float>(bsi)); + low_value[i] = (angular_offsets[bsi] + lwi) * stepsize; + high_value[i] = (angular_offsets[bsi] + hwi) * stepsize; + } +} + +/* See header for documentation. */ +void compute_angular_endpoints_1plane( + bool only_always, + const block_size_descriptor& bsd, + const float* dec_weight_ideal_value, + unsigned int max_weight_quant, + compression_working_buffers& tmpbuf +) { + float (&low_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1; + float (&high_value)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1; + + float (&low_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1; + float (&high_values)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1; + + unsigned int max_decimation_modes = only_always ? bsd.decimation_mode_count_always + : bsd.decimation_mode_count_selected; + promise(max_decimation_modes > 0); + for (unsigned int i = 0; i < max_decimation_modes; i++) + { + const decimation_mode& dm = bsd.decimation_modes[i]; + if (!dm.is_ref_1_plane(static_cast<quant_method>(max_weight_quant))) + { + continue; + } + + unsigned int weight_count = bsd.get_decimation_info(i).weight_count; + + unsigned int max_precision = dm.maxprec_1plane; + if (max_precision > TUNE_MAX_ANGULAR_QUANT) + { + max_precision = TUNE_MAX_ANGULAR_QUANT; + } + + if (max_precision > max_weight_quant) + { + max_precision = max_weight_quant; + } + + compute_angular_endpoints_for_quant_levels( + weight_count, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, + max_precision, low_values[i], high_values[i]); + } + + unsigned int max_block_modes = only_always ? bsd.block_mode_count_1plane_always + : bsd.block_mode_count_1plane_selected; + promise(max_block_modes > 0); + for (unsigned int i = 0; i < max_block_modes; i++) + { + const block_mode& bm = bsd.block_modes[i]; + assert(!bm.is_dual_plane); + + unsigned int quant_mode = bm.quant_mode; + unsigned int decim_mode = bm.decimation_mode; + + if (quant_mode <= TUNE_MAX_ANGULAR_QUANT) + { + low_value[i] = low_values[decim_mode][quant_mode]; + high_value[i] = high_values[decim_mode][quant_mode]; + } + else + { + low_value[i] = 0.0f; + high_value[i] = 1.0f; + } + } +} + +/* See header for documentation. 
*/ +void compute_angular_endpoints_2planes( + const block_size_descriptor& bsd, + const float* dec_weight_ideal_value, + unsigned int max_weight_quant, + compression_working_buffers& tmpbuf +) { + float (&low_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value1; + float (&high_value1)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value1; + float (&low_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_low_value2; + float (&high_value2)[WEIGHTS_MAX_BLOCK_MODES] = tmpbuf.weight_high_value2; + + float (&low_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values1; + float (&high_values1)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values1; + float (&low_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_low_values2; + float (&high_values2)[WEIGHTS_MAX_DECIMATION_MODES][TUNE_MAX_ANGULAR_QUANT + 1] = tmpbuf.weight_high_values2; + + promise(bsd.decimation_mode_count_selected > 0); + for (unsigned int i = 0; i < bsd.decimation_mode_count_selected; i++) + { + const decimation_mode& dm = bsd.decimation_modes[i]; + if (!dm.is_ref_2_plane(static_cast<quant_method>(max_weight_quant))) + { + continue; + } + + unsigned int weight_count = bsd.get_decimation_info(i).weight_count; + + unsigned int max_precision = dm.maxprec_2planes; + if (max_precision > TUNE_MAX_ANGULAR_QUANT) + { + max_precision = TUNE_MAX_ANGULAR_QUANT; + } + + if (max_precision > max_weight_quant) + { + max_precision = max_weight_quant; + } + + compute_angular_endpoints_for_quant_levels( + weight_count, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS, + max_precision, low_values1[i], high_values1[i]); + + compute_angular_endpoints_for_quant_levels( + weight_count, + dec_weight_ideal_value + i * BLOCK_MAX_WEIGHTS + WEIGHTS_PLANE2_OFFSET, + max_precision, low_values2[i], high_values2[i]); + } + + unsigned int start = bsd.block_mode_count_1plane_selected; + unsigned int end = bsd.block_mode_count_1plane_2plane_selected; + for (unsigned int i = start; i < end; i++) + { + const block_mode& bm = bsd.block_modes[i]; + unsigned int quant_mode = bm.quant_mode; + unsigned int decim_mode = bm.decimation_mode; + + if (quant_mode <= TUNE_MAX_ANGULAR_QUANT) + { + low_value1[i] = low_values1[decim_mode][quant_mode]; + high_value1[i] = high_values1[decim_mode][quant_mode]; + low_value2[i] = low_values2[decim_mode][quant_mode]; + high_value2[i] = high_values2[decim_mode][quant_mode]; + } + else + { + low_value1[i] = 0.0f; + high_value1[i] = 1.0f; + low_value2[i] = 0.0f; + high_value2[i] = 1.0f; + } + } +} + +#endif diff --git a/thirdparty/astcenc/astcenc_weight_quant_xfer_tables.cpp b/thirdparty/astcenc/astcenc_weight_quant_xfer_tables.cpp new file mode 100644 index 0000000000..8fdf73adc2 --- /dev/null +++ b/thirdparty/astcenc/astcenc_weight_quant_xfer_tables.cpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2021 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Data tables for quantization transfer. + */ + +#include "astcenc_internal.h" + +#define _ 0 // Using _ to indicate an entry that will not be used. + +const quant_and_transfer_table quant_and_xfer_tables[12] { + // QUANT2, range 0..1 + { + {0, 64}, + {0, 1}, + {0, 64}, + {0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, + _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, + 0x4000} + }, + // QUANT_3, range 0..2 + { + {0, 32, 64}, + {0, 1, 2}, + {0, 32, 64}, + {0x2000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, + _,_,0x4000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, + _,_,_,_,0x4020} + }, + // QUANT_4, range 0..3 + { + {0, 21, 43, 64}, + {0, 1, 2, 3}, + {0, 21, 43, 64}, + {0x1500,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2b00,_,_,_,_, + _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4015,_,_,_,_,_,_,_,_,_,_,_,_, + _,_,_,_,_,_,_,_,0x402b} + }, + //QUANT_5, range 0..4 + { + {0, 16, 32, 48, 64}, + {0, 1, 2, 3, 4}, + {0, 16, 32, 48, 64}, + {0x1000,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x2000,_,_,_,_,_,_,_,_,_, + _,_,_,_,_,_,0x3010,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,0x4020,_,_,_, + _,_,_,_,_,_,_,_,_,_,_,_,0x4030} + }, + // QUANT_6, range 0..5 + { + {0, 12, 25, 39, 52, 64}, + {0, 2, 4, 5, 3, 1}, + {0, 64, 12, 52, 25, 39}, + {0x0c00,_,_,_,_,_,_,_,_,_,_,_,0x1900,_,_,_,_,_,_,_,_,_,_,_,_, + 0x270c,_,_,_,_,_,_,_,_,_,_,_,_,_,0x3419,_,_,_,_,_,_,_,_,_,_, + _,_,0x4027,_,_,_,_,_,_,_,_,_,_,_,0x4034} + }, + // QUANT_8, range 0..7 + { + {0, 9, 18, 27, 37, 46, 55, 64}, + {0, 1, 2, 3, 4, 5, 6, 7}, + {0, 9, 18, 27, 37, 46, 55, 64}, + {0x0900,_,_,_,_,_,_,_,_,0x1200,_,_,_,_,_,_,_,_,0x1b09,_,_, + _,_,_,_,_,_,0x2512,_,_,_,_,_,_,_,_,_,0x2e1b,_,_,_,_,_,_,_,_, + 0x3725,_,_,_,_,_,_,_,_,0x402e,_,_,_,_,_,_,_,_,0x4037} + }, + // QUANT_10, range 0..9 + { + {0, 7, 14, 21, 28, 36, 43, 50, 57, 64}, + {0, 2, 4, 6, 8, 9, 7, 5, 3, 1}, + {0, 64, 7, 57, 14, 50, 21, 43, 28, 36}, + {0x0700,_,_,_,_,_,_,0x0e00,_,_,_,_,_,_,0x1507,_,_,_,_,_,_, + 0x1c0e,_,_,_,_,_,_,0x2415,_,_,_,_,_,_,_,0x2b1c,_,_,_,_,_, + _,0x3224,_,_,_,_,_,_,0x392b,_,_,_,_,_,_,0x4032,_,_,_,_,_, + _,0x4039} + }, + // QUANT_12, range 0..11 + { + {0, 5, 11, 17, 23, 28, 36, 41, 47, 53, 59, 64}, + {0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1}, + {0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36}, + {0x0500,_,_,_,_,0x0b00,_,_,_,_,_,0x1105,_,_,_,_,_, + 0x170b,_,_,_,_,_,0x1c11,_,_,_,_,0x2417,_,_,_,_,_,_,_, + 0x291c,_,_,_,_,0x2f24,_,_,_,_,_,0x3529,_,_,_,_,_, + 0x3b2f,_,_,_,_,_,0x4035,_,_,_,_,0x403b} + }, + // QUANT_16, range 0..15 + { + {0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64}, + {0x0400,_,_,_,0x0800,_,_,_,0x0c04,_,_,_,0x1108,_,_,_,_, + 0x150c,_,_,_,0x1911,_,_,_,0x1d15,_,_,_,0x2319,_,_,_,_, + _,0x271d,_,_,_,0x2b23,_,_,_,0x2f27,_,_,_,0x342b,_,_,_, + _,0x382f,_,_,_,0x3c34,_,_,_,0x4038,_,_,_,0x403c} + }, + // QUANT_20, range 0..19 + { + {0, 3, 6, 9, 13, 16, 19, 23, 26, 29, 35, 38, 41, 45, 48, 51, 55, 58, 61, 64}, + {0, 4, 8, 12, 16, 2, 6, 10, 14, 18, 19, 15, 11, 7, 3, 17, 13, 9, 5, 1}, + {0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35}, + {0x0300,_,_,0x0600,_,_,0x0903,_,_,0x0d06,_,_,_, + 0x1009,_,_,0x130d,_,_,0x1710,_,_,_,0x1a13,_,_, + 0x1d17,_,_,0x231a,_,_,_,_,_,0x261d,_,_,0x2923,_,_, 
+ 0x2d26,_,_,_,0x3029,_,_,0x332d,_,_,0x3730,_,_,_, + 0x3a33,_,_,0x3d37,_,_,0x403a,_,_,0x403d} + }, + // QUANT_24, range 0..23 + { + {0, 2, 5, 8, 11, 13, 16, 19, 22, 24, 27, 30, 34, 37, 40, 42, 45, 48, 51, 53, 56, 59, 62, 64}, + {0, 8, 16, 2, 10, 18, 4, 12, 20, 6, 14, 22, 23, 15, 7, 21, 13, 5, 19, 11, 3, 17, 9, 1}, + {0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34}, + {0x0200,_,0x0500,_,_,0x0802,_,_,0x0b05,_,_,0x0d08, + _,0x100b,_,_,0x130d,_,_,0x1610,_,_,0x1813,_, + 0x1b16,_,_,0x1e18,_,_,0x221b,_,_,_,0x251e,_,_, + 0x2822,_,_,0x2a25,_,0x2d28,_,_,0x302a,_,_,0x332d, + _,_,0x3530,_,0x3833,_,_,0x3b35,_,_,0x3e38,_,_, + 0x403b,_,0x403e} + }, + // QUANT_32, range 0..31 + { + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}, + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64}, + {0x0200,_,0x0400,_,0x0602,_,0x0804,_,0x0a06,_, + 0x0c08,_,0x0e0a,_,0x100c,_,0x120e,_,0x1410,_, + 0x1612,_,0x1814,_,0x1a16,_,0x1c18,_,0x1e1a,_, + 0x221c,_,_,_,0x241e,_,0x2622,_,0x2824,_,0x2a26,_, + 0x2c28,_,0x2e2a,_,0x302c,_,0x322e,_,0x3430,_, + 0x3632,_,0x3834,_,0x3a36,_,0x3c38,_,0x3e3a,_, + 0x403c,_,0x403e} + } +}; diff --git a/thirdparty/astcenc/patches/fix-build-no-ssse3.patch b/thirdparty/astcenc/patches/fix-build-no-ssse3.patch new file mode 100644 index 0000000000..9da4f3e1f3 --- /dev/null +++ b/thirdparty/astcenc/patches/fix-build-no-ssse3.patch @@ -0,0 +1,81 @@ +From 02c22d3df501dc284ba732fa82a6c408c57b3237 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?R=C3=A9mi=20Verschelde?= <rverschelde@gmail.com> +Date: Thu, 19 Jan 2023 23:30:13 +0100 +Subject: [PATCH] mathlib: Remove incomplete support for SSE3 which assumed + SSSE3 + +`_mm_shuffle_epi8` requires SSSE3 so the check on `ASTCENC_SSE >= 30` is +too lax and would fail if `__SSE3__` is supported, but not `__SSSE3__`. + +The only supported configurations are SSE2, SSE4.1, and AVX2, so as +discussed in #393 we drop the SSE3 checks and require SSE4.1 instead. 
+---
+ Source/astcenc_mathlib.h | 2 --
+ Source/astcenc_vecmathlib_sse_4.h | 10 +++++-----
+ 2 files changed, 5 insertions(+), 7 deletions(-)
+
+diff --git a/Source/astcenc_mathlib.h b/Source/astcenc_mathlib.h
+index 67e989e..0540c4f 100644
+--- a/Source/astcenc_mathlib.h
++++ b/Source/astcenc_mathlib.h
+@@ -48,8 +48,6 @@
+ #define ASTCENC_SSE 42
+ #elif defined(__SSE4_1__)
+ #define ASTCENC_SSE 41
+- #elif defined(__SSE3__)
+- #define ASTCENC_SSE 30
+ #elif defined(__SSE2__)
+ #define ASTCENC_SSE 20
+ #else
+diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h
+index 76fe577..26dcc4a 100644
+--- a/Source/astcenc_vecmathlib_sse_4.h
++++ b/Source/astcenc_vecmathlib_sse_4.h
+@@ -1046,7 +1046,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4& t0p)
+ */
+ ASTCENC_SIMD_INLINE void vtable_prepare(vint4 t0, vint4 t1, vint4& t0p, vint4& t1p)
+ {
+-#if ASTCENC_SSE >= 30
++#if ASTCENC_SSE >= 41
+ t0p = t0;
+ t1p = t0 ^ t1;
+ #else
+@@ -1062,7 +1062,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
+ vint4 t0, vint4 t1, vint4 t2, vint4 t3,
+ vint4& t0p, vint4& t1p, vint4& t2p, vint4& t3p)
+ {
+-#if ASTCENC_SSE >= 30
++#if ASTCENC_SSE >= 41
+ t0p = t0;
+ t1p = t0 ^ t1;
+ t2p = t1 ^ t2;
+@@ -1080,7 +1080,7 @@ ASTCENC_SIMD_INLINE void vtable_prepare(
+ */
+ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
+ {
+-#if ASTCENC_SSE >= 30
++#if ASTCENC_SSE >= 41
+ // Set index byte MSB to 1 for unused bytes so shuffle returns zero
+ __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+@@ -1102,7 +1102,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 idx)
+ */
+ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
+ {
+-#if ASTCENC_SSE >= 30
++#if ASTCENC_SSE >= 41
+ // Set index byte MSB to 1 for unused bytes so shuffle returns zero
+ __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+@@ -1130,7 +1130,7 @@ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 idx)
+ */
+ ASTCENC_SIMD_INLINE vint4 vtable_8bt_32bi(vint4 t0, vint4 t1, vint4 t2, vint4 t3, vint4 idx)
+ {
+-#if ASTCENC_SSE >= 30
++#if ASTCENC_SSE >= 41
+ // Set index byte MSB to 1 for unused bytes so shuffle returns zero
+ __m128i idxx = _mm_or_si128(idx.m, _mm_set1_epi32(static_cast<int>(0xFFFFFF00)));
+
+--
+2.39.1
+