26 files changed, 13873 insertions, 3289 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index d011832210..3962a83597 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -357,6 +357,16 @@ File extracted from upstream release tarball:
 - Added 2 files `godot_core_mbedtls_platform.{c,h}` providing configuration
   for light bundling with core.
 
+## meshoptimizer
+
+- Upstream: https://github.com/zeux/meshoptimizer
+- Version: 0.15 (2020)
+- License: MIT
+
+Files extracted from upstream release tarball:
+
+- All files in `src/` and `LICENSE.md` go to `thirdparty/meshoptimizer`
+
 
 ## miniupnpc
 
@@ -413,7 +423,7 @@ Collection of single-file libraries used in Godot components.
   * License: Apache 2.0
 - `open-simplex-noise.{c,h}`
   * Upstream: https://github.com/smcameron/open-simplex-noise-in-c
-  * Version: git (0fef0dbedd76f767da7e3c894822729d0f07e54d, 2020) + custom changes
+  * Version: git (826f1dd1724e6fb3ff45f58e48c0fbae864c3403, 2020) + custom changes
   * License: Unlicense
 - `pcg.{cpp,h}`
   * Upstream: http://www.pcg-random.org
@@ -671,7 +681,7 @@ File extracted from upstream release tarball:
 ## xatlas
 
 - Upstream: https://github.com/jpcy/xatlas
-- Version: git (470576d3516f7e6d8b4554e7c941194a935969fd, 2020)
+- Version: git (5571fc7ef0d06832947c0a935ccdcf083f7a9264, 2020)
 - License: MIT
 
 Files extracted from upstream source:
diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md
new file mode 100644
index 0000000000..4fcd766d22
--- /dev/null
+++ b/thirdparty/meshoptimizer/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016-2020 Arseny Kapoulkine
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/thirdparty/meshoptimizer/allocator.cpp b/thirdparty/meshoptimizer/allocator.cpp
new file mode 100644
index 0000000000..da7cc540b2
--- /dev/null
+++ b/thirdparty/meshoptimizer/allocator.cpp
@@ -0,0 +1,8 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*))
+{ // stores the caller-provided callbacks in the static members of meshopt_Allocator::Storage
+	meshopt_Allocator::Storage::allocate = allocate;
+	meshopt_Allocator::Storage::deallocate = deallocate;
+}
diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp
new file mode 100644
index 0000000000..f7d88c5136
--- /dev/null
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@@ -0,0 +1,351 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
+// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
+// Jack Ritter. An Efficient Bounding Sphere. 1990
+namespace meshopt
+{
+
+static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+{ // computes a sphere enclosing all `count` points; result[0..2] receives the center and result[3] the radius
+	assert(count > 0);
+
+	// find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
+	size_t pmin[3] = {0, 0, 0}; // per-axis index of the point with the smallest coordinate
+	size_t pmax[3] = {0, 0, 0}; // per-axis index of the point with the largest coordinate
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* p = points[i];
+
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
+			pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+		}
+	}
+
+	// find the pair of points with largest distance
+	float paxisd2 = 0; // squared length of the longest min/max segment found so far
+	int paxis = 0; // axis whose min/max pair produced that segment
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		const float* p1 = points[pmin[axis]];
+		const float* p2 = points[pmax[axis]];
+
+		float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+
+		if (d2 > paxisd2)
+		{
+			paxisd2 = d2;
+			paxis = axis;
+		}
+	}
+
+	// use the longest segment as the initial sphere diameter
+	const float* p1 = points[pmin[paxis]];
+	const float* p2 = points[pmax[paxis]];
+
+	float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
+	float radius = sqrtf(paxisd2) / 2;
+
+	// iteratively adjust the sphere up until all points fit
+	for (size_t i = 0; i < count; ++i)
+	{
+		const float* p = points[i];
+		float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+
+		if (d2 > radius * radius) // point lies outside the current sphere
+		{
+			float d = sqrtf(d2);
+			assert(d > 0); // d2 exceeds radius^2 >= 0 in this branch
+
+			float k = 0.5f + (radius / d) / 2; // weight of the old center; 1 - k moves the center towards p by (d - radius) / 2
+
+			center[0] = center[0] * k + p[0] * (1 - k);
+			center[1] = center[1] * k + p[1] * (1 - k);
+			center[2] = center[2] * k + p[2] * (1 - k);
+			radius = (radius + d) / 2; // new diameter spans from the far side of the old sphere to p
+		}
+	}
+
+	result[0] = center[0];
+	result[1] = center[1];
+	result[2] = center[2];
+	result[3] = radius;
+}
+
+} // namespace meshopt
+
+size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
+{ // returns an upper bound on the number of meshlets meshopt_buildMeshlets emits for the same limits
+	assert(index_count % 3 == 0);
+	assert(max_vertices >= 3);
+	assert(max_triangles >= 1);
+
+	// meshlet construction is limited by max vertices and max triangles per meshlet
+	// the worst case is that the input is an unindexed stream since this equally stresses both limits
+	// note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
+	size_t max_vertices_conservative = max_vertices - 2;
+	size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative; // ceil(index_count / max_vertices_conservative)
+	size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles; // ceil(triangle count / max_triangles)
+
+	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles; // the larger of the two limits
+}
+
+size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{ // packs triangles into meshlets in input order, writes them to destination and returns how many were written
+	assert(index_count % 3 == 0);
+	assert(max_vertices >= 3);
+	assert(max_triangles >= 1);
+
+	meshopt_Allocator allocator;
+
+	meshopt_Meshlet meshlet; // meshlet currently being filled
+	memset(&meshlet, 0, sizeof(meshlet));
+
+	assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0])); // limits must fit the fixed-size arrays of meshopt_Meshlet
+	assert(max_triangles <= sizeof(meshlet.indices) / 3);
+
+	// index of the vertex in the meshlet, 0xff if the vertex isn't used
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, -1, vertex_count); // -1 sets every byte to 0xff
+
+	size_t offset = 0; // number of meshlets written to destination so far
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		unsigned char& av = used[a]; // references: the reset of `used` below is visible through av/bv/cv
+		unsigned char& bv = used[b];
+		unsigned char& cv = used[c];
+
+		unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); // vertices this triangle would add to the current meshlet
+
+		if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+		{
+			destination[offset++] = meshlet; // triangle doesn't fit: emit the current meshlet and start a new one
+
+			for (size_t j = 0; j < meshlet.vertex_count; ++j)
+				used[meshlet.vertices[j]] = 0xff; // vertices of the emitted meshlet become unused again
+
+			memset(&meshlet, 0, sizeof(meshlet));
+		}
+
+		if (av == 0xff)
+		{
+			av = meshlet.vertex_count; // meshlet-local index assigned to vertex a
+			meshlet.vertices[meshlet.vertex_count++] = a;
+		}
+
+		if (bv == 0xff)
+		{
+			bv = meshlet.vertex_count;
+			meshlet.vertices[meshlet.vertex_count++] = b;
+		}
+
+		if (cv == 0xff)
+		{
+			cv = meshlet.vertex_count;
+			meshlet.vertices[meshlet.vertex_count++] = c;
+		}
+
+		meshlet.indices[meshlet.triangle_count][0] = av; // triangles are stored using meshlet-local indices
+		meshlet.indices[meshlet.triangle_count][1] = bv;
+		meshlet.indices[meshlet.triangle_count][2] = cv;
+		meshlet.triangle_count++;
+	}
+
+	if (meshlet.triangle_count)
+		destination[offset++] = meshlet; // emit the last, partially filled meshlet
+
+	assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+
+	return offset;
+}
+
+meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // computes a bounding sphere and a normal cone (float and 8-bit quantized) for a cluster of at most 256 triangles
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(index_count / 3 <= 256); // the local arrays below hold at most 256 triangles
+
+	(void)vertex_count; // only referenced inside assert below
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float); // stride converted from bytes to floats
+
+	// compute triangle normals and gather triangle corners
+	float normals[256][3];
+	float corners[256][3][3];
+	size_t triangles = 0; // number of non-degenerate triangles recorded
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* p0 = vertex_positions + vertex_stride_float * a;
+		const float* p1 = vertex_positions + vertex_stride_float * b;
+		const float* p2 = vertex_positions + vertex_stride_float * c;
+
+		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+		float normalx = p10[1] * p20[2] - p10[2] * p20[1]; // cross(p10, p20)
+		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+		float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); // length of the cross product
+
+		// no need to include degenerate triangles - they will be invisible anyway
+		if (area == 0.f)
+			continue;
+
+		// record triangle normals & corners for future use; normal and corner 0 define a plane equation
+		normals[triangles][0] = normalx / area;
+		normals[triangles][1] = normaly / area;
+		normals[triangles][2] = normalz / area;
+		memcpy(corners[triangles][0], p0, 3 * sizeof(float));
+		memcpy(corners[triangles][1], p1, 3 * sizeof(float));
+		memcpy(corners[triangles][2], p2, 3 * sizeof(float));
+		triangles++;
+	}
+
+	meshopt_Bounds bounds = {};
+
+	// degenerate cluster, no valid triangles => trivial reject (cone data is 0)
+	if (triangles == 0)
+		return bounds;
+
+	// compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
+	float psphere[4] = {};
+	computeBoundingSphere(psphere, corners[0], triangles * 3); // corners[0] views the corner array as a flat list of triangles * 3 points
+
+	float center[3] = {psphere[0], psphere[1], psphere[2]};
+
+	// treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
+	float nsphere[4] = {};
+	computeBoundingSphere(nsphere, normals, triangles);
+
+	float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
+	float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
+	float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength; // avoids dividing by zero; a zero axis stays zero
+
+	axis[0] *= invaxislength;
+	axis[1] *= invaxislength;
+	axis[2] *= invaxislength;
+
+	// compute a tight cone around all normals, mindp = cos(angle/2)
+	float mindp = 1.f;
+
+	for (size_t i = 0; i < triangles; ++i)
+	{
+		float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];
+
+		mindp = (dp < mindp) ? dp : mindp;
+	}
+
+	// fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
+	bounds.center[0] = center[0];
+	bounds.center[1] = center[1];
+	bounds.center[2] = center[2];
+	bounds.radius = psphere[3];
+
+	// degenerate cluster, normal cone is larger than a hemisphere => trivial accept
+	// note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
+	// we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
+	if (mindp <= 0.1f)
+	{
+		bounds.cone_cutoff = 1; // cone apex/axis keep their zero initialization on this path
+		bounds.cone_cutoff_s8 = 127;
+		return bounds;
+	}
+
+	float maxt = 0; // largest distance along -axis required by any triangle plane
+
+	// we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
+	for (size_t i = 0; i < triangles; ++i)
+	{
+		// dot(center-t*axis-corner, trinormal) = 0
+		// dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
+		float cx = center[0] - corners[i][0][0];
+		float cy = center[1] - corners[i][0][1];
+		float cz = center[2] - corners[i][0][2];
+
+		float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
+		float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];
+
+		// dn should be larger than mindp cutoff above
+		assert(dn > 0.f);
+		float t = dc / dn;
+
+		maxt = (t > maxt) ? t : maxt;
+	}
+
+	// cone apex should be in the negative half-space of all cluster triangles by construction
+	bounds.cone_apex[0] = center[0] - axis[0] * maxt;
+	bounds.cone_apex[1] = center[1] - axis[1] * maxt;
+	bounds.cone_apex[2] = center[2] - axis[2] * maxt;
+
+	// note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
+	bounds.cone_axis[0] = axis[0];
+	bounds.cone_axis[1] = axis[1];
+	bounds.cone_axis[2] = axis[2];
+
+	// cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
+	// which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
+	bounds.cone_cutoff = sqrtf(1 - mindp * mindp);
+
+	// quantize axis & cutoff to 8-bit SNORM format
+	bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
+	bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
+	bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));
+
+	// for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
+	float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
+	float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
+	float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);
+
+	// note that we need to round this up instead of rounding to nearest, hence +1
+	int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);
+
+	bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8); // clamp to the largest signed 8-bit value
+
+	return bounds;
+}
+
+meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // expands the meshlet's local indices into vertex indices and returns meshopt_computeClusterBounds for them
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])]; // one slot per index the meshlet can store
+
+	for (size_t i = 0; i < meshlet->triangle_count; ++i)
+	{
+		unsigned int a = meshlet->vertices[meshlet->indices[i][0]]; // meshlet-local index -> index into vertex_positions
+		unsigned int b = meshlet->vertices[meshlet->indices[i][1]];
+		unsigned int c = meshlet->vertices[meshlet->indices[i][2]];
+
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		indices[i * 3 + 0] = a;
+		indices[i * 3 + 1] = b;
+		indices[i * 3 + 2] = c;
+	}
+
+	return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
+}
diff --git a/thirdparty/meshoptimizer/indexcodec.cpp b/thirdparty/meshoptimizer/indexcodec.cpp
new file mode 100644
index 0000000000..eeb541e5be
--- /dev/null
+++ b/thirdparty/meshoptimizer/indexcodec.cpp
@@ -0,0 +1,752 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+// This work is based on:
+// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
+// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
+namespace meshopt
+{
+
+const unsigned char kIndexHeader = 0xe0; // high nibble of the first stream byte; the low nibble carries the version
+const unsigned char kSequenceHeader = 0xd0; // not referenced by the triangle codec below; presumably the header of a separate index-sequence format - TODO confirm
+
+static int gEncodeIndexVersion = 0; // stream version used by the encoder; set through meshopt_encodeIndexVersion
+
+typedef unsigned int VertexFifo[16]; // 16 recent vertex indices, addressed modulo 16
+typedef unsigned int EdgeFifo[16][2]; // 16 recent edges stored as ordered index pairs, addressed modulo 16
+
+static const unsigned int kTriangleIndexOrder[3][3] = { // row r lists the source corner positions of a triangle rotated by r
+    {0, 1, 2},
+    {1, 2, 0},
+    {2, 0, 1},
+};
+
+static const unsigned char kCodeAuxEncodingTable[16] = { // (feb << 4) | fec pairs that can be referenced by a 4-bit table index
+    0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69,
+    0, 0, // last two entries aren't used for encoding
+};
+
+static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next)
+{ // returns the kTriangleIndexOrder row that puts `next` first when it equals b or c, and 0 otherwise
+	(void)a; // a is unused: rotation 0 covers both "a == next" and "no corner matches"
+
+	return (b == next) ? 1 : (c == next) ? 2 : 0;
+}
+
+static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
+{ // searches the edge fifo, most recent entry first, for edge ab, bc or ca; returns (age << 2) | edge number, or -1 if none is found
+	for (int i = 0; i < 16; ++i)
+	{
+		size_t index = (offset - 1 - i) & 15; // offset is the next write slot, so offset - 1 is the newest entry; & 15 handles unsigned wrap-around
+
+		unsigned int e0 = fifo[index][0];
+		unsigned int e1 = fifo[index][1];
+
+		if (e0 == a && e1 == b)
+			return (i << 2) | 0;
+		if (e0 == b && e1 == c)
+			return (i << 2) | 1;
+		if (e0 == c && e1 == a)
+			return (i << 2) | 2;
+	}
+
+	return -1;
+}
+
+static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset)
+{ // stores edge (a, b) at the write slot and advances offset, wrapping after 16 entries
+	fifo[offset][0] = a;
+	fifo[offset][1] = b;
+	offset = (offset + 1) & 15;
+}
+
+static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset)
+{ // returns how many entries back (0 = most recent) vertex v sits in the fifo, or -1 if it is not present
+	for (int i = 0; i < 16; ++i)
+	{
+		size_t index = (offset - 1 - i) & 15; // walk backwards from the newest entry; & 15 handles unsigned wrap-around
+
+		if (fifo[index] == v)
+			return i;
+	}
+
+	return -1;
+}
+
+static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1)
+{ // writes v at the write slot and advances offset by cond; cond == 0 leaves the slot to be overwritten by the next push
+	fifo[offset] = v;
+	offset = (offset + cond) & 15;
+}
+
+static void encodeVByte(unsigned char*& data, unsigned int v)
+{ // appends v to data as a variable-length integer, lowest 7 bits first, and advances data past the written bytes
+	// encode 32-bit value in up to 5 7-bit groups
+	do
+	{
+		*data++ = (v & 127) | (v > 127 ? 128 : 0); // bit 7 is set when more groups follow
+		v >>= 7;
+	} while (v);
+}
+
+static unsigned int decodeVByte(const unsigned char*& data)
+{ // decodes a value written by encodeVByte (7 payload bits per byte, low bits first, bit 7 = continuation) and advances data past it
+	unsigned char lead = *data++;
+
+	// fast path: single byte
+	if (lead < 128)
+		return lead;
+
+	// slow path: up to 4 extra bytes
+	// note that this loop always terminates, which is important for malformed data
+	unsigned int result = lead & 127;
+	unsigned int shift = 7;
+
+	for (int i = 0; i < 4; ++i)
+	{
+		unsigned char group = *data++;
+		result |= unsigned(group & 127) << shift; // shift as unsigned: the promoted int overflows (undefined before C++20) when a malformed 5th byte carries more than 4 payload bits
+		shift += 7;
+
+		if (group < 128)
+			break;
+	}
+
+	return result;
+}
+
+static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last)
+{ // writes index as a delta from last, using encodeVByte
+	unsigned int d = index - last; // wrapping difference, interpreted as signed below
+	unsigned int v = (d << 1) ^ (int(d) >> 31); // zigzag: the sign moves to bit 0 so small negative deltas stay small
+
+	encodeVByte(data, v);
+}
+
+static unsigned int decodeIndex(const unsigned char*& data, unsigned int last)
+{ // reads a delta written by encodeIndex and returns the index reconstructed relative to last
+	unsigned int v = decodeVByte(data);
+	unsigned int d = (v >> 1) ^ -int(v & 1); // undo zigzag: bit 0 selects whether the remaining bits are inverted
+
+	return last + d;
+}
+
+static int getCodeAuxIndex(unsigned char v, const unsigned char* table)
+{ // returns the first position of v among the 16 table entries, or -1 if v is not in the table
+	for (int i = 0; i < 16; ++i)
+		if (table[i] == v)
+			return i;
+
+	return -1;
+}
+
+static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c)
+{ // stores indices a, b, c at element positions offset..offset+2 of destination, using the element width given by index_size
+	if (index_size == 2)
+	{
+		static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a); // 16-bit output: values are truncated by the cast
+		static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
+		static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
+	}
+	else
+	{
+		static_cast<unsigned int*>(destination)[offset + 0] = a; // any other index_size is written as unsigned int
+		static_cast<unsigned int*>(destination)[offset + 1] = b;
+		static_cast<unsigned int*>(destination)[offset + 2] = c;
+	}
+}
+
+#if TRACE
+static size_t sortTop16(unsigned char dest[16], size_t stats[256])
+{ // fills dest with up to 16 byte values ordered by descending stats[] count and returns how many entries were written
+	size_t destsize = 0;
+
+	for (size_t i = 0; i < 256; ++i)
+	{
+		size_t j = 0;
+		for (; j < destsize; ++j)
+		{
+			if (stats[i] >= stats[dest[j]]) // first entry that i ties or beats: insert i in front of it
+			{
+				if (destsize < 16)
+					destsize++;
+
+				memmove(&dest[j + 1], &dest[j], destsize - 1 - j); // shift the tail right by one; with 16 entries the last one drops out
+				dest[j] = (unsigned char)i;
+				break;
+			}
+		}
+
+		if (j == destsize && destsize < 16) // no insertion happened and there is still room: append
+		{
+			dest[destsize] = (unsigned char)i;
+			destsize++;
+		}
+	}
+
+	return destsize;
+}
+#endif
+
+} // namespace meshopt
+
+size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{ // encodes a triangle list into buffer; returns the number of bytes written, or 0 if buffer_size is too small
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+#if TRACE
+	size_t codestats[256] = {};
+	size_t codeauxstats[256] = {};
+#endif
+
+	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+	if (buffer_size < 1 + index_count / 3 + 16)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kIndexHeader | version);
+
+	EdgeFifo edgefifo;
+	memset(edgefifo, -1, sizeof(edgefifo)); // all-ones entries act as "empty"
+
+	VertexFifo vertexfifo;
+	memset(vertexfifo, -1, sizeof(vertexfifo));
+
+	size_t edgefifooffset = 0;
+	size_t vertexfifooffset = 0;
+
+	unsigned int next = 0; // next sequential vertex index that can be encoded implicitly
+	unsigned int last = 0; // last explicitly encoded index; base for delta encoding
+
+	unsigned char* code = buffer + 1; // one code byte per triangle
+	unsigned char* data = code + index_count / 3; // variable-length extra data follows the code bytes
+	unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+	int fecmax = version >= 1 ? 13 : 15; // version 1 reserves codes 13 and 14 for last-1 / last+1
+
+	// use static encoding table; it's possible to pack the result and then build an optimal table and repack
+	// for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
+	const unsigned char* codeaux_table = kCodeAuxEncodingTable;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		// make sure we have enough space to write a triangle
+		// each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index
+		// after this we can be sure we can write without extra bounds checks
+		if (data > data_safe_end)
+			return 0;
+
+		int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset);
+
+		if (fer >= 0 && (fer >> 2) < 15) // edge position 15 can't be used: a high nibble of 0xf marks the non-edge path below
+		{
+			const unsigned int* order = kTriangleIndexOrder[fer & 3];
+
+			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+			// encode edge index and vertex fifo index, next or free index
+			int fe = fer >> 2;
+			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next) ? (next++, 0) : 15; // 0 = next sequential vertex, 1..fecmax-1 = vertex fifo reference, 15 = explicitly encoded index
+
+			if (fec == 15 && version >= 1)
+			{
+				// encode last-1 and last+1 to optimize strip-like sequences
+				if (c + 1 == last)
+					fec = 13, last = c;
+				if (c == last + 1)
+					fec = 14, last = c;
+			}
+
+			*code++ = (unsigned char)((fe << 4) | fec);
+
+#if TRACE
+			codestats[code[-1]]++;
+#endif
+
+			// note that we need to update the last index since free indices are delta-encoded
+			if (fec == 15)
+				encodeIndex(data, c, last), last = c;
+
+			// we only need to push third vertex since first two are likely already in the vertex fifo
+			if (fec == 0 || fec >= fecmax)
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// we only need to push two new edges to edge fifo since the third one is already there
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+		}
+		else
+		{
+			int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next);
+			const unsigned int* order = kTriangleIndexOrder[rotation];
+
+			unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+			// if a/b/c are 0/1/2, we emit a reset code
+			bool reset = false;
+
+			if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1)
+			{
+				reset = true;
+				next = 0;
+
+				// reset vertex fifo to make sure we don't accidentally reference vertices from that in the future
+				// this makes sure next continues to get incremented instead of being stuck
+				memset(vertexfifo, -1, sizeof(vertexfifo));
+			}
+
+			int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
+			int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+			int fea = (a == next) ? (next++, 0) : 15;
+			int feb = (fb >= 0 && fb < 14) ? (fb + 1) : (b == next) ? (next++, 0) : 15; // 1..14 = fifo position + 1, 0 = next sequential vertex, 15 = explicitly encoded index
+			int fec = (fc >= 0 && fc < 14) ? (fc + 1) : (c == next) ? (next++, 0) : 15;
+
+			// we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise
+			unsigned char codeaux = (unsigned char)((feb << 4) | fec);
+			int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);
+
+			// <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
+			if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset)
+			{
+				*code++ = (unsigned char)((15 << 4) | codeauxindex);
+			}
+			else
+			{
+				*code++ = (unsigned char)((15 << 4) | 14 | fea); // 0xfe when fea == 0, 0xff when fea == 15
+				*data++ = codeaux;
+			}
+
+#if TRACE
+			codestats[code[-1]]++;
+			codeauxstats[codeaux]++;
+#endif
+
+			// note that we need to update the last index since free indices are delta-encoded
+			if (fea == 15)
+				encodeIndex(data, a, last), last = a;
+
+			if (feb == 15)
+				encodeIndex(data, b, last), last = b;
+
+			if (fec == 15)
+				encodeIndex(data, c, last), last = c;
+
+			// only push vertices that weren't already in fifo
+			if (fea == 0 || fea == 15)
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+
+			if (feb == 0 || feb == 15)
+				pushVertexFifo(vertexfifo, b, vertexfifooffset);
+
+			if (fec == 0 || fec == 15)
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+			// all three edges aren't in the fifo; pushing all of them is important so that we can match them for later triangles
+			pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+			pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+			pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+		}
+	}
+
+	// make sure we have enough space to write codeaux table
+	if (data > data_safe_end)
+		return 0;
+
+	// add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding
+	// we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data
+	// this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input
+	for (size_t i = 0; i < 16; ++i)
+	{
+		// decoder assumes that table entries never refer to separately encoded indices
+		assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf);
+
+		*data++ = codeaux_table[i];
+	}
+
+	// since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference
+	assert(codeaux_table[0] == 0);
+
+	assert(data >= buffer + index_count / 3 + 16);
+	assert(data <= buffer + buffer_size);
+
+#if TRACE
+	unsigned char codetop[16], codeauxtop[16];
+	size_t codetopsize = sortTop16(codetop, codestats);
+	size_t codeauxtopsize = sortTop16(codeauxtop, codeauxstats);
+
+	size_t sumcode = 0, sumcodeaux = 0;
+	for (size_t i = 0; i < 256; ++i)
+		sumcode += codestats[i], sumcodeaux += codeauxstats[i];
+
+	size_t acccode = 0, acccodeaux = 0;
+
+	printf("code\t\t\t\t\tcodeaux\n");
+
+	for (size_t i = 0; i < codetopsize && i < codeauxtopsize; ++i)
+	{
+		acccode += codestats[codetop[i]];
+		acccodeaux += codeauxstats[codeauxtop[i]];
+
+		printf("%2d: %02x = %d (%.1f%% ..%.1f%%)\t\t%2d: %02x = %d (%.1f%% ..%.1f%%)\n",
+		       int(i), codetop[i], int(codestats[codetop[i]]), double(codestats[codetop[i]]) / double(sumcode) * 100, double(acccode) / double(sumcode) * 100,
+		       int(i), codeauxtop[i], int(codeauxstats[codeauxtop[i]]), double(codeauxstats[codeauxtop[i]]) / double(sumcodeaux) * 100, double(acccodeaux) / double(sumcodeaux) * 100);
+	}
+#endif
+
+	return data - buffer; // total encoded size in bytes, including header and codeaux table
+}
+
+size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
+{ // returns a worst-case size in bytes for encoding index_count indices that reference vertex_count vertices
+	assert(index_count % 3 == 0);
+
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits) // smallest bit count with vertex_count <= 2^vertex_bits, capped at 32
+		vertex_bits++;
+
+	// worst-case encoding is 2 header bytes + 3 varint-7 encoded index deltas
+	unsigned int vertex_groups = (vertex_bits + 1 + 6) / 7; // +1 bit for the delta sign, rounded up to whole 7-bit groups
+
+	return 1 + (index_count / 3) * (2 + 3 * vertex_groups) + 16; // header byte + per-triangle worst case + 16-byte codeaux table
+}
+
+void meshopt_encodeIndexVersion(int version)
+{ // selects the stream version written by subsequent meshopt_encodeIndexBuffer calls
+	assert(unsigned(version) <= 1); // only versions 0 and 1 are accepted; the unsigned cast also rejects negative values
+
+	meshopt::gEncodeIndexVersion = version;
+}
+
+int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(index_size == 2 || index_size == 4);
+
+	// the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+	if (buffer_size < 1 + index_count / 3 + 16)
+		return -2;
+
+	if ((buffer[0] & 0xf0) != kIndexHeader)
+		return -1;
+
+	int version = buffer[0] & 0x0f;
+	if (version > 1)
+		return -1;
+
+	EdgeFifo edgefifo;
+	memset(edgefifo, -1, sizeof(edgefifo));
+
+	VertexFifo vertexfifo;
+	memset(vertexfifo, -1, sizeof(vertexfifo));
+
+	size_t edgefifooffset = 0;
+	size_t vertexfifooffset = 0;
+
+	unsigned int next = 0;
+	unsigned int last = 0;
+
+	int fecmax = version >= 1 ? 13 : 15;
+
+	// since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
+	const unsigned char* code = buffer + 1;
+	const unsigned char* data = code + index_count / 3;
+	const unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+	const unsigned char* codeaux_table = data_safe_end;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		// make sure we have enough data to read for a triangle
+		// each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
+		// after this we can be sure we can read without extra bounds checks
+		if (data > data_safe_end)
+			return -2;
+
+		unsigned char codetri = *code++;
+
+		if (codetri < 0xf0)
+		{
+			int fe = codetri >> 4;
+
+			// fifo reads are wrapped around 16 entry buffer
+			unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
+			unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+
+			int fec = codetri & 15;
+
+			// note: this is the most common path in the entire decoder
+			// inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
+			if (fec < fecmax)
+			{
+				// fifo reads are wrapped around 16 entry buffer
+				unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
+				unsigned int c = (fec == 0) ? next : cf;
+
+				int fec0 = fec == 0;
+				next += fec0;
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+			else
+			{
+				unsigned int c = 0;
+
+				// fec - (fec ^ 3) decodes 13, 14 into -1, 1
+				// note that we need to update the last index since free indices are delta-encoded
+				last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+		}
+		else
+		{
+			// fast path: read codeaux from the table
+			if (codetri < 0xfe)
+			{
+				unsigned char codeaux = codeaux_table[codetri & 15];
+
+				// note: table can't contain feb/fec=15
+				int feb = codeaux >> 4;
+				int fec = codeaux & 15;
+
+				// fifo reads are wrapped around 16 entry buffer
+				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+				unsigned int a = next++;
+
+				unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
+				unsigned int b = (feb == 0) ? next : bf;
+
+				int feb0 = feb == 0;
+				next += feb0;
+
+				unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
+				unsigned int c = (fec == 0) ? next : cf;
+
+				int fec0 = fec == 0;
+				next += fec0;
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+				pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+			else
+			{
+				// slow path: read a full byte for codeaux instead of using a table lookup
+				unsigned char codeaux = *data++;
+
+				int fea = codetri == 0xfe ? 0 : 15;
+				int feb = codeaux >> 4;
+				int fec = codeaux & 15;
+
+				// reset: codeaux is 0 but encoded as not-a-table
+				if (codeaux == 0)
+					next = 0;
+
+				// fifo reads are wrapped around 16 entry buffer
+				// also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+				unsigned int a = (fea == 0) ? next++ : 0;
+				unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
+				unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];
+
+				// note that we need to update the last index since free indices are delta-encoded
+				if (fea == 15)
+					last = a = decodeIndex(data, last);
+
+				if (feb == 15)
+					last = b = decodeIndex(data, last);
+
+				if (fec == 15)
+					last = c = decodeIndex(data, last);
+
+				// output triangle
+				writeTriangle(destination, i, index_size, a, b, c);
+
+				// push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+				pushVertexFifo(vertexfifo, a, vertexfifooffset);
+				pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
+				pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));
+
+				pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+				pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+				pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+			}
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and codeaux table
+	if (data != data_safe_end)
+		return -3;
+
+	return 0;
+}
+
+size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{ // layout written: 1 header byte, one encodeVByte value per index, 4 zero tail bytes; returns bytes written, or 0 if buffer is too small
+	using namespace meshopt;
+
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return 0;
+
+	int version = gEncodeIndexVersion;
+
+	buffer[0] = (unsigned char)(kSequenceHeader | version); // header byte combines the format tag with the encoder version
+
+	unsigned int last[2] = {}; // two delta baselines, both starting at 0
+	unsigned int current = 0; // which of the two baselines is currently in use
+
+	unsigned char* data = buffer + 1;
+	unsigned char* data_safe_end = buffer + buffer_size - 4; // start of the 4-byte region reserved at the end of the buffer
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to write
+		// each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can write without extra bounds checks
+		if (data >= data_safe_end)
+			return 0;
+
+		unsigned int index = indices[i];
+
+		// this is a heuristic that switches between baselines when the delta grows too large
+		// we want the encoded delta to fit into one byte (7 bits), but 2 bits are used for sign and baseline index
+		// for now we immediately switch the baseline when delta grows too large - this can be adjusted arbitrarily
+		int cd = int(index - last[current]);
+		current ^= ((cd < 0 ? -cd : cd) >= 30); // flips to the other baseline when |cd| >= 30
+
+		// encode delta from the last index
+		unsigned int d = index - last[current];
+		unsigned int v = (d << 1) ^ (int(d) >> 31); // folds the sign into the low bit: deltas 0, -1, 1, -2 become 0, 1, 2, 3
+
+		// note: low bit encodes the index of the last baseline which will be used for reconstruction
+		encodeVByte(data, (v << 1) | current);
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+	}
+
+	// make sure we have enough space to write tail
+	if (data > data_safe_end)
+		return 0;
+
+	for (int k = 0; k < 4; ++k)
+		*data++ = 0;
+
+	return data - buffer; // total encoded size including header and tail
+}
+
+size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
+{ // worst-case byte size of meshopt_encodeIndexSequence output for index_count indices that reference vertex_count vertices
+	// compute number of bits required for each index
+	unsigned int vertex_bits = 1;
+
+	while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits) // stops at the first bit count with vertex_count <= 2^bits, capped at 32
+		vertex_bits++;
+
+	// worst-case encoding is 1 varint-7 encoded index delta for a K bit value and an extra bit
+	unsigned int vertex_groups = (vertex_bits + 1 + 1 + 6) / 7; // + 6 rounds the bit total up to whole 7-bit groups (one byte each)
+
+	return 1 + index_count * vertex_groups + 4; // header byte + per-index bytes + 4-byte tail
+}
+
+int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{ // returns 0 on success, -1 for an unrecognized header or version, -2 when the input is too short, -3 when the data does not end exactly at the tail
+	using namespace meshopt;
+
+	// the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+	if (buffer_size < 1 + index_count + 4)
+		return -2;
+
+	if ((buffer[0] & 0xf0) != kSequenceHeader) // high nibble of the first byte is the format tag
+		return -1;
+
+	int version = buffer[0] & 0x0f; // low nibble of the first byte is the format version
+	if (version > 1)
+		return -1;
+
+	const unsigned char* data = buffer + 1;
+	const unsigned char* data_safe_end = buffer + buffer_size - 4; // start of the 4-byte tail
+
+	unsigned int last[2] = {}; // two delta baselines, both starting at 0 as in meshopt_encodeIndexSequence
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		// make sure we have enough data to read
+		// each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+		// after this we can be sure we can read without extra bounds checks
+		if (data >= data_safe_end)
+			return -2;
+
+		unsigned int v = decodeVByte(data);
+
+		// decode the index of the last baseline
+		unsigned int current = v & 1;
+		v >>= 1;
+
+		// reconstruct index as a delta
+		unsigned int d = (v >> 1) ^ -int(v & 1); // undoes the sign folding: a set low bit means a negative delta
+		unsigned int index = last[current] + d;
+
+		// update last for the next iteration that uses it
+		last[current] = index;
+
+		if (index_size == 2)
+		{
+			static_cast<unsigned short*>(destination)[i] = (unsigned short)(index); // index is truncated to unsigned short
+		}
+		else
+		{
+			static_cast<unsigned int*>(destination)[i] = index; // every index_size other than 2 is written as unsigned int
+		}
+	}
+
+	// we should've read all data bytes and stopped at the boundary between data and tail
+	if (data != data_safe_end)
+		return -3;
+
+	return 0;
+}
diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp
new file mode 100644
index 0000000000..aa4a30efa4
--- /dev/null
+++ b/thirdparty/meshoptimizer/indexgenerator.cpp
@@ -0,0 +1,347 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+namespace meshopt
+{
+
+static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
+{
+	// MurmurHash2 mixing step over whole 4-byte words of key; h is the running hash, trailing len % 4 bytes are not hashed
+	const unsigned int m = 0x5bd1e995;
+	const int r = 24;
+
+	while (len >= 4)
+	{
+		unsigned int k; // filled via memcpy: key has no alignment guarantee, so dereferencing it as unsigned int* is undefined behavior
+		memcpy(&k, key, sizeof(k));
+		k *= m;
+		k ^= k >> r;
+		k *= m;
+
+		h *= m;
+		h ^= k;
+
+		key += 4;
+		len -= 4;
+	}
+
+	return h;
+}
+
+struct VertexHasher
+{ // hashing/equality policy passed to hashLookup; keys are vertex indices into a single vertex array
+	const unsigned char* vertices; // start of the vertex data
+	size_t vertex_size; // number of bytes per vertex that take part in hashing and comparison
+	size_t vertex_stride; // byte distance between the starts of consecutive vertices
+
+	size_t hash(unsigned int index) const
+	{
+		return hashUpdate4(0, vertices + index * vertex_stride, vertex_size); // hashUpdate4 only consumes whole 4-byte words of vertex_size
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return memcmp(vertices + lhs * vertex_stride, vertices + rhs * vertex_stride, vertex_size) == 0; // byte-exact over all vertex_size bytes
+	}
+};
+
+struct VertexStreamHasher
+{ // hashing/equality policy passed to hashLookup; keys are vertex indices looked up in several attribute streams
+	const meshopt_Stream* streams; // array of stream_count stream descriptors
+	size_t stream_count;
+
+	size_t hash(unsigned int index) const
+	{
+		unsigned int h = 0;
+
+		for (size_t i = 0; i < stream_count; ++i)
+		{
+			const meshopt_Stream& s = streams[i];
+			const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+			h = hashUpdate4(h, data + index * s.stride, s.size); // chained: the hash so far seeds the hash of the next stream
+		}
+
+		return h;
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		for (size_t i = 0; i < stream_count; ++i)
+		{
+			const meshopt_Stream& s = streams[i];
+			const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+			if (memcmp(data + lhs * s.stride, data + rhs * s.stride, s.size) != 0) // a difference in any stream makes the vertices distinct
+				return false;
+		}
+
+		return true;
+	}
+};
+
+static size_t hashBuckets(size_t count)
+{ // returns the smallest power of two that is >= count (1 when count is 0 or 1)
+	size_t buckets = 1;
+	while (buckets < count)
+		buckets *= 2;
+
+	return buckets;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{ // returns the slot that already holds an item equal to key, or the first empty slot met while probing from key's home bucket
+	assert(buckets > 0);
+	assert((buckets & (buckets - 1)) == 0); // power-of-two size lets "& hashmod" replace a modulo
+
+	size_t hashmod = buckets - 1;
+	size_t bucket = hash.hash(key) & hashmod;
+
+	for (size_t probe = 0; probe <= hashmod; ++probe)
+	{
+		T& item = table[bucket];
+
+		if (item == empty)
+			return &item; // key is not in the table; the caller may store it here
+
+		if (hash.equal(item, key))
+			return &item;
+
+		// hash collision, quadratic probing
+		bucket = (bucket + probe + 1) & hashmod; // step grows by one per probe: offsets from the home bucket are 1, 3, 6, 10, ...
+	}
+
+	assert(false && "Hash table is full"); // unreachable
+	return 0; // NOTE(review): only reached when asserts are disabled; callers in this file dereference the result unchecked
+}
+
+} // namespace meshopt
+
+size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{ // destination[v] receives the new slot of old vertex v; byte-identical vertices share a slot, slots are numbered in order of first use
+	using namespace meshopt;
+
+	assert(indices || index_count == vertex_count);
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int)); // ~0u = "not assigned"; vertices never referenced keep this value
+
+	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size}; // stride == size: vertices are read as tightly packed
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty hash slot
+
+	unsigned int next_vertex = 0; // next unused slot number, i.e. the count of unique vertices found so far
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i); // without an index buffer, element i refers to vertex i
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+			{
+				*entry = index; // first vertex seen with this content becomes the representative stored in the table
+
+				destination[index] = next_vertex++;
+			}
+			else
+			{
+				assert(destination[*entry] != ~0u);
+
+				destination[index] = destination[*entry]; // duplicate content: reuse the representative's slot
+			}
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{ // same remap construction as meshopt_generateVertexRemap, but two vertices only match when they are byte-identical in every stream
+	using namespace meshopt;
+
+	assert(indices || index_count == vertex_count);
+	assert(index_count % 3 == 0);
+	assert(stream_count > 0 && stream_count <= 16);
+
+	for (size_t i = 0; i < stream_count; ++i)
+	{
+		assert(streams[i].size > 0 && streams[i].size <= 256);
+		assert(streams[i].size <= streams[i].stride);
+	}
+
+	meshopt_Allocator allocator;
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int)); // ~0u = "not assigned"; vertices never referenced keep this value
+
+	VertexStreamHasher hasher = {streams, stream_count};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty hash slot
+
+	unsigned int next_vertex = 0; // next unused slot number, i.e. the count of unique vertices found so far
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i); // without an index buffer, element i refers to vertex i
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+			{
+				*entry = index; // first vertex seen with this content becomes the representative stored in the table
+
+				destination[index] = next_vertex++;
+			}
+			else
+			{
+				assert(destination[*entry] != ~0u);
+
+				destination[index] = destination[*entry]; // duplicate content: reuse the representative's slot
+			}
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex;
+}
+
+void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{ // copies every source vertex i whose remap[i] is not ~0u into slot remap[i] of destination
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	// support in-place remap
+	if (destination == vertices) // only exact pointer equality is detected here
+	{
+		unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+		memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+		vertices = vertices_copy; // read from the snapshot so writes to destination cannot clobber input that is still needed
+	}
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] != ~0u) // entries equal to ~0u are skipped, so those vertices are not copied
+		{
+			assert(remap[i] < vertex_count);
+
+			memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
+		}
+	}
+}
+
+void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{ // writes destination[i] = remap[old index i] for every element of the index buffer
+	assert(index_count % 3 == 0);
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices ? indices[i] : unsigned(i); // without an index buffer, element i refers to vertex i
+		assert(remap[index] != ~0u); // every referenced vertex must have an assigned slot
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{ // rewrites each index to a representative vertex whose first vertex_size bytes are identical; vertex data itself is not modified
+	using namespace meshopt;
+
+	assert(indices);
+	assert(index_count % 3 == 0);
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size <= vertex_stride);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); // per-vertex cache of the chosen representative
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // ~0u = representative not resolved yet
+
+	VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty hash slot
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index; // representative = first vertex with this content encountered while walking indices
+
+			remap[index] = *entry;
+		}
+
+		destination[i] = remap[index];
+	}
+}
+
+void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{ // rewrites each index to a representative vertex that is byte-identical in every given stream; vertex data itself is not modified
+	using namespace meshopt;
+
+	assert(indices);
+	assert(index_count % 3 == 0);
+	assert(stream_count > 0 && stream_count <= 16);
+
+	for (size_t i = 0; i < stream_count; ++i)
+	{
+		assert(streams[i].size > 0 && streams[i].size <= 256);
+		assert(streams[i].size <= streams[i].stride);
+	}
+
+	meshopt_Allocator allocator;
+
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count); // per-vertex cache of the chosen representative
+	memset(remap, -1, vertex_count * sizeof(unsigned int)); // ~0u = representative not resolved yet
+
+	VertexStreamHasher hasher = {streams, stream_count};
+
+	size_t table_size = hashBuckets(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // ~0u marks an empty hash slot
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (remap[index] == ~0u)
+		{
+			unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+			if (*entry == ~0u)
+				*entry = index; // representative = first vertex with this content encountered while walking indices
+
+			remap[index] = *entry;
+		}
+
+		destination[i] = remap[index];
+	}
+}
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
new file mode 100644
index 0000000000..a442d103c8
--- /dev/null
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -0,0 +1,948 @@
+/**
+ * meshoptimizer - version 0.15
+ *
+ * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
+ *
+ * This library is distributed under the MIT License. See notice at the end of this file.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stddef.h>
+
+/* Version macro; major * 1000 + minor * 10 + patch */
+#define MESHOPTIMIZER_VERSION 150 /* 0.15 */
+
+/* If no API is defined, assume default */
+#ifndef MESHOPTIMIZER_API
+#define MESHOPTIMIZER_API
+#endif
+
+/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+
+/* C interface */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Vertex attribute stream, similar to glVertexPointer
+ * Each element takes size bytes, with stride controlling the spacing between successive elements.
+ */
+struct meshopt_Stream
+{
+	const void* data; /* start of the stream; element i is read at data + i * stride */
+	size_t size; /* bytes per element that are hashed and compared */
+	size_t stride; /* byte distance between the starts of successive elements */
+};
+
+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Generates a vertex remap table from multiple vertex streams and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting vertex buffer (unique_vertex_count elements, returned by meshopt_generateVertexRemap)
+ * vertex_count should be the initial vertex count and not the value returned by meshopt_generateVertexRemap
+ */
+MESHOPTIMIZER_API void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap);
+
+/**
+ * Generate index buffer from the source index buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Vertex transform cache optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for strip-like caches
+ * Produces inferior results to meshopt_optimizeVertexCache from the GPU vertex cache perspective
+ * However, the resulting index order is more optimal if the goal is to reduce the triangle strip length or improve compression efficiency
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for FIFO caches
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * cache_size should be less than the actual GPU cache size to avoid cache thrashing
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+
+/**
+ * Overdraw optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
+ * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
+ */
+MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+
+/**
+ * Vertex fetch cache optimizer
+ * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count elements)
+ * indices is used both as an input and as an output index buffer
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Vertex fetch cache optimizer
+ * Generates vertex remap to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * The resulting remap table should be used to reorder vertex/index buffers using meshopt_remapVertexBuffer/meshopt_remapIndexBuffer
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Index buffer encoder
+ * Encodes index data into an array of bytes that is generally much smaller (<1.5 bytes/triangle) and compresses better (<1 bytes/triangle) compared to original.
+ * Input index buffer must represent a triangle list.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * For maximum efficiency the index buffer being encoded has to be optimized for vertex cache and vertex fetch first.
+ *
+ * buffer must contain enough space for the encoded index buffer (use meshopt_encodeIndexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Experimental: Set index encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeIndexVersion(int version);
+
+/**
+ * Index buffer decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Experimental: Index sequence encoder
+ * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
+ * Input index sequence can represent arbitrary topology; for triangle lists meshopt_encodeIndexBuffer is likely to be better.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ *
+ * buffer must contain enough space for the encoded index sequence (use meshopt_encodeIndexSequenceBound to compute worst case size)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Index sequence decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexSequence
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index sequence (index_count elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer encoder
+ * Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
+ * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
+
+/**
+ * Experimental: Set vertex encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeVertexVersion(int version);
+
+/**
+ * Vertex buffer decoder
+ * Decodes vertex data from an array of bytes generated by meshopt_encodeVertexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer filters
+ * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
+ * count must be aligned by 4 and stride is fixed for each function to facilitate SIMD implementation.
+ *
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ *
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * Each component is stored as a 16-bit integer; stride must be equal to 8.
+ *
+ * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
+ * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Experimental: Mesh simplifier
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+
+/**
+ * Experimental: Mesh simplifier (sloppy)
+ * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance
+ * The algorithm doesn't preserve mesh topology but is always able to reach target triangle count.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+
+/**
+ * Experimental: Point cloud simplifier
+ * Reduces the number of points in the cloud to reach the given target
+ * Returns the number of points after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
+
+/**
+ * Mesh stripifier
+ * Converts a previously vertex cache optimized triangle list to triangle strip, stitching strips using restart index or degenerate triangles
+ * Returns the number of indices in the resulting strip, with destination containing new index data
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * Using restart indices can result in ~10% smaller index buffers, but on some GPUs restart indices may result in decreased performance.
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_stripifyBound
+ * restart_index should be 0xffff or 0xffffffff depending on index size, or 0 to use degenerate triangles
+ */
+MESHOPTIMIZER_API size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_stripifyBound(size_t index_count);
+
+/**
+ * Mesh unstripifier
+ * Converts a triangle strip to a triangle list
+ * Returns the number of indices in the resulting list, with destination containing new index data
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_unstripifyBound
+ */
+MESHOPTIMIZER_API size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_unstripifyBound(size_t index_count);
+
+struct meshopt_VertexCacheStatistics
+{
+	unsigned int vertices_transformed; /* transformed vertex count; the numerator of both acmr and atvr */
+	unsigned int warps_executed; /* NOTE(review): presumably the number of warp_size-wide batches issued by the simulated cache - confirm against the analyzer */
+	float acmr; /* transformed vertices / triangle count; best case 0.5, worst case 3.0, optimum depends on topology */
+	float atvr; /* transformed vertices / vertex count; best case 1.0, worst case 6.0, optimum is 1.0 (each vertex is transformed once) */
+};
+
+/**
+ * Vertex transform cache analyzer
+ * Returns cache hit statistics using a simplified FIFO model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+
+struct meshopt_OverdrawStatistics
+{
+	unsigned int pixels_covered; /* covered pixel count; the denominator of overdraw */
+	unsigned int pixels_shaded; /* shaded pixel count; the numerator of overdraw */
+	float overdraw; /* shaded pixels / covered pixels; best case 1.0 */
+};
+
+/**
+ * Overdraw analyzer
+ * Returns overdraw statistics using a software rasterizer
+ * Results may not match actual GPU performance
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+struct meshopt_VertexFetchStatistics
+{
+	unsigned int bytes_fetched; /* fetched byte count; the numerator of overfetch */
+	float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
+struct meshopt_Meshlet
+{
+	unsigned int vertices[64]; /* meshlet vertices; each entry refers to the original vertex buffer (max_vertices <= 64) */
+	unsigned char indices[126][3]; /* micro index buffer: three entries per triangle, indexing into vertices[] (max_triangles <= 126) */
+	unsigned char triangle_count; /* NOTE(review): presumably the number of rows of indices[] in use - confirm against meshopt_buildMeshlets */
+	unsigned char vertex_count; /* NOTE(review): presumably the number of entries of vertices[] in use - confirm against meshopt_buildMeshlets */
+};
+
+/**
+ * Experimental: Meshlet builder
+ * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
+ * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ *
+ * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
+ * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
+
+struct meshopt_Bounds
+{
+	/* bounding sphere, useful for frustum and occlusion culling */
+	float center[3];
+	float radius;
+
+	/* normal cone, useful for backface culling */
+	float cone_apex[3]; /* only needed by the apex-based rejection test for perspective projection */
+	float cone_axis[3]; /* compared against the view direction in every backface rejection test */
+	float cone_cutoff; /* = cos(angle/2) */
+
+	/* normal cone axis and cutoff, stored in 8-bit SNORM format; decode using x/127.0 */
+	signed char cone_axis_s8[3];
+	signed char cone_cutoff_s8;
+};
+
+/**
+ * Experimental: Cluster bounds generator
+ * Creates bounding volumes that can be used for frustum, backface and occlusion culling.
+ *
+ * For backface culling with orthographic projection, use the following formula to reject backfacing clusters:
+ *   dot(view, cone_axis) >= cone_cutoff
+ *
+ * For perspective projection, you can use the formula that needs cone apex in addition to axis & cutoff:
+ *   dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
+ *
+ * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead:
+ *   dot(normalize(center - camera_position), cone_axis) >= cone_cutoff + radius / length(center - camera_position)
+ * or an equivalent formula that doesn't have a singularity at center = camera_position:
+ *   dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius
+ *
+ * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
+ * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size)
+ */
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Spatial sorter
+ * Generates a remap table that can be used to reorder points for spatial locality.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Spatial sorter
+ * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Set allocation callbacks
+ * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
+ * Note that all algorithms only allocate memory for temporary use.
+ * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
+ */
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+/* Quantization into commonly supported data formats */
+#ifdef __cplusplus
+/**
+ * Quantize a float in [0..1] range into an N-bit fixed point unorm value
+ * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
+ * Maximum reconstruction error: 1/2^(N+1)
+ */
+inline int meshopt_quantizeUnorm(float v, int N);
+
+/**
+ * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
+ * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
+ * Maximum reconstruction error: 1/2^N
+ */
+inline int meshopt_quantizeSnorm(float v, int N);
+
+/**
+ * Quantize a float into half-precision floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+inline unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+inline float meshopt_quantizeFloat(float v, int N);
+#endif
+
+/**
+ * C++ template interface
+ *
+ * These functions mirror the C interface the library provides, providing template-based overloads so that
+ * the caller can use an arbitrary type for the index data, both for input and output.
+ * When the supplied type is the same size as that of unsigned int, the wrappers are zero-cost; when it's not,
+ * the wrappers end up allocating memory and copying index data to convert from one type to another.
+ */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+#endif
+
+/* Inline implementation */
+#ifdef __cplusplus
+inline int meshopt_quantizeUnorm(float v, int N)
+{
+	/* clamp to [0..1]; NaN fails the first comparison and becomes 0 */
+	float clamped = (v >= 0) ? v : 0;
+	clamped = (clamped <= 1) ? clamped : 1;
+	/* the largest N-bit code is 2^N - 1; adding 0.5 before truncation rounds to nearest */
+	const float max_code = float((1 << N) - 1);
+	return int(clamped * max_code + 0.5f);
+}
+
+inline int meshopt_quantizeSnorm(float v, int N)
+{
+	/* the rounding bias follows the sign of the unclamped input so that truncation rounds to nearest; NaN takes the negative branch */
+	const float bias = (v >= 0) ? 0.5f : -0.5f;
+
+	/* clamp to [-1..1]; NaN fails the first comparison and becomes -1 */
+	float clamped = (v >= -1) ? v : -1;
+	clamped = (clamped <= +1) ? clamped : +1;
+	const float max_code = float((1 << (N - 1)) - 1); /* largest positive code, 2^(N-1) - 1 */
+	return int(clamped * max_code + bias);
+}
+
+inline unsigned short meshopt_quantizeHalf(float v)
+{
+	union { float f; unsigned int ui; } pun = {v};
+	const unsigned int bits = pun.ui;
+
+	const int sign = (bits >> 16) & 0x8000; /* sign bit moved down to bit 15 */
+	const int magnitude = bits & 0x7fffffff; /* exponent and mantissa of the input */
+
+	/* rebias the exponent by 112 (127-15) and add half of the 13 discarded bits to round to nearest */
+	int half = (magnitude - (112 << 23) + (1 << 12)) >> 13;
+
+	/* underflow: below exponent -14 (encoded as 113) the result is flushed to zero */
+	if (magnitude < (113 << 23))
+		half = 0;
+	/* overflow: exponent 16 (encoded as 143) and above becomes infinity */
+	if (magnitude >= (143 << 23))
+		half = 0x7c00;
+	/* NaN: anything above the infinity pattern; all types of NaN become qNaN */
+	if (magnitude > (255 << 23))
+		half = 0x7e00;
+	return (unsigned short)(sign | half);
+}
+
+inline float meshopt_quantizeFloat(float v, int N)
+{
+	union { float f; unsigned int ui; } pun = {v};
+	const unsigned int bits = pun.ui;
+
+	/* the low (23 - N) mantissa bits are dropped; half of that range is added first to round to nearest */
+	const int dropped = (1 << (23 - N)) - 1;
+	const int half = (1 << (23 - N)) >> 1;
+	const int exponent = bits & 0x7f800000;
+
+	if (exponent == 0)
+		pun.ui = 0; /* zero or denormal input: flush to zero */
+	else if (exponent == 0x7f800000)
+		pun.ui = bits; /* inf/nan pass through unrounded so that nan doesn't overflow into -0 */
+	else
+		/* ordinary value: round, then clear the dropped mantissa bits */
+		pun.ui = (bits + half) & ~dropped;
+
+	return pun.f;
+}
+#endif
+
+/* Internal implementation helpers */
+#ifdef __cplusplus
+class meshopt_Allocator /* scoped owner of temporary allocations: every block handed out is released by the destructor */
+{
+public:
+	template <typename T>
+	struct StorageT
+	{
+		static void* (*allocate)(size_t); /* allocation callback; static, hence shared by all meshopt_Allocator instances */
+		static void (*deallocate)(void*); /* matching release callback */
+	};
+
+	typedef StorageT<void> Storage;
+
+	meshopt_Allocator()
+		: blocks()
+		, count(0)
+	{
+	}
+
+	~meshopt_Allocator()
+	{
+		for (size_t i = count; i > 0; --i) /* newest block first, i.e. the reverse of allocation order */
+			Storage::deallocate(blocks[i - 1]);
+	}
+
+	template <typename T> T* allocate(size_t size)
+	{
+		assert(count < sizeof(blocks) / sizeof(blocks[0])); /* capacity check for blocks[]; only active when assert is enabled */
+		T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T))); /* size is an element count; a byte count that would overflow is requested as size_t(-1) */
+		blocks[count++] = result;
+		return result;
+	}
+
+private:
+	void* blocks[24]; /* pointers returned by allocate(), in allocation order */
+	size_t count; /* number of entries of blocks[] in use */
+};
+
+// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
+template <typename T> void* (*meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
+template <typename T> void (*meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
+#endif
+
+/* Inline implementation for C++ templated wrappers */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T, bool ZeroCopy = sizeof(T) == sizeof(unsigned int)>
+struct meshopt_IndexAdapter;
+
+template <typename T>
+struct meshopt_IndexAdapter<T, false>
+{
+	/* adapter for index types whose size differs from unsigned int: works on a temporary unsigned int copy */
+	T* result; /* caller buffer that receives the converted indices on destruction; may be null */
+	unsigned int* data; /* temporary index storage handed to the C interface */
+	size_t count; /* number of indices held in data */
+
+	/* allocates the temporary storage and, when input is given, copies it element by element into data */
+	meshopt_IndexAdapter(T* result_, const T* input, size_t count_)
+	    : result(result_)
+	    , data(0)
+	    , count(count_)
+	{
+		/* request size_t(-1) bytes instead of letting the byte count wrap around */
+		size_t bytes = size_t(-1);
+		if (count <= size_t(-1) / sizeof(unsigned int))
+			bytes = count * sizeof(unsigned int);
+
+		data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(bytes));
+
+		for (size_t i = 0; input && i < count; ++i)
+			data[i] = input[i];
+	}
+
+	/* converts data back into result (when present), then releases the temporary storage */
+	~meshopt_IndexAdapter()
+	{
+		for (size_t i = 0; result && i < count; ++i)
+			result[i] = T(data[i]);
+
+		meshopt_Allocator::Storage::deallocate(data);
+	}
+};
+
+template <typename T>
+struct meshopt_IndexAdapter<T, true>
+{
+	unsigned int* data; /* points straight at the caller's buffer; this specialization neither allocates nor copies */
+
+	meshopt_IndexAdapter(T* result, const T* input, size_t) /* the element count is not needed here */
+	    : data(reinterpret_cast<unsigned int*>(result ? result : const_cast<T*>(input))) /* result takes precedence over input */
+	{
+	}
+};
+
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> source(0, indices, indices ? index_count : 0);
+	/* indices is optional: a null pointer is forwarded as null, and the adapter above is then created with zero elements */
+	return meshopt_generateVertexRemap(destination, indices ? source.data : 0, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+	meshopt_IndexAdapter<T> source(0, indices, indices ? index_count : 0);
+	/* indices is optional: a null pointer is forwarded as null, and the adapter above is then created with zero elements */
+	return meshopt_generateVertexRemapMulti(destination, indices ? source.data : 0, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
+{
+	meshopt_IndexAdapter<T> source(0, indices, indices ? index_count : 0);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* indices is optional and forwarded as null when absent; the result reaches destination through the target adapter */
+	meshopt_remapIndexBuffer(target.data, indices ? source.data : 0, index_count, remap);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* the adapters expose both index buffers as unsigned int; the result reaches destination through target */
+	meshopt_generateShadowIndexBuffer(target.data, source.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* the adapters expose both index buffers as unsigned int; the result reaches destination through target */
+	meshopt_generateShadowIndexBufferMulti(target.data, source.data, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* the adapters expose both index buffers as unsigned int; the result reaches destination through target */
+	meshopt_optimizeVertexCache(target.data, source.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* the adapters expose both index buffers as unsigned int; the result reaches destination through target */
+	meshopt_optimizeVertexCacheStrip(target.data, source.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* the adapters expose both index buffers as unsigned int; the result reaches destination through target */
+	meshopt_optimizeVertexCacheFifo(target.data, source.data, index_count, vertex_count, cache_size);
+}
+
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* the adapters expose both index buffers as unsigned int; the result reaches destination through target */
+	meshopt_optimizeOverdraw(target.data, source.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	/* only the input indices need adapting; destination is already an unsigned int remap table */
+	return meshopt_optimizeVertexFetchRemap(destination, source.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> in_place(indices, indices, index_count);
+	/* indices is both input and output: the adapter reads it on construction and writes it back through the same pointer */
+	return meshopt_optimizeVertexFetch(destination, in_place.data, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	/* only the input indices need adapting; the encoded bytes are written to buffer by the C function */
+	return meshopt_encodeIndexBuffer(buffer, buffer_size, source.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	/* compile-time check: the array size is negative, hence ill-formed, unless T is 2 or 4 bytes wide */
+	char supported_index_size[(sizeof(T) == 2 || sizeof(T) == 4) ? 1 : -1];
+	(void)supported_index_size;
+	return meshopt_decodeIndexBuffer(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	/* only the input indices need adapting; the encoded bytes are written to buffer by the C function */
+	return meshopt_encodeIndexSequence(buffer, buffer_size, source.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+	/* compile-time check: the array size is negative, hence ill-formed, unless T is 2 or 4 bytes wide */
+	char supported_index_size[(sizeof(T) == 2 || sizeof(T) == 4) ? 1 : -1];
+	(void)supported_index_size;
+	return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* target is sized for index_count elements, not target_index_count: destination must have room for the source index buffer */
+	return meshopt_simplify(target.data, source.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, target_index_count);
+	/* unlike meshopt_simplify, the output adapter only needs room for target_index_count elements */
+	return meshopt_simplifySloppy(target.data, source.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count);
+}
+
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, (index_count / 3) * 5);
+	/* target holds five indices per input triangle - NOTE(review): presumably the same worst case as meshopt_stripifyBound; confirm */
+	return meshopt_stripify(target.data, source.data, index_count, vertex_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count);
+	/* a strip of N >= 2 indices yields at most N - 2 triangles; shorter strips yield none, and the guard keeps (index_count - 2) from wrapping around */
+	meshopt_IndexAdapter<T> out(destination, 0, index_count < 2 ? 0 : (index_count - 2) * 3);
+	return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	/* buffer_size is passed in the position the C interface names primgroup_size */
+	return meshopt_analyzeVertexCache(source.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+}
+
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	/* read-only analysis: only the input indices need adapting */
+	return meshopt_analyzeOverdraw(source.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	/* read-only analysis: only the input indices need adapting */
+	return meshopt_analyzeVertexFetch(source.data, index_count, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	/* only the input indices need adapting; destination is an array of meshopt_Meshlet, not an index buffer */
+	return meshopt_buildMeshlets(destination, source.data, index_count, vertex_count, max_vertices, max_triangles);
+}
+
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	/* read-only computation: only the input indices need adapting */
+	return meshopt_computeClusterBounds(source.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> source(0, indices, index_count);
+	meshopt_IndexAdapter<T> target(destination, 0, index_count);
+	/* the adapters expose both index buffers as unsigned int; the result reaches destination through target */
+	meshopt_spatialSortTriangles(target.data, source.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+#endif
+
+/**
+ * Copyright (c) 2016-2020 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/thirdparty/meshoptimizer/overdrawanalyzer.cpp b/thirdparty/meshoptimizer/overdrawanalyzer.cpp
new file mode 100644
index 0000000000..8d5859ba39
--- /dev/null
+++ b/thirdparty/meshoptimizer/overdrawanalyzer.cpp
@@ -0,0 +1,230 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Nicolas Capens. Advanced Rasterization. 2004
+namespace meshopt
+{
+
+const int kViewport = 256; // side length, in pixels, of the square software-rasterization target
+
+struct OverdrawBuffer
+{ // per-pixel state indexed [y][x][s]; s is 1 for triangles that rasterize() flips (det > 0) and 0 for the rest
+	float z[kViewport][kViewport][2]; // largest depth accepted so far for the pixel (rasterize() passes when ZX >= z)
+	unsigned int overdraw[kViewport][kViewport][2]; // number of times the pixel passed the depth test
+};
+
+#ifndef min // local helper, only defined when no min macro exists yet; note that the arguments are evaluated twice
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef max // local helper, only defined when no max macro exists yet; note that the arguments are evaluated twice
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
+{ // writes the depth slopes of the triangle's plane along x and y to dzdx/dzdy; returns det, whose sign gives the winding
+	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
+	// z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
+	// (x2-x1 y2-y1)(dzdx) = (z2-z1)
+	// (x3-x1 y3-y1)(dzdy)   (z3-z1)
+	// we'll solve it with Cramer's rule: each unknown is a complete 2x2 determinant divided by det
+	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
+	float invdet = (det == 0) ? 0 : 1 / det; // zero-area triangles get zero gradients instead of a division by zero
+
+	dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet; // the whole numerator is scaled, not just its second product
+	dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;
+
+	return det;
+}
+
+// half-space fixed point triangle rasterizer: depth-tests every covered pixel center and counts the passing ones in buffer->overdraw
+static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, float v2x, float v2y, float v2z, float v3x, float v3y, float v3z)
+{
+	// compute depth gradients
+	float DZx, DZy;
+	float det = computeDepthGradients(DZx, DZy, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
+	int sign = det > 0; // 1 when the triangle is flipped below; also selects which of the two buffer layers is written
+
+	// flip backfacing triangles to simplify rasterization logic
+	if (sign)
+	{
+		// flipping v2 & v3 preserves depth gradients since they're based on v1
+		float t;
+		t = v2x, v2x = v3x, v3x = t;
+		t = v2y, v2y = v3y, v3y = t;
+		t = v2z, v2z = v3z, v3z = t;
+
+		// flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
+		v1z = kViewport - v1z;
+		DZx = -DZx;
+		DZy = -DZy;
+	}
+
+	// coordinates, 28.4 fixed point
+	int X1 = int(16.0f * v1x + 0.5f);
+	int X2 = int(16.0f * v2x + 0.5f);
+	int X3 = int(16.0f * v3x + 0.5f);
+
+	int Y1 = int(16.0f * v1y + 0.5f);
+	int Y2 = int(16.0f * v2y + 0.5f);
+	int Y3 = int(16.0f * v3y + 0.5f);
+
+	// bounding rectangle, clipped against viewport
+	// since we rasterize pixels with covered centers, min >0.5 should round up
+	// as for max, due to top-left filling convention we will never rasterize right/bottom edges
+	// so max >= 0.5 should round down
+	int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
+	int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
+	int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
+	int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+
+	// deltas, 28.4 fixed point
+	int DX12 = X1 - X2;
+	int DX23 = X2 - X3;
+	int DX31 = X3 - X1;
+
+	int DY12 = Y1 - Y2;
+	int DY23 = Y2 - Y3;
+	int DY31 = Y3 - Y1;
+
+	// fill convention correction
+	int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0);
+	int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0);
+	int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0);
+
+	// half edge equations, 24.8 fixed point
+	// note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
+	int FX = (minx << 4) + 8;
+	int FY = (miny << 4) + 8;
+	int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1; // TLn - 1 is -1 for edges that are not top/left, excluding pixels exactly on them
+	int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1;
+	int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1;
+	float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f); // depth at the first pixel center; 1/16 undoes the fixed-point scale
+
+	for (int y = miny; y < maxy; y++)
+	{
+		int CX1 = CY1;
+		int CX2 = CY2;
+		int CX3 = CY3;
+		float ZX = ZY;
+
+		for (int x = minx; x < maxx; x++)
+		{
+			// check if all CXn are non-negative
+			if ((CX1 | CX2 | CX3) >= 0)
+			{
+				if (ZX >= buffer->z[y][x][sign]) // depth test: greater-or-equal passes and counts as one more shaded pixel
+				{
+					buffer->z[y][x][sign] = ZX;
+					buffer->overdraw[y][x][sign]++;
+				}
+			}
+
+			// signed left shift is UB for negative numbers so use unsigned-signed casts
+			CX1 -= int(unsigned(DY12) << 4);
+			CX2 -= int(unsigned(DY23) << 4);
+			CX3 -= int(unsigned(DY31) << 4);
+			ZX += DZx; // advance the interpolated depth by one pixel in x
+		}
+
+		// signed left shift is UB for negative numbers so use unsigned-signed casts
+		CY1 += int(unsigned(DX12) << 4);
+		CY2 += int(unsigned(DX23) << 4);
+		CY3 += int(unsigned(DX31) << 4);
+		ZY += DZy; // advance the row-start depth by one pixel in y
+	}
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // rasterizes the mesh along each of the three coordinate axes and returns the covered/shaded pixel totals and their ratio
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	meshopt_OverdrawStatistics result = {};
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i) // bounding box over all vertices, referenced by the index buffer or not
+	{
+		const float* v = vertex_positions + i * vertex_stride_float;
+
+		for (int j = 0; j < 3; ++j)
+		{
+			minv[j] = min(minv[j], v[j]);
+			maxv[j] = max(maxv[j], v[j]);
+		}
+	}
+
+	float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
+	float scale = extent > 0 ? kViewport / extent : 0.f; // a mesh with no extent collapses to the origin instead of producing inf/NaN coordinates
+
+	float* triangles = allocator.allocate<float>(index_count * 3); // de-indexed positions, rescaled into [0, kViewport] with one scale for all axes
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		const float* v = vertex_positions + index * vertex_stride_float;
+
+		triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
+		triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
+		triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
+	}
+
+	OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
+
+	for (int axis = 0; axis < 3; ++axis)
+	{
+		memset(buffer, 0, sizeof(OverdrawBuffer)); // fresh depth and counters for every view direction
+
+		for (size_t i = 0; i < index_count; i += 3)
+		{
+			const float* vn0 = &triangles[3 * (i + 0)];
+			const float* vn1 = &triangles[3 * (i + 1)];
+			const float* vn2 = &triangles[3 * (i + 2)];
+
+			switch (axis) // the last coordinate passed for each vertex is the one used as depth
+			{
+			case 0:
+				rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
+				break;
+			case 1:
+				rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
+				break;
+			case 2:
+				rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
+				break;
+			}
+		}
+
+		for (int y = 0; y < kViewport; ++y)
+			for (int x = 0; x < kViewport; ++x)
+				for (int s = 0; s < 2; ++s)
+				{
+					unsigned int overdraw = buffer->overdraw[y][x][s];
+
+					result.pixels_covered += overdraw > 0;
+					result.pixels_shaded += overdraw;
+				}
+	}
+
+	result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f; // 0 when nothing was rasterized
+
+	return result;
+}
diff --git a/thirdparty/meshoptimizer/overdrawoptimizer.cpp b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
new file mode 100644
index 0000000000..143656ed76
--- /dev/null
+++ b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
@@ -0,0 +1,333 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+{ // writes one value per cluster: dot(cluster centroid - mesh centroid, normalized cluster normal)
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	float mesh_centroid[3] = {}; // mean position over the index buffer; a vertex counts once per index that references it
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		const float* p = vertex_positions + vertex_stride_float * indices[i];
+
+		mesh_centroid[0] += p[0];
+		mesh_centroid[1] += p[1];
+		mesh_centroid[2] += p[2];
+	}
+
+	mesh_centroid[0] /= index_count;
+	mesh_centroid[1] /= index_count;
+	mesh_centroid[2] /= index_count;
+
+	for (size_t cluster = 0; cluster < cluster_count; ++cluster)
+	{
+		size_t cluster_begin = clusters[cluster] * 3; // clusters[] stores triangle numbers; * 3 converts to an index-buffer offset
+		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count; // the last cluster runs to the end of the buffer
+		assert(cluster_begin < cluster_end);
+
+		float cluster_area = 0;
+		float cluster_centroid[3] = {};
+		float cluster_normal[3] = {};
+
+		for (size_t i = cluster_begin; i < cluster_end; i += 3)
+		{
+			const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
+			const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
+			const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];
+
+			float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+			float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+			float normalx = p10[1] * p20[2] - p10[2] * p20[1]; // cross(p1 - p0, p2 - p0)
+			float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+			float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+			float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); // length of the cross product, i.e. twice the triangle area
+
+			cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3); // triangle centroid weighted by its area
+			cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
+			cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
+			cluster_normal[0] += normalx; // unnormalized, so larger triangles contribute more to the cluster normal
+			cluster_normal[1] += normaly;
+			cluster_normal[2] += normalz;
+			cluster_area += area;
+		}
+
+		float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area; // a cluster of only degenerate triangles gets a zero centroid
+
+		cluster_centroid[0] *= inv_cluster_area;
+		cluster_centroid[1] *= inv_cluster_area;
+		cluster_centroid[2] *= inv_cluster_area;
+
+		float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
+		float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length; // a zero normal stays zero
+
+		cluster_normal[0] *= inv_cluster_normal_length;
+		cluster_normal[1] *= inv_cluster_normal_length;
+		cluster_normal[2] *= inv_cluster_normal_length;
+
+		float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};
+
+		sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
+	}
+}
+
+static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
+{ // fills sort_order with cluster indices ordered from highest to lowest sort_data; sort_keys receives the quantized keys
+	// compute sort data bounds and renormalize, using fixed point snorm
+	float sort_data_max = 1e-3f; // starting above zero keeps the division below well-defined when every value is 0
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		float dpa = fabsf(sort_data[i]);
+
+		sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
+	}
+
+	const int sort_bits = 11; // key width; the histogram below has 1 << sort_bits entries
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		// note that we flip distribution since high dot product should come first
+		float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);
+
+		sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1); // the mask keeps the key inside the histogram range
+	}
+
+	// fill histogram for counting sort
+	unsigned int histogram[1 << sort_bits];
+	memset(histogram, 0, sizeof(histogram));
+
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		histogram[sort_keys[i]]++;
+	}
+
+	// compute offsets based on histogram data
+	size_t histogram_sum = 0;
+
+	for (size_t i = 0; i < 1 << sort_bits; ++i)
+	{
+		size_t count = histogram[i];
+		histogram[i] = unsigned(histogram_sum); // turn the count into the first output slot for this key
+		histogram_sum += count;
+	}
+
+	assert(histogram_sum == cluster_count);
+
+	// compute sort order based on offsets
+	for (size_t i = 0; i < cluster_count; ++i)
+	{
+		sort_order[histogram[sort_keys[i]]++] = unsigned(i); // clusters with equal keys keep their original relative order
+	}
+}
+
+static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
+{
+	// simulates a vertex cache using timestamps: a vertex counts as cached while
+	// timestamp - cache_timestamps[v] <= cache_size
+	// processes the corners a, b, c in that order and returns how many of them missed
+	const unsigned int corners[3] = {a, b, c};
+	unsigned int misses = 0;
+
+	for (int k = 0; k < 3; ++k)
+	{
+		unsigned int& stamp = cache_timestamps[corners[k]];
+
+		// still in the cache: nothing to do for this corner
+		if (timestamp - stamp <= cache_size)
+			continue;
+
+		// miss: the vertex enters the cache at the current time, which then advances;
+		// the clock only moves on misses, which is what ages older entries out
+		stamp = timestamp++;
+		misses++;
+	}
+
+	return misses;
+}
+
+
+static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
+{
+	// every vertex starts outside the simulated cache: stamps are zeroed and the clock starts cache_size + 1 ahead of them
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+	unsigned int clock = cache_size + 1;
+
+	const size_t triangle_count = index_count / 3;
+	size_t boundary_count = 0;
+
+	for (size_t face = 0; face < triangle_count; ++face)
+	{
+		const unsigned int* tri = indices + face * 3;
+		unsigned int misses = updateCache(tri[0], tri[1], tri[2], cache_size, cache_timestamps, clock);
+
+		// a triangle whose three vertices all miss the cache most likely opens a new patch of the mesh that is
+		// disjoint from the vertices seen before; if it later references existing vertices again, that usually
+		// points at an inefficiency in the vertex cache optimization that produced the order
+		// the first triangle normally has 3 misses unless it is degenerate, so it is always recorded and cluster 0 starts at 0
+		bool starts_cluster = face == 0 || misses == 3;
+
+		if (starts_cluster)
+			destination[boundary_count++] = unsigned(face);
+	}
+
+	assert(boundary_count <= triangle_count);
+
+	return boundary_count;
+}
+
+static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
+{ // splits each input cluster further wherever the running misses-per-triangle ratio reaches threshold * (ratio of the whole cluster); returns the number of boundaries written
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = 0;
+
+	size_t result = 0;
+
+	for (size_t it = 0; it < cluster_count; ++it)
+	{
+		size_t start = clusters[it]; // boundaries are triangle numbers, not index-buffer offsets
+		size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
+		assert(start < end);
+
+		// reset cache
+		timestamp += cache_size + 1; // moving the clock past every stored stamp makes all vertices count as uncached
+
+		// measure cluster ACMR
+		unsigned int cluster_misses = 0;
+
+		for (size_t i = start; i < end; ++i)
+		{
+			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+			cluster_misses += m;
+		}
+
+		float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));
+
+		// first cluster always starts from the hard cluster boundary
+		destination[result++] = unsigned(start);
+
+		// reset cache
+		timestamp += cache_size + 1;
+
+		unsigned int running_misses = 0;
+		unsigned int running_faces = 0;
+
+		for (size_t i = start; i < end; ++i)
+		{
+			unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+			running_misses += m;
+			running_faces += 1;
+
+			if (float(running_misses) / float(running_faces) <= cluster_threshold)
+			{
+				// we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
+				// note that this may mean that we add 'end' to destination for the last triangle, which will imply that the last
+				// cluster is empty; however, the 'pop_back' after the loop will clean it up
+				destination[result++] = unsigned(i + 1);
+
+				// reset cache
+				timestamp += cache_size + 1;
+
+				running_misses = 0;
+				running_faces = 0;
+			}
+		}
+
+		// each time we reach the target ACMR we flush the cluster
+		// this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
+		// in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
+		// thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
+		// there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
+		// to the cluster boundary array which we need to remove anyway - this code will do that automatically
+		if (destination[result - 1] != start) // the boundary at 'start' itself is never removed
+		{
+			result--;
+		}
+	}
+
+	assert(result >= cluster_count);
+	assert(result <= index_count / 3);
+
+	return result;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{ // writes the triangles of indices to destination regrouped by cluster: clusters are reordered by their sort key, triangles inside a cluster keep their order
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy; // read from the copy so that writes to destination cannot clobber unread input
+	}
+
+	unsigned int cache_size = 16; // size of the simulated vertex cache used to detect cluster boundaries
+
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+
+	// generate hard boundaries from full-triangle cache misses
+	unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
+	size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);
+
+	// generate soft boundaries
+	unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1); // + 1: generateSoftBoundaries may temporarily append an 'end' boundary
+	size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);
+
+	const unsigned int* clusters = soft_clusters;
+	size_t cluster_count = soft_cluster_count;
+
+	// fill sort data
+	float* sort_data = allocator.allocate<float>(cluster_count);
+	calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+
+	// sort clusters using sort data
+	unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
+	unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
+	calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);
+
+	// fill output buffer
+	size_t offset = 0;
+
+	for (size_t it = 0; it < cluster_count; ++it)
+	{
+		unsigned int cluster = sort_order[it];
+		assert(cluster < cluster_count);
+
+		size_t cluster_begin = clusters[cluster] * 3; // cluster boundaries are triangle numbers; * 3 converts to index offsets
+		size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
+		assert(cluster_begin < cluster_end);
+
+		memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
+		offset += cluster_end - cluster_begin;
+	}
+
+	assert(offset == index_count); // every triangle was emitted exactly once
+}
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
new file mode 100644
index 0000000000..bd523275ce
--- /dev/null
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -0,0 +1,1529 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <string.h>
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+// This work is based on:
+// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
+// Michael Garland. Quadric-based polygonal surface simplification. 1999
+// Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
+// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
+namespace meshopt
+{
+
+struct EdgeAdjacency
+{ // per-vertex lists of outgoing half-edge targets, stored back to back in data (filled by buildEdgeAdjacency)
+	unsigned int* counts; // counts[v]: number of entries in v's list
+	unsigned int* offsets; // offsets[v]: position in data where v's list starts
+	unsigned int* data; // for each triangle (a, b, c): b in a's list, c in b's list, a in c's list
+};
+
+static void buildEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{ // allocates and fills adjacency so that each vertex lists the targets of its outgoing triangle edges
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count); // every index contributes exactly one outgoing edge
+
+	// fill edge counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset; // running sum of the counts of all earlier vertices
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill edge data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = b; // offsets[] doubles as a write cursor during this pass
+		adjacency.data[adjacency.offsets[b]++] = c;
+		adjacency.data[adjacency.offsets[c]++] = a;
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i]; // each cursor advanced by exactly counts[i]; rewind it to the list start
+	}
+}
+
+struct PositionHasher
+{ // hash/equality policy for hashLookup2: vertex indices are keyed by the raw bits of their first three position floats
+	const float* vertex_positions;
+	size_t vertex_stride_float; // distance between consecutive vertices, measured in floats
+
+	size_t hash(unsigned int index) const
+	{
+		const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float); // reads the floats as unsigned int words
+
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		return (key[0] * 73856093) ^ (key[1] * 19349663) ^ (key[2] * 83492791);
+	}
+
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		return memcmp(vertex_positions + lhs * vertex_stride_float, vertex_positions + rhs * vertex_stride_float, sizeof(float) * 3) == 0; // bytewise, so only bit-identical positions compare equal
+	}
+};
+
+static size_t hashBuckets2(size_t count)
+{
+	// smallest power of two that is not below count (1 when count is 0)
+	size_t capacity = 1;
+	while (capacity < count)
+		capacity <<= 1;
+	return capacity;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup2(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{
+	// open-addressing lookup: returns the slot that already holds key, or the first empty slot on key's probe sequence
+	assert(buckets > 0);
+	assert((buckets & (buckets - 1)) == 0);
+
+	const size_t mask = buckets - 1;
+	size_t slot = hash.hash(key) & mask;
+
+	// hash collisions are resolved with quadratic probing: the step grows by one per attempt
+	// the table size is a power of two (asserted above), so the mask wraps the index
+	for (size_t step = 1; step <= buckets; ++step)
+	{
+		T* candidate = table + slot;
+
+		// the empty check comes first so that hash.equal is never called on an empty slot
+		if (*candidate == empty || hash.equal(*candidate, key))
+			return candidate;
+
+		slot = (slot + step) & mask;
+	}
+
+	assert(false && "Hash table is full"); // unreachable
+	return 0;
+}
+
+static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
+{ // fills remap[] (first vertex with the same position) and wedge[] (cyclic list linking vertices that share a position)
+	PositionHasher hasher = {vertex_positions_data, vertex_positions_stride / sizeof(float)};
+
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+	memset(table, -1, table_size * sizeof(unsigned int)); // all bytes 0xFF, i.e. every slot holds the ~0u "empty" marker
+
+	// build forward remap: for each vertex, which other (canonical) vertex does it map to?
+	// we use position equivalence for this, and remap vertices to other existing vertices
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int index = unsigned(i);
+		unsigned int* entry = hashLookup2(table, table_size, hasher, index, ~0u);
+
+		if (*entry == ~0u)
+			*entry = index; // first vertex seen at this position becomes the canonical one
+
+		remap[index] = *entry;
+	}
+
+	// build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex?
+	// entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i
+	for (size_t i = 0; i < vertex_count; ++i)
+		wedge[i] = unsigned(i);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		if (remap[i] != i)
+		{
+			unsigned int r = remap[i];
+
+			wedge[i] = wedge[r]; // splice i into the loop right after the canonical vertex r
+			wedge[r] = unsigned(i);
+		}
+}
+
+enum VertexKind
+{ // topological class assigned to each vertex by classifyVertices; the values index the kCanCollapse/kHasOpposite tables
+	Kind_Manifold, // not on an attribute seam, not on any boundary
+	Kind_Border,   // not on an attribute seam, has exactly two open edges
+	Kind_Seam,     // on an attribute seam with exactly two attribute seam edges
+	Kind_Complex,  // none of the above; these vertices can move as long as all wedges move to the target vertex
+	Kind_Locked,   // none of the above; these vertices can't move
+
+	Kind_Count // number of kinds above; used as the dimension of the lookup tables
+};
+
+// manifold vertices can collapse onto anything
+// border/seam vertices can only be collapsed onto border/seam respectively
+// complex vertices can collapse onto complex/locked
+// a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
+// for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
+const unsigned char kCanCollapse[Kind_Count][Kind_Count] = { // indexed [kind of the collapsed vertex][kind of the target vertex]
+    {1, 1, 1, 1, 1}, // Manifold -> any kind
+    {0, 1, 0, 0, 0}, // Border -> Border only
+    {0, 0, 1, 0, 0}, // Seam -> Seam only
+    {0, 0, 0, 1, 1}, // Complex -> Complex or Locked
+    {0, 0, 0, 0, 0}, // Locked never collapses
+};
+
+// if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge
+// note that for seam edges, the opposite edge isn't present in the attribute-based topology
+// but is present if you consider a position-only mesh variant
+const unsigned char kHasOpposite[Kind_Count][Kind_Count] = { // indexed by the kinds of the two edge endpoints; the table is symmetric, so their order does not matter
+    {1, 1, 1, 0, 1}, // Manifold paired with Manifold/Border/Seam/Complex/Locked
+    {1, 0, 1, 0, 0}, // Border
+    {1, 1, 1, 0, 1}, // Seam
+    {0, 0, 0, 0, 0}, // Complex: never assumed to have an opposite edge
+    {1, 0, 1, 0, 0}, // Locked
+};
+
+static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b)
+{
+	const unsigned int* it = adjacency.data + adjacency.offsets[a]; // start of the edge targets recorded for a
+	const unsigned int* end = it + adjacency.counts[a];
+
+	while (it != end)
+		if (*it++ == b)
+			return true; // a has a recorded edge that ends at b
+
+	return false;
+}
+
+static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge)
+{ // writes a VertexKind per vertex into result; loop/loopback receive the outgoing/incoming open-edge neighbour of each vertex
+	memset(loop, -1, vertex_count * sizeof(unsigned int));
+	memset(loopback, -1, vertex_count * sizeof(unsigned int));
+
+	// incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
+	// note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
+	// but here it's okay to fill the data out for other types of vertices as well
+	unsigned int* openinc = loopback;
+	unsigned int* openout = loop;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int vertex = unsigned(i);
+
+		unsigned int count = adjacency.counts[vertex];
+		const unsigned int* data = adjacency.data + adjacency.offsets[vertex];
+
+		for (size_t j = 0; j < count; ++j)
+		{
+			unsigned int target = data[j];
+
+			if (!hasEdge(adjacency, target, vertex)) // edge vertex->target is open when the reverse edge does not exist
+			{
+				openinc[target] = (openinc[target] == ~0u) ? vertex : target; // a second open edge stores the vertex's own index as a "more than one" marker
+				openout[vertex] = (openout[vertex] == ~0u) ? target : vertex;
+			}
+		}
+	}
+
+#if TRACE
+	size_t lockedstats[4] = {};
+#define TRACELOCKED(i) lockedstats[i]++;
+#else
+#define TRACELOCKED(i) (void)0
+#endif
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		if (remap[i] == i) // canonical vertex for its position: classify it here
+		{
+			if (wedge[i] == i)
+			{
+				// no attribute seam, need to check if it's manifold
+				unsigned int openi = openinc[i], openo = openout[i];
+
+				// note: we classify any vertices with no open edges as manifold
+				// this is technically incorrect - if 4 triangles share an edge, we'll classify vertices as manifold
+				// it's unclear if this is a problem in practice
+				if (openi == ~0u && openo == ~0u)
+				{
+					result[i] = Kind_Manifold;
+				}
+				else if (openi != i && openo != i) // neither direction hit the "more than one open edge" marker
+				{
+					result[i] = Kind_Border;
+				}
+				else
+				{
+					result[i] = Kind_Locked;
+					TRACELOCKED(0);
+				}
+			}
+			else if (wedge[wedge[i]] == i) // the wedge loop has exactly two entries: i and w
+			{
+				// attribute seam; need to distinguish between Seam and Locked
+				unsigned int w = wedge[i];
+				unsigned int openiv = openinc[i], openov = openout[i];
+				unsigned int openiw = openinc[w], openow = openout[w];
+
+				// seam should have one open half-edge for each vertex, and the edges need to "connect" - point to the same vertex post-remap
+				if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
+				    openiw != ~0u && openiw != w && openow != ~0u && openow != w)
+				{
+					if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw])
+					{
+						result[i] = Kind_Seam;
+					}
+					else
+					{
+						result[i] = Kind_Locked;
+						TRACELOCKED(1);
+					}
+				}
+				else
+				{
+					result[i] = Kind_Locked;
+					TRACELOCKED(2);
+				}
+			}
+			else
+			{
+				// more than one vertex maps to this one; we don't have classification available
+				result[i] = Kind_Locked;
+				TRACELOCKED(3);
+			}
+		}
+		else
+		{
+			assert(remap[i] < i); // the canonical vertex has a lower index, so its kind has already been computed
+
+			result[i] = result[remap[i]];
+		}
+	}
+
+#if TRACE
+	printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n",
+	       int(lockedstats[0]), int(lockedstats[1]), int(lockedstats[2]), int(lockedstats[3]));
+#endif
+}
+
+struct Vector3
+{ // plain 3-component float vector; holds the rescaled positions and the temporary edge/normal vectors of the quadric code
+	float x, y, z;
+};
+
+static void rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+{ // copies the positions into result, translated to the bounding-box minimum and scaled uniformly so the largest extent becomes 1
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		result[i].x = v[0]; // raw copy for now; rescaled in the second pass once the bounds are known
+		result[i].y = v[1];
+		result[i].z = v[2];
+
+		for (int j = 0; j < 3; ++j)
+		{
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
+		}
+	}
+
+	float extent = 0.f;
+
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]); // extent = largest of the three axis extents
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	float scale = extent == 0 ? 0.f : 1.f / extent; // a mesh with zero extent collapses to the origin instead of dividing by zero
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		result[i].x = (result[i].x - minv[0]) * scale; // one scale for all axes keeps the mesh proportions
+		result[i].y = (result[i].y - minv[1]) * scale;
+		result[i].z = (result[i].z - minv[2]) * scale;
+	}
+}
+
+struct Quadric
+{ // weighted quadric; quadricError evaluates (c + 2 * dot(b, v) + v^T A v) / w for it
+	float a00, a11, a22; // diagonal of the symmetric 3x3 matrix A
+	float a10, a20, a21; // off-diagonal entries of A (xy, xz, yz)
+	float b0, b1, b2, c; // b: half of the linear coefficients (quadricError doubles them); c: constant term
+	float w; // accumulated weight, used by quadricError to normalize the result
+};
+
+struct Collapse
+{ // candidate edge collapse between vertices v0 and v1
+	unsigned int v0;
+	unsigned int v1;
+
+	union
+	{ // the three members share storage, so only one interpretation is meaningful at a time
+		unsigned int bidi; // NOTE(review): presumably a flag for collapses allowed in both directions - confirm at the use site
+		float error; // NOTE(review): presumably the collapse cost that replaces bidi once it is computed - confirm
+		unsigned int errorui; // the same bits as error viewed as an integer; NOTE(review): presumably used for sorting - confirm
+	};
+};
+
+static float normalize(Vector3& v)
+{
+	// scales v to unit length in place and returns the length it had before; a zero-length v is left untouched
+	const float magnitude = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
+
+	if (!(magnitude > 0))
+		return magnitude;
+
+	v.x /= magnitude;
+	v.y /= magnitude;
+	v.z /= magnitude;
+	return magnitude;
+}
+
+static void quadricAdd(Quadric& Q, const Quadric& R)
+{
+	Q.w += R.w; // accumulated weight
+	Q.c += R.c; // constant term
+	Q.b2 += R.b2; // linear terms
+	Q.b1 += R.b1;
+	Q.b0 += R.b0;
+	Q.a21 += R.a21; // off-diagonal matrix terms
+	Q.a20 += R.a20;
+	Q.a10 += R.a10;
+	Q.a22 += R.a22; // diagonal matrix terms
+	Q.a11 += R.a11;
+	Q.a00 += R.a00;
+}
+
+static float quadricError(const Quadric& Q, const Vector3& v)
+{ // returns |c + 2 * dot(b, v) + v^T A v| divided by the quadric weight (0 when the weight is 0)
+	float rx = Q.b0;
+	float ry = Q.b1;
+	float rz = Q.b2;
+
+	rx += Q.a10 * v.y; // each off-diagonal term is added to exactly one of the three partial sums
+	ry += Q.a21 * v.z;
+	rz += Q.a20 * v.x;
+
+	rx *= 2; // linear and off-diagonal terms enter the quadratic form with a factor of 2
+	ry *= 2;
+	rz *= 2;
+
+	rx += Q.a00 * v.x; // diagonal terms are added after the doubling, so they count once
+	ry += Q.a11 * v.y;
+	rz += Q.a22 * v.z;
+
+	float r = Q.c;
+	r += rx * v.x;
+	r += ry * v.y;
+	r += rz * v.z;
+
+	float s = Q.w == 0.f ? 0.f : 1.f / Q.w; // avoid dividing by a zero weight
+
+	return fabsf(r) * s;
+}
+
+static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
+{ // sets Q to the weighted squared plane expression w * (a*x + b*y + c*z + d)^2
+	float aw = a * w;
+	float bw = b * w;
+	float cw = c * w;
+	float dw = d * w;
+
+	Q.a00 = a * aw;
+	Q.a11 = b * bw;
+	Q.a22 = c * cw;
+	Q.a10 = a * bw;
+	Q.a20 = a * cw;
+	Q.a21 = b * cw;
+	Q.b0 = a * dw; // stored without the factor 2 of the expansion; quadricError applies it
+	Q.b1 = b * dw;
+	Q.b2 = c * dw;
+	Q.c = d * dw;
+	Q.w = w;
+}
+
+static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
+{
+	// encodes w * ((x - X)^2 + (y - Y)^2 + (z - Z)^2); quadricError doubles the b terms, so b = -w * p yields the -2 * w * p.X cross term
+	Q.a00 = w;
+	Q.a11 = w;
+	Q.a22 = w;
+	Q.a10 = 0.f;
+	Q.a20 = 0.f;
+	Q.a21 = 0.f;
+	Q.b0 = -x * w; // was -2 * x * w, which made the evaluated error non-zero at the point itself
+	Q.b1 = -y * w;
+	Q.b2 = -z * w;
+	Q.c = (x * x + y * y + z * z) * w;
+	Q.w = w;
+}
+
+static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
+{
+	// edge vectors sharing the corner p0
+	Vector3 e1 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	Vector3 e2 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+	// n = cross(e1, e2); normalize() returns its length, which is proportional to the triangle area
+	Vector3 n = {e1.y * e2.z - e1.z * e2.y, e1.z * e2.x - e1.x * e2.z, e1.x * e2.y - e1.y * e2.x};
+	float area = normalize(n);
+
+	// plane through p0 with normal n: dot(n, p) - dot(n, p0) = 0
+	// sqrtf(area) is used so that the error is scaled linearly; this tends to improve silhouettes
+	quadricFromPlane(Q, n.x, n.y, n.z, -(n.x * p0.x + n.y * p0.y + n.z * p0.z), sqrtf(area) * weight);
+}
+
+static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
+{
+	// direction of the edge p0->p1; normalize() hands back the edge length
+	Vector3 dir = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+	float edge_length = normalize(dir);
+
+	// signed length of the projection of p2 - p0 onto the edge direction
+	Vector3 to_p2 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+	float along = to_p2.x * dir.x + to_p2.y * dir.y + to_p2.z * dir.z;
+
+	// removing the along-edge component leaves the altitude from p2 onto the edge; it becomes the plane normal
+	Vector3 normal = {to_p2.x - dir.x * along, to_p2.y - dir.y * along, to_p2.z - dir.z * along};
+	normalize(normal);
+
+	// note: the weight is scaled linearly with edge length; this has to match the triangle weight
+	float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+	quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, edge_length * weight);
+}
+
+static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
+{
+	// every triangle contributes its unit-weight plane quadric to each of its three corners;
+	// the quadric is stored at the remap[] entry of the corner, not at the corner index itself
+	for (size_t base = 0; base < index_count; base += 3)
+	{
+		const unsigned int* tri = indices + base;
+
+		Quadric face;
+		quadricFromTriangle(face, vertex_positions[tri[0]], vertex_positions[tri[1]], vertex_positions[tri[2]], 1.f);
+
+		// corners are visited in index order
+		for (int corner = 0; corner < 3; ++corner)
+			quadricAdd(vertex_quadrics[remap[tri[corner]]], face);
+	}
+}
+
+static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
+{ // adds an extra edge-plane quadric for qualifying border/seam edges to both edge endpoints (stored via remap[])
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			// check that either i0 or i1 are border/seam and are on the same edge loop
+			// note that we need to add the error even for edges that connect e.g. border & locked
+			// if we don't do that, the adjacent border->border edge won't have correct errors for corners
+			if (k0 != Kind_Border && k0 != Kind_Seam && k1 != Kind_Border && k1 != Kind_Seam)
+				continue;
+			// a border/seam i0 must have i1 as its loop[] entry
+			if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+			// a border/seam i1 must have i0 as its loopback[] entry
+			if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
+				continue;
+
+			// seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+			// i2 is the triangle corner opposite to the edge
+			unsigned int i2 = indices[i + next[next[e]]];
+
+			// we try hard to maintain border edge geometry; seam edges can move more freely
+			// due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical
+			const float kEdgeWeightSeam = 1.f;
+			const float kEdgeWeightBorder = 10.f;
+
+			float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
+			// the plane built by quadricFromTriangleEdge contains the edge and is perpendicular to the triangle
+			Quadric Q;
+			quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+
+			quadricAdd(vertex_quadrics[remap[i0]], Q);
+			quadricAdd(vertex_quadrics[remap[i1]], Q);
+		}
+	}
+}
+
+static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+{ // collects candidate collapses (at most one per triangle edge, so at most index_count entries); returns how many were written
+	size_t collapse_count = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			// this can happen either when input has a zero-length edge, or when we perform collapses for complex
+			// topology w/seams and collapse a manifold vertex that connects to both wedges onto one of them
+			// we leave edges like this alone since they may be important for preserving mesh integrity
+			if (remap[i0] == remap[i1])
+				continue;
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+
+			// the edge has to be collapsible in at least one direction
+			if (!(kCanCollapse[k0][k1] | kCanCollapse[k1][k0]))
+				continue;
+
+			// manifold and seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+			if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+				continue;
+
+			// two vertices are on a border or a seam, but there's no direct edge between them
+			// this indicates that they belong to two different edge loops and we should not collapse this edge
+			// loop[] tracks half edges so we only need to check i0->i1
+			if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+				continue;
+
+			// edge can be collapsed in either direction - we will pick the one with minimum error
+			// note: we evaluate error later during collapse ranking, here we just tag the edge as bidirectional
+			if (kCanCollapse[k0][k1] & kCanCollapse[k1][k0])
+			{
+				Collapse c = {i0, i1, {/* bidi= */ 1}};
+				collapses[collapse_count++] = c;
+			}
+			else
+			{
+				// edge can only be collapsed in one direction
+				unsigned int e0 = kCanCollapse[k0][k1] ? i0 : i1;
+				unsigned int e1 = kCanCollapse[k0][k1] ? i1 : i0;
+				// store the edge oriented along the allowed direction (e0 -> e1) with bidi cleared
+				Collapse c = {e0, e1, {/* bidi= */ 0}};
+				collapses[collapse_count++] = c;
+			}
+		}
+	}
+
+	return collapse_count;
+}
+
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const unsigned int* remap)
+{
+	// scores every candidate and orients it in the cheaper direction
+	for (Collapse* c = collapses; c != collapses + collapse_count; ++c)
+	{
+		const unsigned int fwd0 = c->v0;
+		const unsigned int fwd1 = c->v1;
+
+		// bidirectional edges also get the reversed collapse evaluated; unidirectional ones simply evaluate
+		// the same direction twice (bidi has to be read before error overwrites the shared storage)
+		const bool both_ways = c->bidi != 0;
+		const unsigned int rev0 = both_ways ? fwd1 : fwd0;
+		const unsigned int rev1 = both_ways ? fwd0 : fwd1;
+
+		// error of placing the source vertex at the target position, measured with the source's quadric
+		const float fwd_error = quadricError(vertex_quadrics[remap[fwd0]], vertex_positions[fwd1]);
+		const float rev_error = quadricError(vertex_quadrics[remap[rev0]], vertex_positions[rev1]);
+
+		// keep the cheaper direction; ties go to the stored orientation
+		const bool keep_forward = fwd_error <= rev_error;
+
+		c->v0 = keep_forward ? fwd0 : rev0;
+		c->v1 = keep_forward ? fwd1 : rev1;
+		c->error = keep_forward ? fwd_error : rev_error;
+	}
+}
+
+#if TRACE > 1
+static void dumpEdgeCollapses(const Collapse* collapses, size_t collapse_count, const unsigned char* vertex_kind)
+{ // TRACE-only: prints, per (source kind, target kind) pair, the number of candidates and their minimum error
+	size_t ckinds[Kind_Count][Kind_Count] = {};
+	float cerrors[Kind_Count][Kind_Count] = {};
+	// start every minimum at FLT_MAX so the first candidate of a pair replaces it
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			cerrors[k0][k1] = FLT_MAX;
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		unsigned int i0 = collapses[i].v0;
+		unsigned int i1 = collapses[i].v1;
+
+		unsigned char k0 = vertex_kind[i0];
+		unsigned char k1 = vertex_kind[i1];
+
+		ckinds[k0][k1]++;
+		cerrors[k0][k1] = (collapses[i].error < cerrors[k0][k1]) ? collapses[i].error : cerrors[k0][k1];
+	}
+	// only pairs that actually occurred are reported
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			if (ckinds[k0][k1])
+				printf("collapses %d -> %d: %d, min error %e\n", k0, k1, int(ckinds[k0][k1]), cerrors[k0][k1]);
+}
+
+static void dumpLockedCollapses(const unsigned int* indices, size_t index_count, const unsigned char* vertex_kind)
+{ // TRACE-only: prints, per kind pair, how many triangle edges cannot be collapsed in either direction
+	size_t locked_collapses[Kind_Count][Kind_Count] = {};
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		static const int next[3] = {1, 2, 0};
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+
+			unsigned char k0 = vertex_kind[i0];
+			unsigned char k1 = vertex_kind[i1];
+			// the boolean result adds 1 exactly when kCanCollapse forbids both directions
+			locked_collapses[k0][k1] += !kCanCollapse[k0][k1] && !kCanCollapse[k1][k0];
+		}
+	}
+
+	for (int k0 = 0; k0 < Kind_Count; ++k0)
+		for (int k1 = 0; k1 < Kind_Count; ++k1)
+			if (locked_collapses[k0][k1])
+				printf("locked collapses %d -> %d: %d\n", k0, k1, int(locked_collapses[k0][k1]));
+}
+#endif
+
+static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count)
+{ // fills sort_order with collapse indices grouped by ascending error key; single counting-sort pass, so entries sharing a key keep input order
+	const int sort_bits = 11;
+
+	// fill histogram for counting sort
+	unsigned int histogram[1 << sort_bits];
+	memset(histogram, 0, sizeof(histogram));
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		// skip sign bit since error is non-negative
+		unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits); // the sort_bits bits that follow the sign bit
+
+		histogram[key]++;
+	}
+
+	// compute offsets based on histogram data
+	size_t histogram_sum = 0;
+	// exclusive prefix sum: histogram[k] becomes the first output slot of bucket k
+	for (size_t i = 0; i < 1 << sort_bits; ++i)
+	{
+		size_t count = histogram[i];
+		histogram[i] = unsigned(histogram_sum);
+		histogram_sum += count;
+	}
+
+	assert(histogram_sum == collapse_count);
+
+	// compute sort order based on offsets
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		// skip sign bit since error is non-negative
+		unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+		// each bucket's offset is advanced as it is consumed
+		sort_order[histogram[key]++] = unsigned(i);
+	}
+}
+
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, size_t triangle_collapse_goal, float error_goal, float error_limit)
+{ // applies candidates in collapse_order until a limit is hit, recording them in collapse_remap; returns the number of edges collapsed
+	size_t edge_collapses = 0;
+	size_t triangle_collapses = 0;
+
+	for (size_t i = 0; i < collapse_count; ++i)
+	{
+		const Collapse& c = collapses[collapse_order[i]];
+		// hard stop: the error exceeds the caller's limit
+		if (c.error > error_limit)
+			break;
+		// soft stop: past the per-pass error goal, but only once a tenth of the triangle goal has been reached
+		if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 10)
+			break;
+		// enough triangles have been removed in this pass
+		if (triangle_collapses >= triangle_collapse_goal)
+			break;
+
+		unsigned int i0 = c.v0;
+		unsigned int i1 = c.v1;
+
+		unsigned int r0 = remap[i0];
+		unsigned int r1 = remap[i1];
+
+		// we don't collapse vertices that had source or target vertex involved in a collapse
+		// it's important to not move the vertices twice since it complicates the tracking/remapping logic
+		// it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
+		if (collapse_locked[r0] | collapse_locked[r1])
+			continue;
+
+		assert(collapse_remap[r0] == r0);
+		assert(collapse_remap[r1] == r1);
+		// the target accumulates the quadric of the vertex that is folded into it
+		quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
+
+		if (vertex_kind[i0] == Kind_Complex)
+		{
+			unsigned int v = i0;
+			// walk the wedge[] cycle starting at i0 and send every vertex on it to r1
+			do
+			{
+				collapse_remap[v] = r1;
+				v = wedge[v];
+			} while (v != i0);
+		}
+		else if (vertex_kind[i0] == Kind_Seam)
+		{
+			// remap v0 to v1 and seam pair of v0 to seam pair of v1
+			unsigned int s0 = wedge[i0];
+			unsigned int s1 = wedge[i1];
+
+			assert(s0 != i0 && s1 != i1);
+			assert(wedge[s0] == i0 && wedge[s1] == i1);
+
+			collapse_remap[i0] = i1;
+			collapse_remap[s0] = s1;
+		}
+		else
+		{
+			assert(wedge[i0] == i0);
+
+			collapse_remap[i0] = i1;
+		}
+		// lock both endpoints for the rest of the pass
+		collapse_locked[r0] = 1;
+		collapse_locked[r1] = 1;
+
+		// border edges collapse 1 triangle, other edges collapse 2 or more
+		triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+		edge_collapses++;
+	}
+
+	return edge_collapses;
+}
+
+static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
+{
+	// rewrites the index buffer in place through collapse_remap and returns the number of indices kept
+	size_t kept = 0;
+	for (size_t read = 0; read < index_count; read += 3)
+	{
+		const unsigned int a = collapse_remap[indices[read + 0]];
+		const unsigned int b = collapse_remap[indices[read + 1]];
+		const unsigned int c = collapse_remap[indices[read + 2]];
+
+		// a vertex is never moved twice within a single pass, so remapped vertices must map to themselves
+		assert(collapse_remap[a] == a);
+		assert(collapse_remap[b] == b);
+		assert(collapse_remap[c] == c);
+
+		// triangles with two equal corners after remapping are degenerate and get dropped
+		if (a == b || a == c || b == c)
+			continue;
+		indices[kept + 0] = a;
+		indices[kept + 1] = b;
+		indices[kept + 2] = c;
+		kept += 3;
+	}
+
+	return kept;
+}
+
+static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsigned int* collapse_remap)
+{
+	for (size_t v = 0; v < vertex_count; ++v)
+	{
+		const unsigned int next = loop[v];
+		// entries equal to ~0u are left untouched
+		if (next == ~0u)
+			continue;
+
+		// v == moved is a special case: the seam edge was collapsed in a direction opposite to where the loop goes
+		const unsigned int moved = collapse_remap[next];
+		loop[v] = (v == moved) ? loop[next] : moved;
+	}
+}
+
+struct CellHasher
+{
+	const unsigned int* vertex_ids;
+
+	// hashes vertex i by the id stored for it in vertex_ids
+	size_t hash(unsigned int i) const
+	{
+		// MurmurHash2 finalizer
+		unsigned int mixed = vertex_ids[i];
+		mixed = (mixed ^ (mixed >> 13)) * 0x5bd1e995;
+		return mixed ^ (mixed >> 15);
+	}
+
+	// two vertices compare equal when their ids match
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		const unsigned int* ids = vertex_ids;
+		return ids[lhs] == ids[rhs];
+	}
+};
+
+struct IdHasher
+{
+	// hashes the id value itself
+	size_t hash(unsigned int id) const
+	{
+		// MurmurHash2 finalizer
+		unsigned int mixed = id;
+		mixed = (mixed ^ (mixed >> 13)) * 0x5bd1e995;
+		return mixed ^ (mixed >> 15);
+	}
+
+	// ids are compared by value
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		const bool differ = lhs != rhs;
+		return !differ;
+	}
+};
+
+struct TriangleHasher
+{
+	unsigned int* indices;
+
+	size_t hash(unsigned int i) const
+	{
+		// Optimized Spatial Hashing for Collision Detection of Deformable Objects
+		const unsigned int* t = indices + i * 3;
+		return (t[0] * 73856093) ^ (t[1] * 19349663) ^ (t[2] * 83492791);
+	}
+
+	// triangles match only if all three indices agree position by position
+	bool equal(unsigned int lhs, unsigned int rhs) const
+	{
+		const unsigned int *a = indices + lhs * 3, *b = indices + rhs * 3;
+		for (int k = 0; k < 3; ++k)
+			if (a[k] != b[k])
+				return false;
+		return true;
+	}
+};
+
+static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size)
+{
+	// grid_size is capped at 1024 so that each quantized axis fits into 10 bits of the packed id
+	assert(grid_size >= 1 && grid_size <= 1024);
+	const float scale = float(grid_size - 1);
+
+	for (size_t v = 0; v < vertex_count; ++v)
+	{
+		const Vector3& p = vertex_positions[v];
+
+		// scale, add 0.5 and truncate; NOTE(review): presumably positions are already rescaled into [0, 1] - confirm against callers
+		const int qx = int(p.x * scale + 0.5f), qy = int(p.y * scale + 0.5f), qz = int(p.z * scale + 0.5f);
+
+		vertex_ids[v] = (qx << 20) | (qy << 10) | qz;
+	}
+}
+
+static size_t countTriangles(const unsigned int* vertex_ids, const unsigned int* indices, size_t index_count)
+{
+	// counts the triangles whose three corners all carry distinct ids
+	size_t surviving = 0;
+
+	for (size_t t = 0; t < index_count; t += 3)
+	{
+		const unsigned int a = vertex_ids[indices[t + 0]];
+		const unsigned int b = vertex_ids[indices[t + 1]];
+		const unsigned int c = vertex_ids[indices[t + 2]];
+		if (a != b && a != c && b != c)
+			++surviving;
+	}
+	return surviving;
+}
+
+static size_t fillVertexCells(unsigned int* table, size_t table_size, unsigned int* vertex_cells, const unsigned int* vertex_ids, size_t vertex_count)
+{
+	// assigns consecutive cell indices to distinct vertex ids, in order of first appearance;
+	// returns the number of cells created
+	CellHasher hasher = {vertex_ids};
+	size_t cell_count = 0;
+
+	// the -1 byte fill turns every slot into ~0u, the empty marker passed to hashLookup2
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	for (size_t v = 0; v < vertex_count; ++v)
+	{
+		unsigned int* slot = hashLookup2(table, table_size, hasher, unsigned(v), ~0u);
+		const bool first_seen = (*slot == ~0u);
+
+		// the first vertex seen with a given id owns the table slot and opens a new cell
+		if (first_seen)
+			*slot = unsigned(v);
+
+		// later vertices with the same id reuse the owner's cell
+		vertex_cells[v] = first_seen ? unsigned(cell_count++) : vertex_cells[*slot];
+	}
+
+	return cell_count;
+}
+
+static size_t countVertexCells(unsigned int* table, size_t table_size, const unsigned int* vertex_ids, size_t vertex_count)
+{
+	// counts the distinct values in vertex_ids by inserting them into a freshly cleared hash table
+	IdHasher hasher;
+	size_t distinct = 0;
+
+	memset(table, -1, table_size * sizeof(unsigned int));
+
+	for (const unsigned int* id = vertex_ids; id != vertex_ids + vertex_count; ++id)
+	{
+		unsigned int* slot = hashLookup2(table, table_size, hasher, *id, ~0u);
+
+		// a slot still holding the empty marker (~0u) means this id has not been seen before
+		if (*slot == ~0u)
+			++distinct;
+		*slot = *id;
+	}
+	return distinct;
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* vertex_cells)
+{
+	for (size_t t = 0; t < index_count; t += 3)
+	{
+		const unsigned int v0 = indices[t + 0];
+		const unsigned int v1 = indices[t + 1];
+		const unsigned int v2 = indices[t + 2];
+
+		const unsigned int cell0 = vertex_cells[v0];
+		const unsigned int cell1 = vertex_cells[v1];
+		const unsigned int cell2 = vertex_cells[v2];
+
+		// a triangle whose corners all share one cell is added once with triple weight
+		// instead of three times with unit weight
+		const bool inside_one_cell = (cell0 == cell1) & (cell0 == cell2);
+		const float weight = inside_one_cell ? 3.f : 1.f;
+
+		Quadric face;
+		quadricFromTriangle(face, vertex_positions[v0], vertex_positions[v1], vertex_positions[v2], weight);
+
+		quadricAdd(cell_quadrics[cell0], face);
+
+		if (inside_one_cell)
+			continue;
+
+		quadricAdd(cell_quadrics[cell1], face);
+		quadricAdd(cell_quadrics[cell2], face);
+	}
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* vertex_cells)
+{
+	// overload without indices: every vertex adds a unit-weight point quadric to its cell
+	for (size_t v = 0; v < vertex_count; ++v)
+	{
+		const Vector3& p = vertex_positions[v];
+
+		Quadric point;
+		quadricFromPoint(point, p.x, p.y, p.z, 1.f);
+
+		quadricAdd(cell_quadrics[vertex_cells[v]], point);
+	}
+}
+
+static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t cell_count, const unsigned int* vertex_cells, const Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count)
+{
+	// ~0u marks cells without a representative; cell_errors[cell] is only read once the cell has one
+	memset(cell_remap, -1, cell_count * sizeof(unsigned int));
+
+	for (size_t v = 0; v < vertex_count; ++v)
+	{
+		const unsigned int cell = vertex_cells[v];
+		const float error = quadricError(cell_quadrics[cell], vertex_positions[v]);
+
+		// keep the current representative unless the cell is empty or this vertex has a strictly lower error
+		if (cell_remap[cell] != ~0u && !(cell_errors[cell] > error))
+			continue;
+		cell_remap[cell] = unsigned(v);
+		cell_errors[cell] = error;
+	}
+}
+
+static size_t filterTriangles(unsigned int* destination, unsigned int* tritable, size_t tritable_size, const unsigned int* indices, size_t index_count, const unsigned int* vertex_cells, const unsigned int* cell_remap)
+{ // writes the cell-collapsed triangle list without duplicates to destination; returns the number of indices written
+	TriangleHasher hasher = {destination};
+
+	memset(tritable, -1, tritable_size * sizeof(unsigned int));
+
+	size_t result = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int c0 = vertex_cells[indices[i + 0]];
+		unsigned int c1 = vertex_cells[indices[i + 1]];
+		unsigned int c2 = vertex_cells[indices[i + 2]];
+		// only triangles whose corners fall into three different cells are kept
+		if (c0 != c1 && c0 != c2 && c1 != c2)
+		{
+			unsigned int a = cell_remap[c0];
+			unsigned int b = cell_remap[c1];
+			unsigned int c = cell_remap[c2];
+			// rotate (never reflect) the triangle so that its smallest index comes first; rotated duplicates then compare equal
+			if (b < a && b < c)
+			{
+				unsigned int t = a;
+				a = b, b = c, c = t;
+			}
+			else if (c < a && c < b)
+			{
+				unsigned int t = c;
+				c = b, b = a, a = t;
+			}
+			// store the triangle at the write position first: the hasher reads triangle data from destination
+			destination[result * 3 + 0] = a;
+			destination[result * 3 + 1] = b;
+			destination[result * 3 + 2] = c;
+			// the write position only advances if no equal triangle was stored before
+			unsigned int* entry = hashLookup2(tritable, tritable_size, hasher, unsigned(result), ~0u);
+
+			if (*entry == ~0u)
+				*entry = unsigned(result++);
+		}
+	}
+
+	return result * 3;
+}
+
+static float interpolate(float y, float x0, float y0, float x1, float y1, float x2, float y2)
+{
+	// three point interpolation from "revenge of interpolation search" paper: estimates x for the target y from samples (x0,y0), (x1,y1), (x2,y2)
+	float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
+	float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
+	return den == 0.f ? x1 : x1 + num / den; // degenerate samples would give NaN/inf, which callers convert to int; fall back to the middle sample
+}
+
+} // namespace meshopt
+
+#ifndef NDEBUG
+unsigned char* meshopt_simplifyDebugKind = 0; // if non-null, meshopt_simplify copies vertex_count vertex kinds here before returning
+unsigned int* meshopt_simplifyDebugLoop = 0; // if non-null, meshopt_simplify copies its final loop[] array (vertex_count entries) here
+unsigned int* meshopt_simplifyDebugLoopBack = 0; // if non-null, meshopt_simplify copies its final loopback[] array (vertex_count entries) here
+#endif
+
+size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+{ // simplifies the mesh by repeated passes of edge collapses; writes the resulting index list to destination and returns its length
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_index_count <= index_count);
+
+	meshopt_Allocator allocator;
+
+	unsigned int* result = destination;
+
+	// build adjacency information
+	EdgeAdjacency adjacency = {};
+	buildEdgeAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// build position remap that maps each vertex to the one with identical position
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, allocator);
+
+	// classify vertices; vertex kind determines collapse rules, see kCanCollapse
+	unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
+	unsigned int* loop = allocator.allocate<unsigned int>(vertex_count);
+	unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count);
+	classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge);
+
+#if TRACE
+	size_t unique_positions = 0;
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_positions += remap[i] == i;
+
+	printf("position remap: %d vertices => %d positions\n", int(vertex_count), int(unique_positions));
+
+	size_t kinds[Kind_Count] = {};
+	for (size_t i = 0; i < vertex_count; ++i)
+		kinds[vertex_kind[i]] += remap[i] == i;
+
+	printf("kinds: manifold %d, border %d, seam %d, complex %d, locked %d\n",
+	       int(kinds[Kind_Manifold]), int(kinds[Kind_Border]), int(kinds[Kind_Seam]), int(kinds[Kind_Complex]), int(kinds[Kind_Locked]));
+#endif
+	// all error computations below use the rescaled copy of the positions
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
+	memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
+
+	fillFaceQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap);
+	fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
+	// the passes edit the index list in place inside destination; skip the copy when destination aliases indices
+	if (result != indices)
+		memcpy(result, indices, index_count * sizeof(unsigned int));
+
+#if TRACE
+	size_t pass_count = 0;
+	float worst_error = 0;
+#endif
+	// scratch buffers reused by every pass; pickEdgeCollapses emits at most one candidate per index
+	Collapse* edge_collapses = allocator.allocate<Collapse>(index_count);
+	unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count);
+	unsigned int* collapse_remap = allocator.allocate<unsigned int>(vertex_count);
+	unsigned char* collapse_locked = allocator.allocate<unsigned char>(vertex_count);
+
+	size_t result_count = index_count;
+
+	// target_error input is linear; we need to adjust it to match quadricError units
+	float error_limit = target_error * target_error;
+
+	while (result_count > target_index_count)
+	{
+		size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop);
+
+		// no edges can be collapsed any more due to topology restrictions
+		if (edge_collapse_count == 0)
+			break;
+
+		rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, remap);
+
+#if TRACE > 1
+		dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind);
+#endif
+
+		sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
+
+		// most collapses remove 2 triangles; use this to establish a bound on the pass in terms of error limit
+		// note that edge_collapse_goal is an estimate; triangle_collapse_goal will be used to actually limit collapses
+		size_t triangle_collapse_goal = (result_count - target_index_count) / 3;
+		size_t edge_collapse_goal = triangle_collapse_goal / 2;
+
+		// we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked
+		// as they will share vertices with other successful collapses, we need to increase the acceptable error by this factor
+		const float kPassErrorBound = 1.5f;
+
+		float error_goal = edge_collapse_goal < edge_collapse_count ? edge_collapses[collapse_order[edge_collapse_goal]].error * kPassErrorBound : FLT_MAX;
+		// reset the per-pass state: identity remap and no locked vertices
+		for (size_t i = 0; i < vertex_count; ++i)
+			collapse_remap[i] = unsigned(i);
+
+		memset(collapse_locked, 0, vertex_count);
+
+		size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, triangle_collapse_goal, error_goal, error_limit);
+
+		// no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
+		if (collapses == 0)
+			break;
+		// apply the pass: update both edge loop tables, then rewrite and compact the index buffer
+		remapEdgeLoops(loop, vertex_count, collapse_remap);
+		remapEdgeLoops(loopback, vertex_count, collapse_remap);
+
+		size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
+		assert(new_count < result_count);
+
+#if TRACE
+		float pass_error = 0.f;
+		for (size_t i = 0; i < edge_collapse_count; ++i)
+		{
+			Collapse& c = edge_collapses[collapse_order[i]];
+
+			if (collapse_remap[c.v0] == c.v1)
+				pass_error = c.error;
+		}
+
+		pass_count++;
+		worst_error = (worst_error < pass_error) ? pass_error : worst_error;
+
+		printf("pass %d: triangles: %d -> %d, collapses: %d/%d (goal: %d), error: %e (limit %e goal %e)\n", int(pass_count), int(result_count / 3), int(new_count / 3), int(collapses), int(edge_collapse_count), int(edge_collapse_goal), pass_error, error_limit, error_goal);
+#endif
+
+		result_count = new_count;
+	}
+
+#if TRACE
+	printf("passes: %d, worst error: %e\n", int(pass_count), worst_error);
+#endif
+
+#if TRACE > 1
+	dumpLockedCollapses(result, result_count, vertex_kind);
+#endif
+
+#ifndef NDEBUG
+	if (meshopt_simplifyDebugKind)
+		memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
+
+	if (meshopt_simplifyDebugLoop)
+		memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+
+	if (meshopt_simplifyDebugLoopBack)
+		memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
+#endif
+
+	return result_count;
+}
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+{ // simplifies by merging vertices that share a cell of a uniform grid; writes the index list to destination and returns its length (0 when nothing is produced)
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+	assert(target_index_count <= index_count);
+
+	// we expect to get ~2 triangles/vertex in the output
+	size_t target_cell_count = target_index_count / 6;
+	// fewer than 6 target indices leave no cell to work with: produce an empty result
+	if (target_cell_count == 0)
+		return 0;
+
+	meshopt_Allocator allocator;
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	// find the optimal grid size using guided binary search
+#if TRACE
+	printf("source: %d vertices, %d triangles\n", int(vertex_count), int(index_count / 3));
+	printf("target: %d cells, %d triangles\n", int(target_cell_count), int(target_index_count / 3));
+#endif
+
+	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+	const int kInterpolationPasses = 5;
+
+	// invariant: # of triangles in min_grid <= target_count
+	int min_grid = 0;
+	int max_grid = 1025;
+	size_t min_triangles = 0;
+	size_t max_triangles = index_count / 3;
+
+	// instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
+	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass)
+	{
+		assert(min_triangles < target_index_count / 3);
+		assert(max_grid - min_grid > 1);
+
+		// we clamp the prediction of the grid size to make sure that the search converges
+		int grid_size = next_grid_size;
+		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;
+		// count the triangles whose corners get three distinct ids on this grid
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		size_t triangles = countTriangles(vertex_ids, indices, index_count);
+
+#if TRACE
+		printf("pass %d (%s): grid size %d, triangles %d, %s\n",
+		       pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
+		       grid_size, int(triangles),
+		       (triangles <= target_index_count / 3) ? "under" : "over");
+#endif
+		// the interpolated guess uses the bracket as it was before this sample is folded into it
+		float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
+		// shrink the bracket: the sample replaces whichever bound lies on its side of the target
+		if (triangles <= target_index_count / 3)
+		{
+			min_grid = grid_size;
+			min_triangles = triangles;
+		}
+		else
+		{
+			max_grid = grid_size;
+			max_triangles = triangles;
+		}
+
+		if (triangles == target_index_count / 3 || max_grid - min_grid <= 1)
+			break;
+
+		// we start by using interpolation search - it usually converges faster
+		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+	}
+	// no tested grid size stayed within the target with at least one triangle
+	if (min_triangles == 0)
+		return 0;
+
+	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+	// vertex_ids holds the ids of the last tested grid; recompute them for the chosen grid size
+	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+	// build a quadric for each target cell
+	Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+	memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+	fillCellQuadrics(cell_quadrics, indices, index_count, vertex_positions, vertex_cells);
+
+	// for each target cell, find the vertex with the minimal error
+	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+	float* cell_errors = allocator.allocate<float>(cell_count);
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+	// collapse triangles!
+	// note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
+	size_t tritable_size = hashBuckets2(min_triangles);
+	unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);
+
+	size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
+	assert(write <= target_index_count);
+
+#if TRACE
+	printf("result: %d cells, %d triangles (%d unfiltered)\n", int(cell_count), int(write / 3), int(min_triangles));
+#endif
+
+	return write;
+}
+
+size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count)
+{ // clusters vertices into grid cells and writes one cell_remap entry per cell to destination; returns the number of entries written
+	using namespace meshopt;
+
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0); // stride is given in bytes and must be a whole number of floats
+	assert(target_vertex_count <= vertex_count);
+
+	size_t target_cell_count = target_vertex_count;
+
+	if (target_cell_count == 0)
+		return 0; // nothing requested; destination is left untouched
+
+	meshopt_Allocator allocator;
+
+	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+	// find the optimal grid size using guided binary search
+#if TRACE
+	printf("source: %d vertices\n", int(vertex_count));
+	printf("target: %d cells\n", int(target_cell_count));
+#endif
+
+	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+	size_t table_size = hashBuckets2(vertex_count);
+	unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+	const int kInterpolationPasses = 5; // number of passes that use the interpolated guess before falling back to bisection
+
+	// invariant: # of vertices in min_grid <= target_count
+	int min_grid = 0; // exclusive lower bound: the clamp below keeps grid_size >= min_grid + 1
+	int max_grid = 1025; // exclusive upper bound: the clamp below keeps grid_size <= max_grid - 1
+	size_t min_vertices = 0;
+	size_t max_vertices = vertex_count;
+
+	// instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
+	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass) // at most 10 + kInterpolationPasses probes
+	{
+		assert(min_vertices < target_vertex_count);
+		assert(max_grid - min_grid > 1);
+
+		// we clamp the prediction of the grid size to make sure that the search converges
+		int grid_size = next_grid_size;
+		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;
+
+		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+		size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count);
+
+#if TRACE
+		printf("pass %d (%s): grid size %d, vertices %d, %s\n",
+		       pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
+		       grid_size, int(vertices),
+		       (vertices <= target_vertex_count) ? "under" : "over");
+#endif
+
+		float tip = interpolate(float(target_vertex_count), float(min_grid), float(min_vertices), float(grid_size), float(vertices), float(max_grid), float(max_vertices)); // uses the bracket as it was before this probe updates it
+
+		if (vertices <= target_vertex_count)
+		{
+			min_grid = grid_size; // probe fits the budget: raise the lower bound
+			min_vertices = vertices;
+		}
+		else
+		{
+			max_grid = grid_size; // probe exceeds the budget: lower the upper bound
+			max_vertices = vertices;
+		}
+
+		if (vertices == target_vertex_count || max_grid - min_grid <= 1)
+			break; // exact hit, or the bracket cannot shrink any further
+
+		// we start by using interpolation search - it usually converges faster
+		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+	}
+
+	if (min_vertices == 0)
+		return 0; // min_vertices is only ever set from a probe that stayed within the target; 0 means there is no usable grid size
+
+	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); // recompute ids for the accepted grid size; vertex_ids still holds the last probe
+	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+	// build a quadric for each target cell
+	Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+	memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+	fillCellQuadrics(cell_quadrics, vertex_positions, vertex_count, vertex_cells);
+
+	// for each target cell, find the vertex with the minimal error
+	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+	float* cell_errors = allocator.allocate<float>(cell_count);
+
+	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+	// copy results to the output
+	assert(cell_count <= target_vertex_count);
+	memcpy(destination, cell_remap, sizeof(unsigned int) * cell_count); // destination needs room for cell_count entries (asserted above to be <= target_vertex_count)
+
+#if TRACE
+	printf("result: %d cells\n", int(cell_count));
+#endif
+
+	return cell_count;
+}
diff --git a/thirdparty/meshoptimizer/spatialorder.cpp b/thirdparty/meshoptimizer/spatialorder.cpp
new file mode 100644
index 0000000000..b09f80ac6f
--- /dev/null
+++ b/thirdparty/meshoptimizer/spatialorder.cpp
@@ -0,0 +1,194 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Fabian Giesen. Decoding Morton codes. 2009
+namespace meshopt
+{
+
+// "Insert" two 0 bits after each of the 10 low bits of x
+inline unsigned int part1By2(unsigned int x) // spreads the low 10 bits of x so that bit k of the input ends up at bit 3*k of the result
+{
+	x &= 0x000003ff;                  // x = ---- ---- ---- ---- ---- --98 7654 3210
+	x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
+	x = (x ^ (x << 8)) & 0x0300f00f;  // x = ---- --98 ---- ---- 7654 ---- ---- 3210
+	x = (x ^ (x << 4)) & 0x030c30c3;  // x = ---- --98 ---- 76-- --54 ---- 32-- --10
+	x = (x ^ (x << 2)) & 0x09249249;  // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+	return x; // only bits 0, 3, 6, ..., 27 can be set; all input bits above bit 9 were discarded by the first mask
+}
+
+static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+{ // writes one 30-bit interleaved (Morton) key per vertex into result[0..vertex_count)
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float); // byte stride converted to a stride in floats
+
+	float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+	float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+	for (size_t i = 0; i < vertex_count; ++i) // first pass: per-axis bounding box of the first three floats of every vertex
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		for (int j = 0; j < 3; ++j)
+		{
+			float vj = v[j];
+
+			minv[j] = minv[j] > vj ? vj : minv[j];
+			maxv[j] = maxv[j] < vj ? vj : maxv[j];
+		}
+	}
+
+	float extent = 0.f; // becomes the largest of the three axis extents (never below 0)
+
+	extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+	extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+	extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+	float scale = extent == 0 ? 0.f : 1.f / extent; // zero extent: scale 0 avoids a division by zero and makes every key 0
+
+	// generate Morton order based on the position inside a unit cube
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		const float* v = vertex_positions_data + i * vertex_stride_float;
+
+		int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f); // + 0.5f before truncation rounds to nearest; one shared scale keeps the aspect ratio
+		int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
+		int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
+
+		result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2); // x occupies bits 0,3,6,..., y bits 1,4,7,..., z bits 2,5,8,...
+	}
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+{ // on return hist[d][p] is the number of keys whose p-th 10-bit digit is smaller than d
+	memset(hist, 0, sizeof(hist));
+
+	// compute 3 10-bit histograms in parallel
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int id = data[i];
+
+		hist[(id >> 0) & 1023][0]++; // digit 0: bits 0..9
+		hist[(id >> 10) & 1023][1]++; // digit 1: bits 10..19
+		hist[(id >> 20) & 1023][2]++; // digit 2: bits 20..29; bits 30 and 31 are ignored
+	}
+
+	unsigned int sumx = 0, sumy = 0, sumz = 0;
+
+	// replace histogram data with prefix histogram sums in-place
+	for (int i = 0; i < 1024; ++i)
+	{
+		unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
+
+		hist[i][0] = sumx; // store the running total before adding this bucket, i.e. an exclusive prefix sum
+		hist[i][1] = sumy;
+		hist[i][2] = sumz;
+
+		sumx += hx;
+		sumy += hy;
+		sumz += hz;
+	}
+
+	assert(sumx == count && sumy == count && sumz == count); // every key was counted exactly once per digit
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{ // scatters the element indices in source into destination, ordered by the pass-th 10-bit digit of their key
+	int bitoff = pass * 10;
+
+	for (size_t i = 0; i < count; ++i) // source is walked front to back, so elements with the same digit keep their relative order
+	{
+		unsigned int id = (keys[source[i]] >> bitoff) & 1023; // keys is indexed by element index, not by position in source
+
+		destination[hist[id][pass]++] = source[i]; // hist holds the next free slot for this digit and is advanced (hist is modified)
+	}
+}
+
+} // namespace meshopt
+
+void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // fills destination[old_index] = new_index, where the new order sorts vertices by the key from computeOrder
+	using namespace meshopt;
+
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0); // stride is given in bytes and must be a whole number of floats
+
+	meshopt_Allocator allocator;
+
+	unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
+	computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+
+	unsigned int hist[1024][3]; // fully initialized by computeHistogram (memset + prefix sums)
+	computeHistogram(hist, keys, vertex_count);
+
+	unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		destination[i] = unsigned(i); // start from the identity order; destination doubles as a sort buffer
+
+	// 3-pass radix sort computes the resulting order into scratch
+	radixPass(scratch, destination, keys, vertex_count, hist, 0); // least significant digit first; buffers alternate between passes
+	radixPass(destination, scratch, keys, vertex_count, hist, 1);
+	radixPass(scratch, destination, keys, vertex_count, hist, 2);
+
+	// since our remap table is mapping old=>new, we need to reverse it
+	for (size_t i = 0; i < vertex_count; ++i)
+		destination[scratch[i]] = unsigned(i); // scratch[i] is the old index that landed at sorted position i
+}
+
+void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{ // reorders whole triangles by the spatial order of their centroids; destination may be the same buffer as indices
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	(void)vertex_count; // vertex_count is otherwise only referenced inside the assert in the loop below
+
+	size_t face_count = index_count / 3;
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+	meshopt_Allocator allocator;
+
+	float* centroids = allocator.allocate<float>(face_count * 3); // tightly packed x, y, z per triangle
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* va = vertex_positions + a * vertex_stride_float;
+		const float* vb = vertex_positions + b * vertex_stride_float;
+		const float* vc = vertex_positions + c * vertex_stride_float;
+
+		centroids[i * 3 + 0] = (va[0] + vb[0] + vc[0]) / 3.f;
+		centroids[i * 3 + 1] = (va[1] + vb[1] + vc[1]) / 3.f;
+		centroids[i * 3 + 2] = (va[2] + vb[2] + vc[2]) / 3.f;
+	}
+
+	unsigned int* remap = allocator.allocate<unsigned int>(face_count);
+
+	meshopt_spatialSortRemap(remap, centroids, face_count, sizeof(float) * 3); // remap[i] = new position of triangle i
+
+	// support in-place remap
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy; // read from the copy so the scatter below cannot overwrite unread input
+	}
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		unsigned int r = remap[i];
+
+		destination[r * 3 + 0] = a; // the three indices of a triangle are moved together, in their original order
+		destination[r * 3 + 1] = b;
+		destination[r * 3 + 2] = c;
+	}
+}
diff --git a/thirdparty/meshoptimizer/stripifier.cpp b/thirdparty/meshoptimizer/stripifier.cpp
new file mode 100644
index 0000000000..8ce17ef3dc
--- /dev/null
+++ b/thirdparty/meshoptimizer/stripifier.cpp
@@ -0,0 +1,295 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <string.h>
+
+// This work is based on:
+// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996
+namespace meshopt
+{
+
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
+{ // returns the buffer slot of the triangle whose lowest-valence vertex has the smallest valence
+	unsigned int index = 0; // also the return value when buffer_size is 0
+	unsigned int iv = ~0u; // smallest per-triangle minimum valence seen so far
+
+	for (size_t i = 0; i < buffer_size; ++i)
+	{
+		unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
+		unsigned int v = (va < vb && va < vc) ? va : (vb < vc) ? vb : vc; // minimum of va, vb, vc
+
+		if (v < iv) // strict comparison: on ties the earliest slot wins
+		{
+			index = unsigned(i);
+			iv = v;
+		}
+	}
+
+	return index;
+}
+
+static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1)
+{ // finds the first buffered triangle containing the directed edge e0->e1; returns (slot << 2) | k, where k is the corner not on that edge, or -1
+	for (size_t i = 0; i < buffer_size; ++i)
+	{
+		unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+		if (e0 == a && e1 == b)
+			return (int(i) << 2) | 2; // edge a->b matched, remaining corner is c (index 2)
+		else if (e0 == b && e1 == c)
+			return (int(i) << 2) | 0; // edge b->c matched, remaining corner is a (index 0)
+		else if (e0 == c && e1 == a)
+			return (int(i) << 2) | 1; // edge c->a matched, remaining corner is b (index 1)
+	}
+
+	return -1; // no triangle in the buffer contains the edge with this orientation
+}
+
+} // namespace meshopt
+
+size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index)
+{ // converts a triangle list into a strip written to destination and returns the number of indices written; restart_index == 0 selects degenerate-triangle joins
+	assert(destination != indices);
+	assert(index_count % 3 == 0);
+
+	using namespace meshopt;
+
+	meshopt_Allocator allocator;
+
+	const size_t buffer_capacity = 8; // number of pending triangles searched for a strip continuation
+
+	unsigned int buffer[buffer_capacity][3] = {};
+	unsigned int buffer_size = 0;
+
+	size_t index_offset = 0; // next unread position in indices
+
+	unsigned int strip[2] = {}; // the two most recent vertices of the current strip
+	unsigned int parity = 0;
+
+	size_t strip_size = 0; // number of indices written to destination so far
+
+	// compute vertex valence; this is used to prioritize starting triangle for strips
+	unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
+	memset(valence, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		valence[index]++;
+	}
+
+	int next = -1; // encoded findStripNext result for the triangle that continues the strip, or -1 to start a new strip
+
+	while (buffer_size > 0 || index_offset < index_count)
+	{
+		assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3));
+
+		// fill triangle buffer
+		while (buffer_size < buffer_capacity && index_offset < index_count)
+		{
+			buffer[buffer_size][0] = indices[index_offset + 0];
+			buffer[buffer_size][1] = indices[index_offset + 1];
+			buffer[buffer_size][2] = indices[index_offset + 2];
+
+			buffer_size++;
+			index_offset += 3;
+		}
+
+		assert(buffer_size > 0);
+
+		if (next >= 0)
+		{
+			unsigned int i = next >> 2; // buffer slot of the continuing triangle
+			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+			unsigned int v = buffer[i][next & 3]; // the corner that is not on the shared edge, i.e. the new strip vertex
+
+			// ordered removal from the buffer
+			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+			buffer_size--;
+
+			// update vertex valences for strip start heuristic
+			valence[a]--;
+			valence[b]--;
+			valence[c]--;
+
+			// find next triangle (note that edge order flips on every iteration)
+			// in some cases we need to perform a swap to pick a different outgoing triangle edge
+			// for [a b c], the default strip edge is [b c], but we might want to use [a c]
+			int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]);
+			int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1; // only searched when the default edge has no continuation
+
+			if (cont < 0 && swap >= 0)
+			{
+				// [a b c] => [a b a c]
+				destination[strip_size++] = strip[0];
+				destination[strip_size++] = v;
+
+				// next strip has same winding
+				// ? a b => b a v
+				strip[1] = v;
+
+				next = swap;
+			}
+			else
+			{
+				// emit the next vertex in the strip
+				destination[strip_size++] = v;
+
+				// next strip has flipped winding
+				strip[0] = strip[1];
+				strip[1] = v;
+				parity ^= 1;
+
+				next = cont; // may be -1, in which case the next iteration starts a new strip
+			}
+		}
+		else
+		{
+			// if we didn't find anything, we need to find the next new triangle
+			// we use a heuristic to maximize the strip length
+			unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
+			unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+			// ordered removal from the buffer
+			memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+			buffer_size--;
+
+			// update vertex valences for strip start heuristic
+			valence[a]--;
+			valence[b]--;
+			valence[c]--;
+
+			// we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration
+			int ea = findStripNext(buffer, buffer_size, c, b);
+			int eb = findStripNext(buffer, buffer_size, a, c);
+			int ec = findStripNext(buffer, buffer_size, b, a);
+
+			// in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest
+			// triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear
+			// reasons - slightly improves the stripification efficiency
+			int mine = INT_MAX; // stays INT_MAX when none of the three edges has a neighbour, so no branch below is taken
+			mine = (ea >= 0 && mine > ea) ? ea : mine;
+			mine = (eb >= 0 && mine > eb) ? eb : mine;
+			mine = (ec >= 0 && mine > ec) ? ec : mine;
+
+			if (ea == mine)
+			{
+				// keep abc
+				next = ea;
+			}
+			else if (eb == mine)
+			{
+				// abc -> bca
+				unsigned int t = a;
+				a = b, b = c, c = t;
+
+				next = eb;
+			}
+			else if (ec == mine)
+			{
+				// abc -> cab
+				unsigned int t = c;
+				c = b, b = a, a = t;
+
+				next = ec;
+			}
+
+			if (restart_index)
+			{
+				if (strip_size)
+					destination[strip_size++] = restart_index; // separate from the previous strip; not needed before the very first one
+
+				destination[strip_size++] = a;
+				destination[strip_size++] = b;
+				destination[strip_size++] = c;
+
+				// new strip always starts with the same edge winding
+				strip[0] = b;
+				strip[1] = c;
+				parity = 1;
+			}
+			else
+			{
+				if (strip_size)
+				{
+					// connect last strip using degenerate triangles
+					destination[strip_size++] = strip[1];
+					destination[strip_size++] = a;
+				}
+
+				// note that we may need to flip the emitted triangle based on parity
+				// we always end up with outgoing edge "cb" in the end
+				unsigned int e0 = parity ? c : b;
+				unsigned int e1 = parity ? b : c;
+
+				destination[strip_size++] = a;
+				destination[strip_size++] = e0;
+				destination[strip_size++] = e1;
+
+				strip[0] = e0;
+				strip[1] = e1;
+				parity ^= 1;
+			}
+		}
+	}
+
+	return strip_size;
+}
+
+size_t meshopt_stripifyBound(size_t index_count) // returns five indices per input triangle as the worst-case output size
+{
+	assert(index_count % 3 == 0);
+
+	// worst case without restarts is 2 degenerate indices and 3 indices per triangle
+	// worst case with restarts is 1 restart index and 3 indices per triangle
+	return (index_count / 3) * 5; // sized for the larger of the two cases above (2 + 3 per triangle)
+}
+
+size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index)
+{ // expands a triangle strip into a triangle list, dropping degenerate triangles; returns the number of indices written
+	assert(destination != indices);
+
+	size_t offset = 0; // number of indices written to destination so far
+	size_t start = 0; // position in indices where the current strip begins
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		if (restart_index && indices[i] == restart_index) // restart_index == 0 disables restart handling entirely
+		{
+			start = i + 1;
+		}
+		else if (i - start >= 2) // a triangle needs at least three indices since the start of the strip
+		{
+			unsigned int a = indices[i - 2], b = indices[i - 1], c = indices[i];
+
+			// flip winding for odd triangles
+			if ((i - start) & 1)
+			{
+				unsigned int t = a;
+				a = b, b = t;
+			}
+
+			// although we use restart indices, strip swaps still produce degenerate triangles, so skip them
+			if (a != b && a != c && b != c)
+			{
+				destination[offset + 0] = a;
+				destination[offset + 1] = b;
+				destination[offset + 2] = c;
+				offset += 3;
+			}
+		}
+	}
+
+	return offset;
+}
+
+size_t meshopt_unstripifyBound(size_t index_count) // returns an upper bound on the number of indices meshopt_unstripify can write
+{
+	assert(index_count == 0 || index_count >= 3);
+
+	return (index_count == 0) ? 0 : (index_count - 2) * 3; // a strip of N indices holds at most N - 2 triangles, three indices each
+}
diff --git a/thirdparty/meshoptimizer/vcacheanalyzer.cpp b/thirdparty/meshoptimizer/vcacheanalyzer.cpp
new file mode 100644
index 0000000000..3682743820
--- /dev/null
+++ b/thirdparty/meshoptimizer/vcacheanalyzer.cpp
@@ -0,0 +1,73 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
+{ // simulates a timestamp-based vertex cache over the index buffer and returns the resulting counters and ratios
+	assert(index_count % 3 == 0);
+	assert(cache_size >= 3);
+	assert(warp_size == 0 || warp_size >= 3); // 0 disables the warp limit (see the flush condition below)
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexCacheStatistics result = {};
+
+	unsigned int warp_offset = 0; // vertices added to the current warp
+	unsigned int primgroup_offset = 0; // triangles added to the current primitive group
+
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	unsigned int timestamp = cache_size + 1; // start past cache_size so that zero-initialized timestamps count as misses
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		bool ac = (timestamp - cache_timestamps[a]) > cache_size; // true when the vertex would be a cache miss
+		bool bc = (timestamp - cache_timestamps[b]) > cache_size;
+		bool cc = (timestamp - cache_timestamps[c]) > cache_size;
+
+		// flush cache if triangle doesn't fit into warp or into the primitive buffer
+		if ((primgroup_size && primgroup_offset == primgroup_size) || (warp_size && warp_offset + ac + bc + cc > warp_size))
+		{
+			result.warps_executed += warp_offset > 0; // an empty warp is not counted
+
+			warp_offset = 0;
+			primgroup_offset = 0;
+
+			// reset cache
+			timestamp += cache_size + 1; // every stored timestamp is now more than cache_size behind, so all vertices miss
+		}
+
+		// update cache and add vertices to warp
+		for (int j = 0; j < 3; ++j)
+		{
+			unsigned int index = indices[i + j];
+
+			if (timestamp - cache_timestamps[index] > cache_size) // re-evaluated here because a flush above changes the outcome
+			{
+				cache_timestamps[index] = timestamp++; // only misses get a new timestamp; a hit does not refresh it
+				result.vertices_transformed++;
+				warp_offset++;
+			}
+		}
+
+		primgroup_offset++;
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += cache_timestamps[i] > 0; // a non-zero timestamp means the vertex was referenced at least once
+
+	result.warps_executed += warp_offset > 0; // account for the last, partially filled warp
+
+	result.acmr = index_count == 0 ? 0 : float(result.vertices_transformed) / float(index_count / 3); // transformed vertices per triangle
+	result.atvr = unique_vertex_count == 0 ? 0 : float(result.vertices_transformed) / float(unique_vertex_count); // transformed vertices per referenced vertex
+
+	return result;
+}
diff --git a/thirdparty/meshoptimizer/vcacheoptimizer.cpp b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
new file mode 100644
index 0000000000..fb8ade4b77
--- /dev/null
+++ b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
@@ -0,0 +1,473 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// This work is based on:
+// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+const size_t kCacheSizeMax = 16; // largest cache size the score tables cover
+const size_t kValenceMax = 8; // live triangle counts above this are clamped in vertexScore
+
+struct VertexScoreTable
+{
+	float cache[1 + kCacheSizeMax]; // indexed by 1 + cache position; slot 0 is used for position -1
+	float live[1 + kValenceMax]; // indexed by the live triangle count, clamped to kValenceMax
+};
+
+// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
+static const VertexScoreTable kVertexScoreTable = {
+    {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f}, // cache[]: 1 + kCacheSizeMax entries
+    {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f}, // live[]: 1 + kValenceMax entries
+};
+
+// Tuned to minimize the encoded index buffer size
+static const VertexScoreTable kVertexScoreTableStrip = {
+    {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f}, // cache[]: 1 + kCacheSizeMax entries
+    {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f}, // live[]: 1 + kValenceMax entries
+};
+
+struct TriangleAdjacency
+{
+	unsigned int* counts; // per vertex: number of entries in its triangle list
+	unsigned int* offsets; // per vertex: index in data where its triangle list starts
+	unsigned int* data; // triangle indices, grouped by vertex
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{ // allocates and fills the three adjacency arrays so that each vertex lists the triangles that reference it
+	size_t face_count = index_count / 3;
+
+	// allocate arrays
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count); // one entry per index, i.e. three per triangle
+
+	// fill triangle counts
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++; // a triangle that repeats a vertex is counted once per occurrence
+	}
+
+	// fill offset table
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset; // running sum of the counts of all earlier vertices
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i); // offsets temporarily serve as per-vertex write cursors
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i]; // each cursor advanced by exactly counts[i], so this restores the list start
+	}
+}
+
+static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
+{ // returns a vertex that still has live triangles, preferring the dead-end stack over input order; ~0u if none is left
+	// check dead-end stack
+	while (dead_end_top)
+	{
+		unsigned int vertex = dead_end[--dead_end_top]; // pops the entry; the caller's dead_end_top is updated through the reference
+
+		if (live_triangles[vertex] > 0)
+			return vertex;
+	}
+
+	// input order
+	while (input_cursor < vertex_count)
+	{
+		if (live_triangles[input_cursor] > 0)
+			return input_cursor; // the cursor is not advanced past the returned vertex
+
+		++input_cursor;
+	}
+
+	return ~0u;
+}
+
+static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+{ // returns the candidate with live triangles that has the highest priority, or ~0u if no candidate has live triangles
+	unsigned int best_candidate = ~0u;
+	int best_priority = -1; // below every computed priority, so the first live candidate is always accepted
+
+	for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
+	{
+		unsigned int vertex = *next_candidate;
+
+		// otherwise we don't need to process it
+		if (live_triangles[vertex] > 0)
+		{
+			int priority = 0; // priority used when the check below fails
+
+			// will it be in cache after fanning?
+			if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
+			{
+				priority = timestamp - cache_timestamps[vertex]; // position in cache
+			}
+
+			if (priority > best_priority) // strict comparison: on ties the earlier candidate is kept
+			{
+				best_candidate = vertex;
+				best_priority = priority;
+			}
+		}
+	}
+
+	return best_candidate;
+}
+
+static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
+{ // returns the sum of the table's cache-position term and live-triangle term for one vertex
+	assert(cache_position >= -1 && cache_position < int(kCacheSizeMax)); // -1 is a valid input and selects cache[0]
+
+	unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax; // keeps the index inside live[]
+
+	return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
+}
+
+static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
+{ // returns the first triangle at or after input_cursor whose emitted flag is clear, or ~0u if there is none
+	// input order
+	while (input_cursor < face_count)
+	{
+		if (!emitted_flags[input_cursor])
+			return input_cursor; // the cursor stays on the returned triangle
+
+		++input_cursor; // skipped triangles are never revisited because the caller's cursor is updated through the reference
+	}
+
+	return ~0u;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
+{ // writes the triangles of indices to destination in a greedy order driven by the vertex scores in table; supports destination == indices
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy; // read from the copy so output writes cannot clobber unread input
+	}
+
+	unsigned int cache_size = 16; // fixed size of the simulated cache; must not exceed kCacheSizeMax
+	assert(cache_size <= kCacheSizeMax);
+
+	size_t face_count = index_count / 3;
+
+	// build adjacency information
+	TriangleAdjacency adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// live triangle counts
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	// emitted flags
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	// compute initial vertex scores
+	float* vertex_scores = allocator.allocate<float>(vertex_count);
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		vertex_scores[i] = vertexScore(table, -1, live_triangles[i]); // position -1: no vertex is in the cache yet
+
+	// compute triangle scores
+	float* triangle_scores = allocator.allocate<float>(face_count);
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0];
+		unsigned int b = indices[i * 3 + 1];
+		unsigned int c = indices[i * 3 + 2];
+
+		triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
+	}
+
+	unsigned int cache_holder[2 * (kCacheSizeMax + 3)]; // two buffers, each large enough for a full cache plus the three new vertices
+	unsigned int* cache = cache_holder;
+	unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+	size_t cache_count = 0;
+
+	unsigned int current_triangle = 0; // processing always starts with triangle 0
+	unsigned int input_cursor = 1; // dead-end scan starts after triangle 0, which is emitted first
+
+	unsigned int output_triangle = 0;
+
+	while (current_triangle != ~0u)
+	{
+		assert(output_triangle < face_count);
+
+		unsigned int a = indices[current_triangle * 3 + 0];
+		unsigned int b = indices[current_triangle * 3 + 1];
+		unsigned int c = indices[current_triangle * 3 + 2];
+
+		// output indices
+		destination[output_triangle * 3 + 0] = a;
+		destination[output_triangle * 3 + 1] = b;
+		destination[output_triangle * 3 + 2] = c;
+		output_triangle++;
+
+		// update emitted flags
+		emitted_flags[current_triangle] = true;
+		triangle_scores[current_triangle] = 0;
+
+		// new triangle
+		size_t cache_write = 0;
+		cache_new[cache_write++] = a; // the triangle's vertices go to the front of the new cache
+		cache_new[cache_write++] = b;
+		cache_new[cache_write++] = c;
+
+		// old triangles
+		for (size_t i = 0; i < cache_count; ++i)
+		{
+			unsigned int index = cache[i];
+
+			if (index != a && index != b && index != c) // older entries follow in their previous order, without duplicating a, b, c
+			{
+				cache_new[cache_write++] = index;
+			}
+		}
+
+		unsigned int* cache_temp = cache;
+		cache = cache_new, cache_new = cache_temp;
+		cache_count = cache_write > cache_size ? cache_size : cache_write; // overflowing entries leave the cache but are still rescored below
+
+		// update live triangle counts
+		live_triangles[a]--;
+		live_triangles[b]--;
+		live_triangles[c]--;
+
+		// remove emitted triangle from adjacency data
+		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		for (size_t k = 0; k < 3; ++k)
+		{
+			unsigned int index = indices[current_triangle * 3 + k];
+
+			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbours_size = adjacency.counts[index];
+
+			for (size_t i = 0; i < neighbours_size; ++i)
+			{
+				unsigned int tri = neighbours[i];
+
+				if (tri == current_triangle)
+				{
+					neighbours[i] = neighbours[neighbours_size - 1]; // overwrite with the last entry and shrink; list order is not preserved
+					adjacency.counts[index]--;
+					break;
+				}
+			}
+		}
+
+		unsigned int best_triangle = ~0u;
+		float best_score = 0; // only triangles with a score above 0 can be selected
+
+		// update cache positions, vertex scores and triangle scores, and find next best triangle
+		for (size_t i = 0; i < cache_write; ++i)
+		{
+			unsigned int index = cache[i];
+
+			int cache_position = i >= cache_size ? -1 : int(i); // entries past cache_size are scored as not cached
+
+			// update vertex score
+			float score = vertexScore(table, cache_position, live_triangles[index]);
+			float score_diff = score - vertex_scores[index];
+
+			vertex_scores[index] = score;
+
+			// update scores of vertex triangles
+			const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
+			const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
+
+			for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+			{
+				unsigned int tri = *it;
+				assert(!emitted_flags[tri]);
+
+				float tri_score = triangle_scores[tri] + score_diff; // apply only this vertex's score change to the triangle total
+				assert(tri_score > 0);
+
+				if (best_score < tri_score)
+				{
+					best_triangle = tri;
+					best_score = tri_score;
+				}
+
+				triangle_scores[tri] = tri_score;
+			}
+		}
+
+		// step through input triangles in order if we hit a dead-end
+		current_triangle = best_triangle;
+
+		if (current_triangle == ~0u)
+		{
+			current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count); // ~0u here ends the loop
+		}
+	}
+
+	assert(input_cursor == face_count);
+	assert(output_triangle == face_count);
+}
+
+void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{ // forwards to meshopt_optimizeVertexCacheTable with the table tuned to minimize ACMR
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
+}
+
+void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+	meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip); // same optimizer as meshopt_optimizeVertexCache, driven by the strip score table instead
+}
+
+void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{ // reorders the triangles of an index buffer for a FIFO vertex cache of cache_size entries; writes index_count indices to destination
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(cache_size >= 3);
+
+	meshopt_Allocator allocator;
+
+	// guard for empty meshes: nothing is written to destination in this case
+	if (index_count == 0 || vertex_count == 0)
+		return;
+
+	// support in-place optimization: when destination aliases indices, read from a private copy instead
+	if (destination == indices)
+	{
+		unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+		memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+		indices = indices_copy;
+	}
+
+	size_t face_count = index_count / 3;
+
+	// build adjacency information (per-vertex triangle lists, read below through counts/offsets/data)
+	TriangleAdjacency adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	// live triangle counts: start at the adjacency count and drop by one per emitted triangle that uses the vertex
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	// cache time stamps: the value of `timestamp` at the moment the vertex last entered the simulated cache
+	unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+	memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+	// dead-end stack: each emitted triangle pushes its 3 indices and is emitted at most once, so index_count entries suffice
+	unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
+	unsigned int dead_end_top = 0;
+
+	// emitted flags: one byte per triangle, non-zero once the triangle has been written to destination
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+	memset(emitted_flags, 0, face_count);
+
+	unsigned int current_vertex = 0;
+
+	unsigned int timestamp = cache_size + 1; // starting past cache_size makes zero-initialized timestamps count as "not in cache" in the checks below
+	unsigned int input_cursor = 1; // vertex to restart from in case of dead-end
+
+	unsigned int output_triangle = 0;
+
+	while (current_vertex != ~0u)
+	{
+		const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
+
+		// emit all not-yet-emitted triangles adjacent to the current vertex
+		const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+		const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+
+		for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+		{
+			unsigned int triangle = *it;
+
+			if (!emitted_flags[triangle])
+			{
+				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+				// output indices
+				destination[output_triangle * 3 + 0] = a;
+				destination[output_triangle * 3 + 1] = b;
+				destination[output_triangle * 3 + 2] = c;
+				output_triangle++;
+
+				// update dead-end stack
+				dead_end[dead_end_top + 0] = a;
+				dead_end[dead_end_top + 1] = b;
+				dead_end[dead_end_top + 2] = c;
+				dead_end_top += 3;
+
+				// update live triangle counts
+				live_triangles[a]--;
+				live_triangles[b]--;
+				live_triangles[c]--;
+
+				// update cache info
+				// if vertex is not in cache (its time stamp is more than cache_size behind), put it in cache
+				if (timestamp - cache_timestamps[a] > cache_size)
+					cache_timestamps[a] = timestamp++;
+
+				if (timestamp - cache_timestamps[b] > cache_size)
+					cache_timestamps[b] = timestamp++;
+
+				if (timestamp - cache_timestamps[c] > cache_size)
+					cache_timestamps[c] = timestamp++;
+
+				// update emitted flags
+				emitted_flags[triangle] = true;
+			}
+		}
+
+		// next candidates are the ones we pushed to dead-end stack just now
+		const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
+
+		// get next vertex from the candidates; ~0u means none of them was selected
+		current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+
+		if (current_vertex == ~0u)
+		{
+			current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count); // fallback; a ~0u result here ends the loop
+		}
+	}
+
+	assert(output_triangle == face_count);
+}
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
new file mode 100644
index 0000000000..784c9a13db
--- /dev/null
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -0,0 +1,1265 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
+#if defined(__AVX__) || defined(__SSSE3__)
+#define SIMD_SSE
+#endif
+
+// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
+#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
+#undef SIMD_SSE
+#define SIMD_AVX
+#endif
+
+// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#endif
+
+// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#define SIMD_TARGET __attribute__((target("ssse3")))
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+#endif
+
+#ifndef SIMD_TARGET
+#define SIMD_TARGET
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <tmmintrin.h>
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+#ifdef _MSC_VER
+#include <intrin.h> // __cpuid
+#else
+#include <cpuid.h> // __cpuid
+#endif
+#endif
+
+#ifdef SIMD_AVX
+#include <immintrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
+#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
+#endif
+
+namespace meshopt
+{
+
+const unsigned char kVertexHeader = 0xa0;
+
+static int gEncodeVertexVersion = 0;
+
+const size_t kVertexBlockSizeBytes = 8192; // byte size of the per-block scratch buffer (`transposed` in the block decoders)
+const size_t kVertexBlockMaxSize = 256; // upper bound on vertices per block; also sizes the per-byte-stream scratch buffers
+const size_t kByteGroupSize = 16; // number of bytes encoded/decoded together as one group
+const size_t kByteGroupDecodeLimit = 24; // input bytes that must remain before a group is processed; group decoders may read this many
+const size_t kTailMaxSize = 32;
+
+static size_t getVertexBlockSize(size_t vertex_size)
+{
+	// number of whole vertices that fit into the scratch buffer
+	size_t block_size = kVertexBlockSizeBytes / vertex_size;
+
+	// round down to a multiple of the byte group size: every byte stream is encoded in whole groups,
+	// so a block that is not group-aligned would only waste bytes on padding
+	block_size -= block_size % kByteGroupSize;
+
+	return (block_size > kVertexBlockMaxSize) ? kVertexBlockMaxSize : block_size;
+}
+
+inline unsigned char zigzag8(unsigned char v)
+{
+	return (unsigned char)((v << 1) ^ (static_cast<signed char>(v) >> 7)); // maps the signed byte deltas 0, -1, 1, -2, ... to 0, 1, 2, 3, ...
+}
+
+inline unsigned char unzigzag8(unsigned char v)
+{
+	return (unsigned char)((v >> 1) ^ -(v & 1)); // magnitude comes from the upper 7 bits; an odd input flips every bit
+}
+
+#if TRACE
+struct Stats
+{
+	size_t size; // total encoded bytes; encodeVertexBlock adds the bytes written for this byte stream
+	size_t header; // total header bytes; encodeBytes adds header_size (TRACE > 1 only)
+	size_t bitg[4]; // number of groups encoded per bitslog2 value (TRACE > 1 only)
+	size_t bitb[4]; // encoded payload bytes per bitslog2 value (TRACE > 1 only)
+};
+
+Stats* bytestats; // entry currently being accumulated; set by encodeVertexBlock around each encodeBytes call and reset to 0 afterwards
+Stats vertexstats[256]; // one entry per byte position k within a vertex
+#endif
+
+static bool encodeBytesGroupZero(const unsigned char* buffer)
+{
+	unsigned char any_bits = 0; // OR of every byte in the group; stays 0 only for an all-zero group
+	for (size_t k = 0; k < kByteGroupSize; ++k)
+		any_bits |= buffer[k];
+
+	return any_bits == 0;
+}
+
+static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
+{
+	assert(bits >= 1 && bits <= 8);
+
+	if (bits == 1) // stores nothing, but only an all-zero group qualifies; size_t(-1) marks the mode as unusable
+		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
+
+	if (bits == 8) // raw copy of the whole group
+		return kByteGroupSize;
+
+	// packed part: `bits` bits per value; escaped part: one extra byte for every value >= escape
+	const unsigned char escape = (unsigned char)((1 << bits) - 1);
+	size_t total = kByteGroupSize * bits / 8;
+
+	for (size_t k = 0; k < kByteGroupSize; ++k)
+		total += (buffer[k] >= escape) ? 1 : 0;
+
+	return total;
+}
+
+static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
+{
+	assert(bits >= 1 && bits <= 8);
+
+	// 1-bit groups occupy no space in the stream
+	if (bits == 1)
+		return data;
+
+	// 8-bit groups are stored verbatim
+	if (bits == 8)
+	{
+		memcpy(data, buffer, kByteGroupSize);
+		return data + kByteGroupSize;
+	}
+
+	const size_t values_per_byte = 8 / bits;
+	assert(kByteGroupSize % values_per_byte == 0);
+
+	// any value >= escape is written as the all-ones code and spelled out in full after the packed part
+	const unsigned char escape = (unsigned char)((1 << bits) - 1);
+	unsigned char* out = data;
+
+	// packed part: values_per_byte codes per output byte, first value in the most significant bits
+	for (size_t base = 0; base < kByteGroupSize; base += values_per_byte)
+	{
+		unsigned char packed = 0;
+
+		for (size_t k = 0; k < values_per_byte; ++k)
+		{
+			unsigned char value = buffer[base + k];
+			packed = (unsigned char)((packed << bits) | (value < escape ? value : escape));
+		}
+
+		*out++ = packed;
+	}
+
+	// escaped part: one full byte per out-of-range value, in input order
+	for (size_t k = 0; k < kByteGroupSize; ++k)
+	{
+		if (buffer[k] >= escape)
+			*out++ = buffer[k];
+	}
+
+	return out;
+}
+
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+{ // encodes buffer as a 2-bit-per-group header followed by the groups; returns the new write position, or 0 if [data, data_end) is too small
+	assert(buffer_size % kByteGroupSize == 0);
+
+	unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes (each header byte holds four 2-bit entries)
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0;
+
+	data += header_size;
+
+	memset(header, 0, header_size); // entries are OR-ed in below, so the header must start zeroed
+
+	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit) // same headroom test the decoders apply before each group
+			return 0;
+
+		int best_bits = 8;
+		size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+
+		for (int bits = 1; bits < 8; bits *= 2) // try 1, 2 and 4 bits; a strictly smaller size wins, so ties keep the earlier choice
+		{
+			size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+
+			if (size < best_size)
+			{
+				best_bits = bits;
+				best_size = size;
+			}
+		}
+
+		int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
+		assert((1 << bitslog2) == best_bits);
+
+		size_t header_offset = i / kByteGroupSize;
+
+		header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2); // group g lives in bits (g % 4) * 2 of header byte g / 4
+
+		unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
+
+		assert(data + best_size == next); // the measured size must match what was actually written
+		data = next;
+
+#if TRACE > 1
+		bytestats->bitg[bitslog2]++;
+		bytestats->bitb[bitslog2] += best_size;
+#endif
+	}
+
+#if TRACE > 1
+	bytestats->header += header_size;
+#endif
+
+	return data;
+}
+
+static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{ // encodes one block of vertices one byte position at a time; returns the new write position, or 0 if the output ran out of space
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize];
+	assert(sizeof(buffer) % kByteGroupSize == 0);
+
+	// we sometimes encode elements we didn't fill when rounding to kByteGroupSize
+	memset(buffer, 0, sizeof(buffer));
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		size_t vertex_offset = k;
+
+		unsigned char p = last_vertex[k]; // byte k of the previous vertex; seeds the delta chain for this byte stream
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			buffer[i] = zigzag8(vertex_data[vertex_offset] - p); // zigzag-coded delta against byte k of the preceding vertex
+
+			p = vertex_data[vertex_offset];
+
+			vertex_offset += vertex_size;
+		}
+
+#if TRACE
+		const unsigned char* olddata = data;
+		bytestats = &vertexstats[k];
+#endif
+
+		data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); // size rounded up to whole byte groups
+		if (!data)
+			return 0;
+
+#if TRACE
+		bytestats = 0;
+		vertexstats[k].size += data - olddata;
+#endif
+	}
+
+	memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size); // the block's last vertex becomes the delta base for the next call
+
+	return data;
+}
+
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
+static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // scalar decoder for one group of kByteGroupSize bytes; returns the read position after the group
+#define READ() byte = *data++
+#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
+	// READ fetches the next packed byte; NEXT pops its top `bits` bits and, for the all-ones code, takes the value from data_var instead (which is read unconditionally)
+	unsigned char byte, enc, encv;
+	const unsigned char* data_var;
+
+	switch (bitslog2)
+	{
+	case 0:
+		memset(buffer, 0, kByteGroupSize); // all-zero group: nothing is consumed from the input
+		return data;
+	case 1:
+		data_var = data + 4; // escaped bytes start right after the 4 packed bytes
+
+		// 4 groups with 4 2-bit values in each byte
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+		READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+
+		return data_var;
+	case 2:
+		data_var = data + 8; // escaped bytes start right after the 8 packed bytes
+
+		// 8 groups with 2 4-bit values in each byte
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+		READ(), NEXT(4), NEXT(4);
+
+		return data_var;
+	case 3:
+		memcpy(buffer, data, kByteGroupSize); // raw group: bytes are stored verbatim
+		return data + kByteGroupSize;
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+
+#undef READ
+#undef NEXT
+}
+
+static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{ // scalar inverse of encodeBytes: decodes buffer_size bytes into buffer; returns the new read position, or 0 if the input is too short
+	assert(buffer_size % kByteGroupSize == 0);
+
+	const unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes (each header byte holds four 2-bit entries)
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0;
+
+	data += header_size;
+
+	for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit) // checked before every group, whatever its encoding, so group decoders never read past data_end
+			return 0;
+
+		size_t header_offset = i / kByteGroupSize;
+
+		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; // 2-bit entry for this group
+
+		data = decodeBytesGroup(data, buffer + i, bitslog2);
+	}
+
+	return data;
+}
+
+static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{ // scalar inverse of encodeVertexBlock; returns the new read position, or 0 if the input is too short (vertex_data is then left untouched)
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize];
+	unsigned char transposed[kVertexBlockSizeBytes];
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); // byte streams are stored in whole groups
+
+	for (size_t k = 0; k < vertex_size; ++k)
+	{
+		data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
+		if (!data)
+			return 0;
+
+		size_t vertex_offset = k;
+
+		unsigned char p = last_vertex[k]; // byte k of the previous vertex; seeds the delta chain for this byte stream
+
+		for (size_t i = 0; i < vertex_count; ++i)
+		{
+			unsigned char v = unzigzag8(buffer[i]) + p; // undo zigzag, then add the delta to the previous value
+
+			transposed[vertex_offset] = v;
+			p = v;
+
+			vertex_offset += vertex_size;
+		}
+	}
+
+	memcpy(vertex_data, transposed, vertex_count * vertex_size);
+
+	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size); // the block's last vertex becomes the delta base for the next call
+
+	return data;
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+static unsigned char kDecodeBytesGroupShuffle[256][8];
+static unsigned char kDecodeBytesGroupCount[256];
+
+#ifdef __wasm__
+__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
+#endif
+static bool
+decodeBytesGroupBuildTables()
+{
+	for (int m = 0; m < 256; ++m)
+	{
+		unsigned char* row = kDecodeBytesGroupShuffle[m]; // shuffle control for the 8 lanes described by mask m
+		unsigned char set_bits = 0; // number of set mask bits seen so far
+
+		for (int lane = 0; lane < 8; ++lane)
+		{
+			if (m & (1 << lane))
+				row[lane] = set_bits++; // lane takes the next unread byte
+			else
+				row[lane] = 0x80; // marker for lanes that take no byte
+		}
+
+		kDecodeBytesGroupCount[m] = set_bits;
+	}
+
+	return true;
+}
+
+static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
+#endif
+
+#ifdef SIMD_SSE
+SIMD_TARGET
+static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{ // builds a 16-lane byte shuffle control from the table rows for the low (mask0) and high (mask1) 8 lanes
+	__m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
+	__m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
+	__m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]); // number of bytes already consumed by the low 8 lanes
+
+	__m128i sm1r = _mm_add_epi8(sm1, sm1off); // rebase high-lane indices past those bytes; 0x80 markers keep their top bit since the offset is at most 8
+
+	return _mm_unpacklo_epi64(sm0, sm1r);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // SSSE3 decoder for one 16-byte group; returns the read position after the group; may read up to 24 bytes from data
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128(); // all-zero group: nothing is consumed from the input
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	{
+#ifdef __GNUC__
+		typedef int __attribute__((aligned(1))) unaligned_int;
+#else
+		typedef int unaligned_int;
+#endif
+		// 4 packed bytes (16 2-bit codes) followed by the escaped bytes; `rest` may extend past the bytes actually used
+		__m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
+
+		__m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
+		__m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
+		__m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3)); // one 2-bit code per lane, in value order
+
+		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3)); // lanes holding the escape code 3
+		int mask16 = _mm_movemask_epi8(mask);
+		unsigned char mask0 = (unsigned char)(mask16 & 255);
+		unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+
+		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); // escaped lanes take bytes from rest, the others keep their code
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; // packed bytes plus one byte per escaped lane
+	}
+
+	case 2:
+	{
+		__m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data)); // 8 packed bytes (16 4-bit codes)
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8)); // this load is what reaches 24 bytes from data
+
+		__m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
+		__m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15)); // one 4-bit code per lane, in value order
+
+		__m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15)); // lanes holding the escape code 15
+		int mask16 = _mm_movemask_epi8(mask);
+		unsigned char mask0 = (unsigned char)(mask16 & 255);
+		unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+		__m128i shuf = decodeShuffleMask(mask0, mask1);
+
+		__m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); // escaped lanes take bytes from rest, the others keep their code
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; // packed bytes plus one byte per escaped lane
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); // raw group: 16 bytes stored verbatim
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_AVX
+static const __m128i decodeBytesGroupConfig[] = {
+    _mm_set1_epi8(3), // [0] escape code / value mask for 2-bit groups
+    _mm_set1_epi8(15), // [1] escape code / value mask for 4-bit groups
+    _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), // [2] per-lane multishift control for 2-bit groups
+    _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56), // [3] per-lane multishift control for 4-bit groups
+};
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // AVX-512 decoder for one 16-byte group; returns the read position after the group; may read up to 24 bytes from data
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		__m128i result = _mm_setzero_si128(); // all-zero group: nothing is consumed from the input
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data;
+	}
+
+	case 1:
+	case 2:
+	{
+		const unsigned char* skip = data + (bitslog2 << 2); // 4 packed bytes for 2-bit groups, 8 for 4-bit groups
+
+		__m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+		__m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
+
+		__m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; // escape code, also used as the value mask
+		__m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1]; // multishift control for this bit width
+
+		__m128i selw = _mm_shuffle_epi32(selb, 0x44); // duplicate the low 64 bits into both halves
+		__m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
+		__mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ); // lanes holding the escape code
+
+		__m128i result = _mm_mask_expand_epi8(sel, mask16, rest); // escaped lanes take consecutive bytes from rest, the others keep sel
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return skip + _mm_popcnt_u32(mask16); // packed bytes plus one byte per escaped lane
+	}
+
+	case 3:
+	{
+		__m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); // raw group: 16 bytes stored verbatim
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_NEON
+static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
+{ // routes bytes of rest0/rest1 into the lanes selected by mask0/mask1 through two 8-byte table lookups
+	uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
+	uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
+
+	uint8x8_t r0 = vtbl1_u8(rest0, sm0);
+	uint8x8_t r1 = vtbl1_u8(rest1, sm1); // no index rebasing here: rest1 is a separate vector indexed from 0
+
+	return vcombine_u8(r0, r1);
+}
+
+static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
+{ // collapses a per-lane byte mask into two 8-bit masks: mask0 for lanes 0-7 and mask1 for lanes 8-15, lane i mapping to bit i % 8
+	static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
+
+	uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
+	uint8x16_t masked = vandq_u8(mask, byte_mask); // keep one distinct bit per lane so that summing the lanes equals OR-ing them
+
+#ifdef __aarch64__
+	// aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
+	mask0 = vaddv_u8(vget_low_u8(masked));
+	mask1 = vaddv_u8(vget_high_u8(masked));
+#else
+	// we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
+	uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
+	uint8x8_t sum2 = vpadd_u8(sum1, sum1);
+	uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+
+	mask0 = vget_lane_u8(sum3, 0);
+	mask1 = vget_lane_u8(sum3, 1);
+#endif
+}
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{ // NEON decoder for one 16-byte group; returns the read position after the group
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		uint8x16_t result = vdupq_n_u8(0); // all-zero group: nothing is consumed from the input
+
+		vst1q_u8(buffer, result);
+
+		return data;
+	}
+
+	case 1:
+	{
+		uint8x8_t sel2 = vld1_u8(data); // loads 8 bytes; only the low 4 (16 2-bit codes) survive the zips below
+		uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
+		uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
+		uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3)); // one 2-bit code per lane, in value order
+
+		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3)); // lanes holding the escape code 3
+		unsigned char mask0, mask1;
+		neonMoveMask(mask, mask0, mask1);
+
+		uint8x8_t rest0 = vld1_u8(data + 4);
+		uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]); // starts after the escaped bytes used by lanes 0-7
+
+		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel); // escaped lanes take the shuffled bytes, the others keep their code
+
+		vst1q_u8(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; // packed bytes plus one byte per escaped lane
+	}
+
+	case 2:
+	{
+		uint8x8_t sel4 = vld1_u8(data); // 8 packed bytes (16 4-bit codes)
+		uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
+		uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]); // one 4-bit code per lane, in value order
+
+		uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15)); // lanes holding the escape code 15
+		unsigned char mask0, mask1;
+		neonMoveMask(mask, mask0, mask1);
+
+		uint8x8_t rest0 = vld1_u8(data + 8);
+		uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]); // starts after the escaped bytes used by lanes 0-7
+
+		uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel); // escaped lanes take the shuffled bytes, the others keep their code
+
+		vst1q_u8(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; // packed bytes plus one byte per escaped lane
+	}
+
+	case 3:
+	{
+		uint8x16_t result = vld1q_u8(data); // raw group: 16 bytes stored verbatim
+
+		vst1q_u8(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{ // builds a 16-lane swizzle control from the table rows for the low (mask0) and high (mask1) 8 lanes
+	v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); // NOTE(review): 16-byte load from an 8-byte row; only the low 8 bytes are used, but the last row reads past the table - confirm this is acceptable
+	v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
+
+	v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); // NOTE(review): 16-byte load to fetch a single count byte; reads past the table for mask0 > 240 - confirm
+	sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); // broadcast byte 0 (the count for mask0) to every lane
+
+	v128_t sm1r = wasm_i8x16_add(sm1, sm1off); // rebase high-lane indices past the bytes consumed by the low 8 lanes
+
+	return wasmx_unpacklo_v64x2(sm0, sm1r);
+}
+
+SIMD_TARGET
+static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+{ // collapses a per-lane byte mask into two 8-bit masks: mask0 for lanes 0-7 and mask1 for lanes 8-15, lane i mapping to bit i % 8
+	v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3); // regroup so one 64-bit lane holds bytes 0-3 and 8-11, the other 4-7 and 12-15
+
+	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull; // bits 0-3, one per byte
+	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull; // bits 4-7, one per byte
+
+	// TODO: This can use v8x16_bitmask in the future
+	uint64_t mask_2 = mask_1a | mask_1b;
+	uint64_t mask_4 = mask_2 | (mask_2 >> 16);
+	uint64_t mask_8 = mask_4 | (mask_4 >> 8); // after both folds, byte 0 and byte 4 each hold a complete 8-bit mask
+
+	mask0 = uint8_t(mask_8);
+	mask1 = uint8_t(mask_8 >> 32);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+	// Wasm SIMD decoder for one 16-byte group; bitslog2 selects the layout: 0 = all zeros, 1 = 2-bit codes, 2 = 4-bit codes, 3 = raw bytes
+	// returns the read position after the group; may read up to 24 bytes from data no matter how many are consumed
+
+	switch (bitslog2)
+	{
+	case 0:
+	{
+		v128_t result = wasm_i8x16_splat(0); // all-zero group: nothing is consumed from the input
+
+		wasm_v128_store(buffer, result);
+
+		return data;
+	}
+
+	case 1:
+	{
+		v128_t sel2 = wasm_v128_load(data); // only the low 4 bytes (16 2-bit codes) survive the unpacks below
+		v128_t rest = wasm_v128_load(data + 4);
+
+		v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
+		v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
+		v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3)); // one 2-bit code per lane, in value order
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3)); // lanes holding the escape code 3
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask); // escaped lanes take bytes from rest, the others keep their code
+
+		wasm_v128_store(buffer, result);
+
+		return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; // packed bytes plus one byte per escaped lane
+	}
+
+	case 2:
+	{
+		v128_t sel4 = wasm_v128_load(data); // only the low 8 bytes (16 4-bit codes) survive the unpack below
+		v128_t rest = wasm_v128_load(data + 8); // this load is what reaches 24 bytes from data
+
+		v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
+		v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15)); // one 4-bit code per lane, in value order
+
+		v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15)); // lanes holding the escape code 15
+
+		unsigned char mask0, mask1;
+		wasmMoveMask(mask, mask0, mask1);
+
+		v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+		v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask); // escaped lanes take bytes from rest, the others keep their code
+
+		wasm_v128_store(buffer, result);
+
+		return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; // packed bytes plus one byte per escaped lane
+	}
+
+	case 3:
+	{
+		v128_t result = wasm_v128_load(data); // raw group: 16 bytes stored verbatim
+
+		wasm_v128_store(buffer, result);
+
+		return data + 16;
+	}
+
+	default:
+		assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+		return data;
+	}
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+SIMD_TARGET
+static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+{ // in-place 4x16 byte transpose: afterwards each output holds four 4-byte tuples (x0[i], x1[i], x2[i], x3[i]) of the inputs
+	__m128i t0 = _mm_unpacklo_epi8(x0, x1); // byte pairs (x0[i], x1[i]) for i = 0..7
+	__m128i t1 = _mm_unpackhi_epi8(x0, x1); // byte pairs (x0[i], x1[i]) for i = 8..15
+	__m128i t2 = _mm_unpacklo_epi8(x2, x3); // byte pairs (x2[i], x3[i]) for i = 0..7
+	__m128i t3 = _mm_unpackhi_epi8(x2, x3); // byte pairs (x2[i], x3[i]) for i = 8..15
+
+	x0 = _mm_unpacklo_epi16(t0, t2); // tuples for i = 0..3
+	x1 = _mm_unpackhi_epi16(t0, t2); // tuples for i = 4..7
+	x2 = _mm_unpacklo_epi16(t1, t3); // tuples for i = 8..11
+	x3 = _mm_unpackhi_epi16(t1, t3); // tuples for i = 12..15
+}
+
+SIMD_TARGET
+static __m128i unzigzag8(__m128i v)
+{ // per-byte vector form of the scalar unzigzag8: -(v & 1) ^ (v >> 1)
+	__m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); // 0x00 or 0xff per byte, from the low bit
+	__m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); // the shift is 16-bit wide, so mask off the bit carried in from the neighbouring byte
+
+	return _mm_xor_si128(xl, xr);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+{ // in-place 4x16 byte transpose: afterwards each output holds four 4-byte tuples (x0[i], x1[i], x2[i], x3[i]) of the inputs
+	uint8x16x2_t t01 = vzipq_u8(x0, x1); // byte pairs (x0[i], x1[i]); val[0] covers i = 0..7, val[1] covers i = 8..15
+	uint8x16x2_t t23 = vzipq_u8(x2, x3); // byte pairs (x2[i], x3[i]), split the same way
+
+	uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0])); // tuples for i = 0..7
+	uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1])); // tuples for i = 8..15
+
+	x0 = vreinterpretq_u8_u16(x01.val[0]);
+	x1 = vreinterpretq_u8_u16(x01.val[1]);
+	x2 = vreinterpretq_u8_u16(x23.val[0]);
+	x3 = vreinterpretq_u8_u16(x23.val[1]);
+}
+
+static uint8x16_t unzigzag8(uint8x16_t v)
+{ // per-byte vector form of the scalar unzigzag8: -(v & 1) ^ (v >> 1)
+	uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); // 0x00 or 0xff per byte, from the low bit
+	uint8x16_t xr = vshrq_n_u8(v, 1); // per-byte shift, so no masking is needed
+
+	return veorq_u8(xl, xr);
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+{ // in-place 4x16 byte transpose: afterwards each output holds four 4-byte tuples (x0[i], x1[i], x2[i], x3[i]) of the inputs
+	v128_t t0 = wasmx_unpacklo_v8x16(x0, x1); // byte pairs (x0[i], x1[i]) for i = 0..7
+	v128_t t1 = wasmx_unpackhi_v8x16(x0, x1); // byte pairs (x0[i], x1[i]) for i = 8..15
+	v128_t t2 = wasmx_unpacklo_v8x16(x2, x3); // byte pairs (x2[i], x3[i]) for i = 0..7
+	v128_t t3 = wasmx_unpackhi_v8x16(x2, x3); // byte pairs (x2[i], x3[i]) for i = 8..15
+
+	x0 = wasmx_unpacklo_v16x8(t0, t2); // tuples for i = 0..3
+	x1 = wasmx_unpackhi_v16x8(t0, t2); // tuples for i = 4..7
+	x2 = wasmx_unpacklo_v16x8(t1, t3); // tuples for i = 8..11
+	x3 = wasmx_unpackhi_v16x8(t1, t3); // tuples for i = 12..15
+}
+
+SIMD_TARGET
+static v128_t unzigzag8(v128_t v)
+{ // per-byte vector form of the scalar unzigzag8: -(v & 1) ^ (v >> 1)
+	v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1))); // 0x00 or 0xff per byte, from the low bit
+	v128_t xr = wasm_u8x16_shr(v, 1); // per-byte unsigned shift, so no masking is needed
+
+	return wasm_v128_xor(xl, xr);
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+SIMD_TARGET
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{ // SIMD counterpart of decodeBytes: decodes buffer_size bytes into buffer; returns the new read position, or 0 if the input is too short
+	assert(buffer_size % kByteGroupSize == 0);
+	assert(kByteGroupSize == 16);
+
+	const unsigned char* header = data;
+
+	// round number of groups to 4 to get number of header bytes (each header byte holds four 2-bit entries)
+	size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+	if (size_t(data_end - data) < header_size)
+		return 0;
+
+	data += header_size;
+
+	size_t i = 0;
+
+	// fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+	for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
+	{
+		size_t header_offset = i / kByteGroupSize;
+		unsigned char header_byte = header[header_offset / 4]; // i advances by 4 groups per iteration, so one header byte covers exactly these 4 groups
+
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
+		data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+	}
+
+	// slow-path: process remaining groups one at a time; also taken when the shared bounds check above fails
+	for (; i < buffer_size; i += kByteGroupSize)
+	{
+		if (size_t(data_end - data) < kByteGroupDecodeLimit)
+			return 0;
+
+		size_t header_offset = i / kByteGroupSize;
+
+		int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; // 2-bit entry for this group
+
+		data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+	}
+
+	return data;
+}
+
+SIMD_TARGET
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{ // SIMD decode of one block: reads from [data, data_end), writes vertex_count vertices to vertex_data and updates last_vertex; returns the advanced data pointer, or 0 if a byte stream fails to decode
+	assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+	unsigned char buffer[kVertexBlockMaxSize * 4]; // four decoded byte streams, stored vertex_count_aligned bytes apart
+	unsigned char transposed[kVertexBlockSizeBytes]; // reconstructed vertices, vertex_size bytes apart
+
+	size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); // round up to a multiple of kByteGroupSize (assumes it is a power of two)
+
+	for (size_t k = 0; k < vertex_size; k += 4) // each pass reconstructs 4 bytes (offset k..k+3) of every vertex
+	{
+		for (size_t j = 0; j < 4; ++j) // decode four consecutive byte streams into buffer
+		{
+			data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
+			if (!data)
+				return 0;
+		}
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
+#endif
+
+		PREP(); // pi: running previous value, seeded from last_vertex + k
+
+		unsigned char* savep = transposed + k; // output cursor: byte offset k of the first vertex
+
+		for (size_t j = 0; j < vertex_count_aligned; j += 16) // 16 vertices per iteration
+		{
+			LOAD(0); // r0..r3: 16 bytes at offset j from each of the four streams
+			LOAD(1);
+			LOAD(2);
+			LOAD(3);
+
+			r0 = unzigzag8(r0); // helper defined outside this view - presumably undoes zigzag coding of the deltas
+			r1 = unzigzag8(r1);
+			r2 = unzigzag8(r2);
+			r3 = unzigzag8(r3);
+
+			transpose8(r0, r1, r2, r3); // helper defined outside this view - presumably regroups bytes so each 32-bit lane belongs to one vertex
+
+			TEMP t0, t1, t2, t3;
+
+			GRP4(0); // low 32 bits of t0..t3 = 32-bit lanes 0..3 of r0
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3); // byte-wise add onto the running value pi, in vertex order
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3); // store 4 bytes per vertex, stepping savep by vertex_size
+
+			GRP4(1);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(2);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+			GRP4(3);
+			FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+			SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+		}
+	}
+
+	memcpy(vertex_data, transposed, vertex_count * vertex_size); // only the real (unaligned) vertex count is copied out
+
+	memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size); // final decoded vertex becomes the reference for the next block
+
+	return data;
+}
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+static unsigned int getCpuFeatures()
+{ // Runs CPUID leaf 1 and returns the third output register (ECX in both the MSVC and GCC/clang __cpuid conventions)
+	int cpuinfo[4] = {};
+#ifdef _MSC_VER
+	__cpuid(cpuinfo, 1); // MSVC form: fills cpuinfo[0..3]
+#else
+	__cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); // non-MSVC form: outputs are passed individually
+#endif
+	return cpuinfo[2];
+}
+
+unsigned int cpuid = getCpuFeatures(); // queried once, during static initialization
+#endif
+
+} // namespace meshopt
+
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{ // Encodes vertex_count vertices of vertex_size bytes into buffer; returns the number of bytes written, or 0 if buffer_size is too small or a block fails to encode
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+#if TRACE
+	memset(vertexstats, 0, sizeof(vertexstats));
+#endif
+
+	const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
+
+	unsigned char* data = buffer;
+	unsigned char* data_end = buffer + buffer_size;
+
+	if (size_t(data_end - data) < 1 + vertex_size) // need room for the header byte plus at least vertex_size bytes
+		return 0;
+
+	int version = gEncodeVertexVersion;
+
+	*data++ = (unsigned char)(kVertexHeader | version); // single header byte: format tag combined with the version
+
+	unsigned char first_vertex[256] = {}; // stays all-zero when vertex_count is 0
+	if (vertex_count > 0)
+		memcpy(first_vertex, vertex_data, vertex_size);
+
+	unsigned char last_vertex[256] = {};
+	memcpy(last_vertex, first_vertex, vertex_size); // the first vertex is the initial reference passed to encodeVertexBlock
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+	size_t vertex_offset = 0;
+
+	while (vertex_offset < vertex_count)
+	{
+		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; // a full block, or whatever remains for the last one
+
+		data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		if (!data)
+			return 0;
+
+		vertex_offset += block_size;
+	}
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; // i.e. max(vertex_size, kTailMaxSize)
+
+	if (size_t(data_end - data) < tail_size)
+		return 0;
+
+	// write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
+	if (vertex_size < kTailMaxSize)
+	{
+		memset(data, 0, kTailMaxSize - vertex_size); // zero padding precedes the vertex so the vertex occupies the final bytes
+		data += kTailMaxSize - vertex_size;
+	}
+
+	memcpy(data, first_vertex, vertex_size);
+	data += vertex_size;
+
+	assert(data >= buffer + tail_size);
+	assert(data <= buffer + buffer_size);
+
+#if TRACE
+	size_t total_size = data - buffer;
+
+	for (size_t k = 0; k < vertex_size; ++k) // one statistics line per byte offset within the vertex
+	{
+		const Stats& vsk = vertexstats[k];
+
+		printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
+
+#if TRACE > 1
+		printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
+		       int(vsk.header),
+		       int(vsk.bitg[0]), int(vsk.bitb[0]),
+		       int(vsk.bitg[1]), int(vsk.bitb[1]),
+		       int(vsk.bitg[2]), int(vsk.bitb[2]),
+		       int(vsk.bitg[3]), int(vsk.bitb[3]));
+#endif
+
+		printf("\n");
+	}
+#endif
+
+	return data - buffer;
+}
+
+size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
+{ // Returns a size bound built from whole blocks: 1 header byte + blocks * vertex_size * (group header bytes + data bytes) + tail
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+	size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size; // ceiling division: a partial last block counts as a full one
+
+	size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4; // groups per block divided by 4, rounded up
+	size_t vertex_block_data_size = vertex_block_size; // one byte per vertex for each byte position
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; // i.e. max(vertex_size, kTailMaxSize)
+
+	return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+}
+
+void meshopt_encodeVertexVersion(int version)
+{ // Stores the version that meshopt_encodeVertexBuffer writes into the stream header
+	assert(unsigned(version) <= 0); // the unsigned cast also rejects negatives, so only version 0 passes
+
+	meshopt::gEncodeVertexVersion = version;
+}
+
+int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
+{ // Decodes vertex_count vertices into destination; returns 0 on success, -1 for an unrecognized header or version, -2 for truncated or undecodable data, -3 if the bytes left after decoding do not equal the tail size
+	using namespace meshopt;
+
+	assert(vertex_size > 0 && vertex_size <= 256);
+	assert(vertex_size % 4 == 0);
+
+	const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0; // block decoder, selected below
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+	decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock; // runtime dispatch on bit 9 of the CPUID leaf-1 flags (SSSE3 per the Intel CPUID specification)
+#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decode = decodeVertexBlockSimd;
+#else
+	decode = decodeVertexBlock;
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	assert(gDecodeBytesGroupInitialized);
+	(void)gDecodeBytesGroupInitialized; // keeps the variable referenced when assert compiles away
+#endif
+
+	unsigned char* vertex_data = static_cast<unsigned char*>(destination);
+
+	const unsigned char* data = buffer;
+	const unsigned char* data_end = buffer + buffer_size;
+
+	if (size_t(data_end - data) < 1 + vertex_size) // need the header byte plus vertex_size bytes at the end of the stream
+		return -2;
+
+	unsigned char data_header = *data++;
+
+	if ((data_header & 0xf0) != kVertexHeader) // high nibble: format tag
+		return -1;
+
+	int version = data_header & 0x0f; // low nibble: version
+	if (version > 0)
+		return -1;
+
+	unsigned char last_vertex[256];
+	memcpy(last_vertex, data_end - vertex_size, vertex_size); // seed the reference vertex from the final vertex_size bytes of the stream
+
+	size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+	size_t vertex_offset = 0;
+
+	while (vertex_offset < vertex_count)
+	{
+		size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; // a full block, or whatever remains for the last one
+
+		data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+		if (!data)
+			return -2;
+
+		vertex_offset += block_size;
+	}
+
+	size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; // i.e. max(vertex_size, kTailMaxSize)
+
+	if (size_t(data_end - data) != tail_size) // exactly the tail must remain once all blocks are consumed
+		return -3;
+
+	return 0;
+}
+
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
+#undef SIMD_WASM
+#undef SIMD_FALLBACK
+#undef SIMD_TARGET
diff --git a/thirdparty/meshoptimizer/vertexfilter.cpp b/thirdparty/meshoptimizer/vertexfilter.cpp
new file mode 100644
index 0000000000..e7ad2c9d39
--- /dev/null
+++ b/thirdparty/meshoptimizer/vertexfilter.cpp
@@ -0,0 +1,825 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <math.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
+#if defined(__SSE2__)
+#define SIMD_SSE
+#endif
+
+// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
+#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <emmintrin.h>
+#include <stdint.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
+#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
+#endif
+
+namespace meshopt
+{
+
+#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
+template <typename T>
+static void decodeFilterOct(T* data, size_t count)
+{ // Scalar in-place decode of count 4-component elements: components 0..2 are rewritten as a unit-length vector scaled to max; component 3 is left untouched
+	const float max = float((1 << (sizeof(T) * 8 - 1)) - 1); // 2^(bits-1) - 1, e.g. 127 for a 1-byte T and 32767 for a 2-byte T
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float x = float(data[i * 4 + 0]);
+		float y = float(data[i * 4 + 1]);
+		float z = float(data[i * 4 + 2]) - fabsf(x) - fabsf(y);
+
+		// fixup octahedral coordinates for z<0
+		float t = (z >= 0.f) ? 0.f : z; // min(z, 0)
+
+		x += (x >= 0.f) ? t : -t; // moves x towards zero by |t|
+		y += (y >= 0.f) ? t : -t;
+
+		// compute normal length & scale
+		float l = sqrtf(x * x + y * y + z * z);
+		float s = max / l;
+
+		// rounded signed float->int
+		int xf = int(x * s + (x >= 0.f ? 0.5f : -0.5f)); // add +-0.5 before truncation to round half away from zero
+		int yf = int(y * s + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * s + (z >= 0.f ? 0.5f : -0.5f));
+
+		data[i * 4 + 0] = T(xf);
+		data[i * 4 + 1] = T(yf);
+		data[i * 4 + 2] = T(zf);
+	}
+}
+
+static void decodeFilterQuat(short* data, size_t count)
+{ // Scalar in-place decode of count quaternions stored as 4 shorts: rebuilds the missing component and writes all four scaled to 32767
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; ++i)
+	{
+		// recover scale from the stored fourth component with its two low bits (read below as qc) forced to 1
+		int sf = data[i * 4 + 3] | 3;
+		float ss = scale / float(sf);
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float x = float(data[i * 4 + 0]) * ss;
+		float y = float(data[i * 4 + 1]) * ss;
+		float z = float(data[i * 4 + 2]) * ss;
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float ww = 1.f - x * x - y * y - z * z;
+		float w = sqrtf(ww >= 0.f ? ww : 0.f);
+
+		// rounded signed float->int
+		int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
+		int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
+		int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
+		int wf = int(w * 32767.f + 0.5f); // w is never negative here, so a plain +0.5 suffices
+
+		int qc = data[i * 4 + 3] & 3; // output slot that receives the reconstructed w
+
+		// output order is dictated by input index
+		data[i * 4 + ((qc + 1) & 3)] = short(xf);
+		data[i * 4 + ((qc + 2) & 3)] = short(yf);
+		data[i * 4 + ((qc + 3) & 3)] = short(zf);
+		data[i * 4 + ((qc + 0) & 3)] = short(wf);
+	}
+}
+
+static void decodeFilterExp(unsigned int* data, size_t count)
+{ // Scalar in-place decode: every 32-bit word is replaced by the bit pattern of the float m * 2^e
+	for (size_t i = 0; i < count; ++i)
+	{
+		unsigned int v = data[i];
+
+		// decode mantissa and exponent
+		int m = int(v << 8) >> 8; // low 24 bits, sign-extended by the shift pair
+		int e = int(v) >> 24; // top 8 bits, sign-extended
+
+		union
+		{
+			float f;
+			unsigned int ui;
+		} u; // used to move between float and raw bit representations
+
+		// optimized version of ldexp(float(m), e)
+		u.ui = unsigned(e + 127) << 23; // biased exponent placed in the float exponent field, mantissa bits zero
+		u.f = u.f * float(m);
+
+		data[i] = u.ui;
+	}
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+inline uint64_t rotateleft64(uint64_t v, int x)
+{ // Rotates v left by x bits, via a compiler intrinsic where one is selected below and a shift/or fallback otherwise
+#if defined(_MSC_VER) && !defined(__clang__)
+	return _rotl64(v, x);
+// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
+// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
+#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
+	return __builtin_rotateleft64(v, x);
+#else
+	return (v << (x & 63)) | (v >> ((64 - x) & 63)); // masking keeps both shift counts within 0..63
+#endif
+}
+#endif
+
+#ifdef SIMD_SSE
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{ // SSE2 in-place octahedral decode of 8-bit elements, 4 elements (16 bytes) per iteration; byte 3 of each element is preserved
+	const __m128 sign = _mm_set1_ps(-0.f); // only the sign bit set in every lane
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 elements - presumably count is a multiple of 4; confirm at caller
+	{
+		__m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
+		__m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y))); // andnot with sign clears the sign bit, i.e. |x| and |y|
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign))); // xor flips the sign of t where x is negative
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll)); // uses the approximate reciprocal square root
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// combine xr/yr/zr into final value
+		__m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000)); // keep the original top byte of each element
+		res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
+		res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{ // SSE2 in-place octahedral decode of 16-bit elements, 4 elements (two 16-byte vectors) per iteration; the fourth short of each element is preserved
+	const __m128 sign = _mm_set1_ps(-0.f); // only the sign bit set in every lane
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 elements - presumably count is a multiple of 4; confirm at caller
+	{
+		__m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4])); // elements i, i+1
+		__m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4])); // elements i+2, i+3
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0))); // even 32-bit lanes of both vectors
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
+		__m128i yf = _mm_srai_epi32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		__m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1))); // odd 32-bit lanes of both vectors
+		__m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		__m128 x = _mm_cvtepi32_ps(xf);
+		__m128 y = _mm_cvtepi32_ps(yf);
+		__m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y))); // andnot with sign clears the sign bit, i.e. |x| and |y|
+
+		// fixup octahedral coordinates for z<0
+		__m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+		x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign))); // xor flips the sign of t where x is negative
+		y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+		// compute normal length & scale
+		__m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+		__m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll)); // full-precision sqrt and divide here
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		__m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
+		__m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
+
+		// patch in .w
+		res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000))); // top 16 bits of each 64-bit element come from the input
+		res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{ // SSE2 in-place quaternion decode, 4 quaternions (two 16-byte vectors) per iteration; each result is rotated into its final component order with scalar code
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 elements - presumably count is a multiple of 4; confirm at caller
+	{
+		__m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4])); // quaternions i, i+1
+		__m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4])); // quaternions i+2, i+3
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		__m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+		__m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		__m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
+		__m128i yf = _mm_srai_epi32(q4_xy, 16);
+		__m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
+		__m128i cf = _mm_srai_epi32(q4_zc, 16); // fourth stored short of each quaternion
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		__m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
+		__m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		__m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
+		__m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
+		__m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		__m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+		__m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
+
+		__m128 s = _mm_set1_ps(32767.f);
+
+		// rounded signed float->int
+		__m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+		__m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+		__m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+		__m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		__m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+		__m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		__m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
+		__m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
+
+		// store results to stack so that we can rotate using scalar instructions
+		uint64_t res[4];
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
+		_mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);
+
+		// rotate and store
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]); // one 64-bit word per quaternion, written over the input
+
+		out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4); // shift is 16 bits per unit of the stored fourth short; each is read before its own quaternion is overwritten
+		out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+		out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+		out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{ // SSE2 in-place decode, 4 words per iteration: each 32-bit word becomes the float m * 2^e
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 words - presumably count is a multiple of 4; confirm at caller
+	{
+		__m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		__m128i ef = _mm_srai_epi32(v, 24); // top 8 bits, sign-extended
+		__m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23); // biased exponent moved into the float exponent field
+
+		// decode 24-bit mantissa into floating-point value
+		__m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8);
+		__m128 m = _mm_cvtepi32_ps(mf);
+
+		__m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);
+
+		_mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
+inline float32x4_t vsqrtq_f32(float32x4_t x)
+{ // Approximates sqrt(x) per lane as x * rsqrt(x), built from the reciprocal-square-root estimate
+	float32x4_t r = vrsqrteq_f32(x); // NOTE(review): the estimate for x == 0 is presumably infinite, so 0 * inf may yield NaN - confirm callers tolerate it
+	r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
+	return vmulq_f32(r, x);
+}
+
+inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
+{ // Approximates x / y per lane as x * rcp(y), built from the reciprocal estimate
+	float32x4_t r = vrecpeq_f32(y);
+	r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
+	return vmulq_f32(x, r);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{ // NEON in-place octahedral decode of 8-bit elements, 4 elements (16 bytes) per iteration; byte 3 of each element is preserved
+	const int32x4_t sign = vdupq_n_s32(0x80000000); // float sign-bit mask in every lane
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 elements - presumably count is a multiple of 4; confirm at caller
+	{
+		int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
+		int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign)))); // xor flips the sign of t where x is negative
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll); // reciprocal square root estimate, used without a refinement step
+		float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// combine xr/yr/zr into final value
+		int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000)); // keep the original top byte of each element
+		res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
+		res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{ // NEON in-place octahedral decode of 16-bit elements, 4 elements (two 16-byte vectors) per iteration; the fourth short of each element is preserved
+	const int32x4_t sign = vdupq_n_s32(0x80000000); // float sign-bit mask in every lane
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 elements - presumably count is a multiple of 4; confirm at caller
+	{
+		int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4])); // elements i, i+1
+		int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4])); // elements i+2, i+3
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0]; // even 32-bit lanes of both vectors
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
+		int32x4_t yf = vshrq_n_s32(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1]; // odd 32-bit lanes of both vectors
+		int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		float32x4_t x = vcvtq_f32_s32(xf);
+		float32x4_t y = vcvtq_f32_s32(yf);
+		float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+		// fixup octahedral coordinates for z<0
+		float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+		x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign)))); // xor flips the sign of t where x is negative
+		y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+		// compute normal length & scale
+		float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+		float32x4_t rl = vrsqrteq_f32(ll);
+		rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
+		float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);
+
+		// patch in .w
+		res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0); // bit select: top 16 bits of each 64-bit element come from the input
+		res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);
+
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+		vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{ // NEON in-place quaternion decode, 4 quaternions (two 16-byte vectors) per iteration; each result is rotated into its final component order with scalar code
+	const float scale = 1.f / sqrtf(2.f);
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 elements - presumably count is a multiple of 4; confirm at caller
+	{
+		int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4])); // quaternions i, i+1
+		int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4])); // quaternions i+2, i+3
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
+		int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
+		int32x4_t yf = vshrq_n_s32(q4_xy, 16);
+		int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
+		int32x4_t cf = vshrq_n_s32(q4_zc, 16); // fourth stored short of each quaternion
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
+		float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
+		float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
+		float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+		float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
+
+		float32x4_t s = vdupq_n_f32(32767.f);
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+		int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+		int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+		int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+		int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+		int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+		int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+		// rotate and store
+		uint64_t* out = (uint64_t*)&data[i * 4]; // one 64-bit word per quaternion, written over the input
+
+		out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4); // shift is 16 bits per unit of the stored fourth short, taken from the cf lanes
+		out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
+		out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
+		out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{ // NEON in-place decode, 4 words per iteration: each 32-bit word becomes the float m * 2^e
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 words - presumably count is a multiple of 4; confirm at caller
+	{
+		int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
+
+		// decode exponent into 2^x directly
+		int32x4_t ef = vshrq_n_s32(v, 24); // top 8 bits, sign-extended
+		int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23); // biased exponent moved into the float exponent field
+
+		// decode 24-bit mantissa into floating-point value
+		int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
+		float32x4_t m = vcvtq_f32_s32(mf);
+
+		float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);
+
+		vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
+	}
+}
+#endif
+
+#ifdef SIMD_WASM
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{ // Wasm SIMD in-place octahedral decode of 8-bit elements, 4 elements (16 bytes) per iteration; byte 3 of each element is preserved
+	const v128_t sign = wasm_f32x4_splat(-0.f); // only the sign bit set in every lane
+
+	for (size_t i = 0; i < count; i += 4) // NOTE(review): every load/store spans 4 elements - presumably count is a multiple of 4; confirm at caller
+	{
+		v128_t n4 = wasm_v128_load(&data[i * 4]);
+
+		// sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24);
+		v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24);
+
+		// unpack z; note that z is unsigned so we technically don't need to sign extend it
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+		// fixup octahedral coordinates for z<0
+		// note: i32x4_min with 0 is equivalent to f32x4_min
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign))); // xor flips the sign of t where x is negative
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+		// compute normal length & scale
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// combine xr/yr/zr into final value
+		v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000)); // keep the original top byte of each element
+		res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff)));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8));
+		res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16));
+
+		wasm_v128_store(&data[i * 4], res);
+	}
+}
+
+static void decodeFilterOctSimd(short* data, size_t count) // WASM path: decodes count 8-byte (4 x 16-bit) octahedral vectors in place, four per iteration
+{
+	const v128_t sign = wasm_f32x4_splat(-0.f); // mask with only the float sign bit set in every lane
+	const v128_t zmask = wasm_i32x4_splat(0x7fff); // keeps the low 15 bits of each 32-bit lane
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]); // vectors i and i+1
+		v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]); // vectors i+2 and i+3
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16);
+		v128_t yf = wasm_i32x4_shr(n4, 16);
+
+		// unpack z; note that z is unsigned so we don't need to sign extend it
+		v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
+		v128_t zf = wasm_v128_and(z4, zmask);
+
+		// convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+		v128_t x = wasm_f32x4_convert_i32x4(xf);
+		v128_t y = wasm_f32x4_convert_i32x4(yf);
+		v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y))); // z = zf - |x| - |y|
+
+		// fixup octahedral coordinates for z<0
+		// note: i32x4_min with 0 is equivalent to f32x4_min (a negative float also has a negative integer bit pattern)
+		v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+		x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign))); // adds t, with its sign flipped when x is negative
+		y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign))); // same adjustment for y
+
+		// compute normal length & scale so that the normalized vector maps to the [-32767..32767] range
+		v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+		v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+		// mix x/z and y/0 to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff));
+
+		// pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+		v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
+		v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
+
+		// patch in .w: the top 16 bits of every 64-bit vector are carried over from the input unchanged
+		res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
+		res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));
+
+		wasm_v128_store(&data[(i + 0) * 4], res_0); // write the decoded vectors back over the input
+		wasm_v128_store(&data[(i + 2) * 4], res_1);
+	}
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count) // WASM path: decodes count 8-byte (4 x 16-bit) packed quaternions in place, four per iteration
+{
+	const float scale = 1.f / sqrtf(2.f); // numerator of the per-quaternion scale factor computed below
+
+	for (size_t i = 0; i < count; i += 4)
+	{
+		v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]); // quaternions i and i+1
+		v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]); // quaternions i+2 and i+3
+
+		// gather both x/y 16-bit pairs in each 32-bit lane
+		v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
+		v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);
+
+		// sign-extends each of x,y in [x y] with arithmetic shifts (and likewise z and c from [z c])
+		v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
+		v128_t yf = wasm_i32x4_shr(q4_xy, 16);
+		v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
+		v128_t cf = wasm_i32x4_shr(q4_zc, 16);
+
+		// get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+		v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
+		v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
+
+		// convert x/y/z to [-1..1] (scaled...)
+		v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
+		v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
+		v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
+
+		// reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+		// note: i32x4_max with 0 is equivalent to f32x4_max
+		v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); // ww = 1 - (x*x + y*y + z*z)
+		v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
+
+		v128_t s = wasm_f32x4_splat(32767.f); // output components are scaled to the signed 16-bit range
+
+		// fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+		// note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+		const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+		v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+		v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+		v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+		v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
+
+		// mix x/z and w/y to make 16-bit unpack easier
+		v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+		v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16));
+
+		// pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+		v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
+		v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
+
+		// compute component index shifted left by 4 (and moved into i32x4 slot)
+		// TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
+		volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+
+		// rotate and store: each 64-bit quaternion is rotated left by its own cm lane before being written back
+		uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+		out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
+		out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
+		out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
+		out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
+	}
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count) // WASM path: decodes count packed 32-bit words in place into floats
+{
+	for (size_t i = 0; i < count; i += 4) // four 32-bit lanes per iteration, so count is expected to be a multiple of 4
+	{
+		v128_t v = wasm_v128_load(&data[i]);
+
+		// decode exponent into 2^x directly: top 8 bits + bias 127, shifted into the float exponent field (bit 23)
+		v128_t ef = wasm_i32x4_shr(v, 24);
+		v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23);
+
+		// decode 24-bit mantissa into floating-point value (shift left then shift right extends the low 24 bits)
+		v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8);
+		v128_t m = wasm_f32x4_convert_i32x4(mf);
+
+		v128_t r = wasm_f32x4_mul(es, m); // es is used directly as float bits: result = 2^exponent * mantissa
+
+		wasm_v128_store(&data[i], r); // overwrite the packed words with the decoded floats
+	}
+}
+#endif
+
+} // namespace meshopt
+
+void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size) // decodes octahedral-filtered vectors in place; picks the SIMD or scalar kernel at compile time
+{
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0); // the SIMD kernels consume four vertices per iteration
+	assert(vertex_size == 4 || vertex_size == 8); // 4 -> signed char components, 8 -> short components
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	if (vertex_size == 4)
+		decodeFilterOctSimd(static_cast<signed char*>(buffer), vertex_count);
+	else
+		decodeFilterOctSimd(static_cast<short*>(buffer), vertex_count);
+#else
+	if (vertex_size == 4)
+		decodeFilterOct(static_cast<signed char*>(buffer), vertex_count);
+	else
+		decodeFilterOct(static_cast<short*>(buffer), vertex_count);
+#endif
+}
+
+void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size) // decodes quaternion-filtered data in place; picks the SIMD or scalar kernel at compile time
+{
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0); // the SIMD kernels consume four quaternions per iteration
+	assert(vertex_size == 8); // only 4 x 16-bit quaternions are supported
+	(void)vertex_size; // vertex_size is otherwise unused once the assert is compiled out
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decodeFilterQuatSimd(static_cast<short*>(buffer), vertex_count);
+#else
+	decodeFilterQuat(static_cast<short*>(buffer), vertex_count);
+#endif
+}
+
+void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size) // decodes exponent-filtered 32-bit words in place into floats; picks the SIMD or scalar kernel at compile time
+{
+	using namespace meshopt;
+
+	assert(vertex_count % 4 == 0); // keeps the total word count a multiple of the 4-lane SIMD step
+	assert(vertex_size % 4 == 0); // each vertex is a whole number of 32-bit words
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+	decodeFilterExpSimd(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4)); // count = total number of 32-bit words
+#else
+	decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4)); // count = total number of 32-bit words
+#endif
+}
+
+#undef SIMD_SSE
+#undef SIMD_NEON
+#undef SIMD_WASM
diff --git a/thirdparty/meshoptimizer/vfetchanalyzer.cpp b/thirdparty/meshoptimizer/vfetchanalyzer.cpp
new file mode 100644
index 0000000000..51dca873f8
--- /dev/null
+++ b/thirdparty/meshoptimizer/vfetchanalyzer.cpp
@@ -0,0 +1,58 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size) // replays the index buffer through a modeled cache and returns bytes_fetched plus the overfetch ratio
+{
+	assert(index_count % 3 == 0); // indices are expected in groups of three
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	meshopt_VertexFetchStatistics result = {};
+
+	unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count); // one flag per vertex, set to 1 once any index references it
+	memset(vertex_visited, 0, vertex_count);
+
+	const size_t kCacheLine = 64; // modeled cache line size, in bytes
+	const size_t kCacheSize = 128 * 1024; // modeled cache capacity, in bytes
+
+	// simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+	size_t cache[kCacheSize / kCacheLine] = {}; // 2048 slots on the stack; each holds (tag + 1) of the resident line, 0 means empty
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		vertex_visited[index] = 1;
+
+		size_t start_address = index * vertex_size; // byte range [start_address, end_address) covered by this vertex
+		size_t end_address = start_address + vertex_size;
+
+		size_t start_tag = start_address / kCacheLine; // first cache line touched
+		size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine; // one past the last cache line touched (rounded up)
+
+		assert(start_tag < end_tag); // holds because vertex_size > 0
+
+		for (size_t tag = start_tag; tag < end_tag; ++tag)
+		{
+			size_t line = tag % (sizeof(cache) / sizeof(cache[0])); // direct mapping: slot = tag modulo slot count
+
+			// we store +1 since cache is filled with 0 by default
+			result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine; // a miss costs one full cache line, a hit costs nothing
+			cache[line] = tag + 1;
+		}
+	}
+
+	size_t unique_vertex_count = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+		unique_vertex_count += vertex_visited[i]; // flags are 0/1, so the sum is the number of referenced vertices
+
+	result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size); // bytes fetched per byte of referenced vertex data; 0 when nothing is referenced
+
+	return result;
+}
diff --git a/thirdparty/meshoptimizer/vfetchoptimizer.cpp b/thirdparty/meshoptimizer/vfetchoptimizer.cpp
new file mode 100644
index 0000000000..465d6df5ca
--- /dev/null
+++ b/thirdparty/meshoptimizer/vfetchoptimizer.cpp
@@ -0,0 +1,74 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count) // fills destination[old index] = new index in order of first use; returns the number of referenced vertices
+{
+	assert(index_count % 3 == 0); // indices are expected in groups of three
+
+	memset(destination, -1, vertex_count * sizeof(unsigned int)); // every entry starts as ~0u, the "not assigned yet" marker
+
+	unsigned int next_vertex = 0; // next new index to hand out
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		if (destination[index] == ~0u) // first time this vertex is referenced
+		{
+			destination[index] = next_vertex++;
+		}
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex; // vertices never referenced by indices keep ~0u in destination
+}
+
+size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size) // copies vertices into destination in order of first use, rewrites indices to match, returns the number of vertices written
+{
+	assert(index_count % 3 == 0); // indices are expected in groups of three
+	assert(vertex_size > 0 && vertex_size <= 256);
+
+	meshopt_Allocator allocator;
+
+	// support in-place optimization: when source and destination alias, read from a temporary copy instead
+	if (destination == vertices)
+	{
+		unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+		memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+		vertices = vertices_copy;
+	}
+
+	// build vertex remap table
+	unsigned int* vertex_remap = allocator.allocate<unsigned int>(vertex_count);
+	memset(vertex_remap, -1, vertex_count * sizeof(unsigned int)); // every entry starts as ~0u, the "not copied yet" marker
+
+	unsigned int next_vertex = 0; // next free slot in the destination vertex buffer
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		unsigned int index = indices[i];
+		assert(index < vertex_count);
+
+		unsigned int& remap = vertex_remap[index];
+
+		if (remap == ~0u) // vertex was not added to destination VB
+		{
+			// add vertex
+			memcpy(static_cast<unsigned char*>(destination) + next_vertex * vertex_size, static_cast<const unsigned char*>(vertices) + index * vertex_size, vertex_size);
+
+			remap = next_vertex++;
+		}
+
+		// modify indices in place
+		indices[i] = remap;
+	}
+
+	assert(next_vertex <= vertex_count);
+
+	return next_vertex; // vertices never referenced by indices are not copied
+}
diff --git a/thirdparty/minimp3/LICENSE b/thirdparty/minimp3/LICENSE
new file mode 100644
index 0000000000..2c4afabdb6
--- /dev/null
+++ b/thirdparty/minimp3/LICENSE
@@ -0,0 +1,117 @@
+CC0 1.0 Universal
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator and
+subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for the
+purpose of contributing to a commons of creative, cultural and scientific
+works ("Commons") that the public can reliably and without fear of later
+claims of infringement build upon, modify, incorporate in other works, reuse
+and redistribute as freely as possible in any form whatsoever and for any
+purposes, including without limitation commercial purposes. These owners may
+contribute to the Commons to promote the ideal of a free culture and the
+further production of creative, cultural and scientific works, or to gain
+reputation or greater distribution for their Work in part through the use and
+efforts of others.
+
+For these and/or other purposes and motivations, and without any expectation
+of additional consideration or compensation, the person associating CC0 with a
+Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
+and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
+and publicly distribute the Work under its terms, with knowledge of his or her
+Copyright and Related Rights in the Work and the meaning and intended legal
+effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not limited
+to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display, communicate,
+  and translate a Work;
+
+  ii. moral rights retained by the original author(s) and/or performer(s);
+
+  iii. publicity and privacy rights pertaining to a person's image or likeness
+  depicted in a Work;
+
+  iv. rights protecting against unfair competition in regards to a Work,
+  subject to the limitations in paragraph 4(a), below;
+
+  v. rights protecting the extraction, dissemination, use and reuse of data in
+  a Work;
+
+  vi. database rights (such as those arising under Directive 96/9/EC of the
+  European Parliament and of the Council of 11 March 1996 on the legal
+  protection of databases, and under any national implementation thereof,
+  including any amended or successor version of such directive); and
+
+  vii. other similar, equivalent or corresponding rights throughout the world
+  based on applicable law or treaty, and any national implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention of,
+applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
+unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
+and Related Rights and associated claims and causes of action, whether now
+known or unknown (including existing as well as future claims and causes of
+action), in the Work (i) in all territories worldwide, (ii) for the maximum
+duration provided by applicable law or treaty (including future time
+extensions), (iii) in any current or future medium and for any number of
+copies, and (iv) for any purpose whatsoever, including without limitation
+commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
+the Waiver for the benefit of each member of the public at large and to the
+detriment of Affirmer's heirs and successors, fully intending that such Waiver
+shall not be subject to revocation, rescission, cancellation, termination, or
+any other legal or equitable action to disrupt the quiet enjoyment of the Work
+by the public as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason be
+judged legally invalid or ineffective under applicable law, then the Waiver
+shall be preserved to the maximum extent permitted taking into account
+Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
+is so judged Affirmer hereby grants to each affected person a royalty-free,
+non transferable, non sublicensable, non exclusive, irrevocable and
+unconditional license to exercise Affirmer's Copyright and Related Rights in
+the Work (i) in all territories worldwide, (ii) for the maximum duration
+provided by applicable law or treaty (including future time extensions), (iii)
+in any current or future medium and for any number of copies, and (iv) for any
+purpose whatsoever, including without limitation commercial, advertising or
+promotional purposes (the "License"). The License shall be deemed effective as
+of the date CC0 was applied by Affirmer to the Work. Should any part of the
+License for any reason be judged legally invalid or ineffective under
+applicable law, such partial invalidity or ineffectiveness shall not
+invalidate the remainder of the License, and in such case Affirmer hereby
+affirms that he or she will not (i) exercise any of his or her remaining
+Copyright and Related Rights in the Work or (ii) assert any associated claims
+and causes of action with respect to the Work, in either case contrary to
+Affirmer's express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+  a. No trademark or patent rights held by Affirmer are waived, abandoned,
+  surrendered, licensed or otherwise affected by this document.
+
+  b. Affirmer offers the Work as-is and makes no representations or warranties
+  of any kind concerning the Work, express, implied, statutory or otherwise,
+  including without limitation warranties of title, merchantability, fitness
+  for a particular purpose, non infringement, or the absence of latent or
+  other defects, accuracy, or the present or absence of errors, whether or not
+  discoverable, all to the greatest extent permissible under applicable law.
+
+  c. Affirmer disclaims responsibility for clearing rights of other persons
+  that may apply to the Work or any use thereof, including without limitation
+  any person's Copyright and Related Rights in the Work. Further, Affirmer
+  disclaims responsibility for obtaining any necessary consents, permissions
+  or other rights required for any use of the Work.
+
+  d. Affirmer understands and acknowledges that Creative Commons is not a
+  party to this document and has no duty or obligation with respect to this
+  CC0 or use of the Work.
+
+For more information, please see
+<http://creativecommons.org/publicdomain/zero/1.0/>
+
diff --git a/thirdparty/minimp3/minimp3.h b/thirdparty/minimp3/minimp3.h
new file mode 100644
index 0000000000..796cbc1f8e
--- /dev/null
+++ b/thirdparty/minimp3/minimp3.h
@@ -0,0 +1,1855 @@
+#ifndef MINIMP3_H
+#define MINIMP3_H
+/*
+    https://github.com/lieff/minimp3
+    To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide.
+    This software is distributed without any warranty.
+    See <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#include <stdint.h>
+
+#define MINIMP3_MAX_SAMPLES_PER_FRAME (1152*2)
+
+typedef struct /* per-frame description; passed as the info argument of mp3dec_decode_frame */
+{
+    int frame_bytes, frame_offset, channels, hz, layer, bitrate_kbps; /* NOTE(review): exact field semantics are set by the decoder code outside this view */
+} mp3dec_frame_info_t;
+
+typedef struct /* decoder state handed to mp3dec_init and mp3dec_decode_frame */
+{
+    float mdct_overlap[2][9*32], qmf_state[15*2*32];
+    int reserv, free_format_bytes;
+    unsigned char header[4], reserv_buf[511]; /* 4 = HDR_SIZE and 511 = MAX_BITRESERVOIR_BYTES from the implementation section */
+} mp3dec_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+void mp3dec_init(mp3dec_t *dec); /* prepares a decoder state object (definition is outside this view) */
+#ifndef MINIMP3_FLOAT_OUTPUT
+typedef int16_t mp3d_sample_t; /* default build: PCM samples are 16-bit integers */
+#else /* MINIMP3_FLOAT_OUTPUT */
+typedef float mp3d_sample_t; /* float build: PCM samples are floats */
+void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples); /* only declared in the float-output build */
+#endif /* MINIMP3_FLOAT_OUTPUT */
+int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info); /* NOTE(review): pcm presumably needs room for MINIMP3_MAX_SAMPLES_PER_FRAME samples - confirm against the definition */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* MINIMP3_H */
+#if defined(MINIMP3_IMPLEMENTATION) && !defined(_MINIMP3_IMPLEMENTATION_GUARD)
+#define _MINIMP3_IMPLEMENTATION_GUARD
+
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX_FREE_FORMAT_FRAME_SIZE  2304    /* more than ISO spec's */
+#ifndef MAX_FRAME_SYNC_MATCHES
+#define MAX_FRAME_SYNC_MATCHES      10
+#endif /* MAX_FRAME_SYNC_MATCHES */
+
+#define MAX_L3_FRAME_PAYLOAD_BYTES  MAX_FREE_FORMAT_FRAME_SIZE /* MUST be >= 320000/8/32000*1152 = 1440 */
+
+#define MAX_BITRESERVOIR_BYTES      511
+#define SHORT_BLOCK_TYPE            2
+#define STOP_BLOCK_TYPE             3
+#define MODE_MONO                   3
+#define MODE_JOINT_STEREO           1
+#define HDR_SIZE                    4 /* the HDR_* macros below index bytes h[1]..h[3] of this 4-byte header */
+#define HDR_IS_MONO(h)              (((h[3]) & 0xC0) == 0xC0) /* stereo mode field == MODE_MONO */
+#define HDR_IS_MS_STEREO(h)         (((h[3]) & 0xE0) == 0x60) /* stereo mode field == MODE_JOINT_STEREO with the MS bit (0x20) set */
+#define HDR_IS_FREE_FORMAT(h)       (((h[2]) & 0xF0) == 0) /* bitrate index == 0 */
+#define HDR_IS_CRC(h)               (!((h[1]) & 1)) /* true when the lowest bit of h[1] is clear */
+#define HDR_TEST_PADDING(h)         ((h[2]) & 0x2)
+#define HDR_TEST_MPEG1(h)           ((h[1]) & 0x8)
+#define HDR_TEST_NOT_MPEG25(h)      ((h[1]) & 0x10)
+#define HDR_TEST_I_STEREO(h)        ((h[3]) & 0x10)
+#define HDR_TEST_MS_STEREO(h)       ((h[3]) & 0x20)
+#define HDR_GET_STEREO_MODE(h)      (((h[3]) >> 6) & 3)
+#define HDR_GET_STEREO_MODE_EXT(h)  (((h[3]) >> 4) & 3)
+#define HDR_GET_LAYER(h)            (((h[1]) >> 1) & 3) /* raw 2-bit field; value 3 is what HDR_IS_LAYER_1 tests for */
+#define HDR_GET_BITRATE(h)          ((h[2]) >> 4) /* 4-bit bitrate index, not kbps */
+#define HDR_GET_SAMPLE_RATE(h)      (((h[2]) >> 2) & 3) /* 2-bit sample rate index, not Hz */
+#define HDR_GET_MY_SAMPLE_RATE(h)   (HDR_GET_SAMPLE_RATE(h) + (((h[1] >> 3) & 1) + ((h[1] >> 4) & 1))*3) /* sample rate index plus 3 for each of the two version bits that is set */
+#define HDR_IS_FRAME_576(h)         ((h[1] & 14) == 2) /* MPEG1 bit clear and layer field == 1 */
+#define HDR_IS_LAYER_1(h)           ((h[1] & 6) == 6) /* layer field == 3 */
+
+#define BITS_DEQUANTIZER_OUT        -1
+#define MAX_SCF                     (255 + BITS_DEQUANTIZER_OUT*4 - 210) /* evaluates to 41 */
+#define MAX_SCFI                    ((MAX_SCF + 3) & ~3) /* MAX_SCF rounded up to a multiple of 4: 44 */
+
+#define MINIMP3_MIN(a, b)           ((a) > (b) ? (b) : (a)) /* note: arguments may be evaluated twice */
+#define MINIMP3_MAX(a, b)           ((a) < (b) ? (b) : (a)) /* note: arguments may be evaluated twice */
+
+#if !defined(MINIMP3_NO_SIMD)
+
+#if !defined(MINIMP3_ONLY_SIMD) && (defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64))
+/* x64 always have SSE2, arm64 always have neon, no need for generic code */
+#define MINIMP3_ONLY_SIMD
+#endif /* SIMD checks... */
+
+#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || ((defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__))
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif /* defined(_MSC_VER) */
+#include <immintrin.h>
+#define HAVE_SSE 1
+#define HAVE_SIMD 1
+#define VSTORE _mm_storeu_ps
+#define VLD _mm_loadu_ps
+#define VSET _mm_set1_ps
+#define VADD _mm_add_ps
+#define VSUB _mm_sub_ps
+#define VMUL _mm_mul_ps
+#define VMAC(a, x, y) _mm_add_ps(a, _mm_mul_ps(x, y))
+#define VMSB(a, x, y) _mm_sub_ps(a, _mm_mul_ps(x, y))
+#define VMUL_S(x, s)  _mm_mul_ps(x, _mm_set1_ps(s))
+#define VREV(x) _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3))
+typedef __m128 f4;
+#if defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD)
+#define minimp3_cpuid __cpuid
+#else /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */
+static __inline__ __attribute__((always_inline)) void minimp3_cpuid(int CPUInfo[], const int InfoType) /* runs CPUID for leaf InfoType; stores EAX, EBX, ECX, EDX into CPUInfo[0..3] */
+{
+#if defined(__PIC__)
+    __asm__ __volatile__( /* PIC variant: EBX/RBX is restored by hand and the EBX result is returned through a general register */
+#if defined(__x86_64__)
+        "push %%rbx\n" /* save RBX */
+        "cpuid\n"
+        "xchgl %%ebx, %1\n" /* move the EBX result into output operand 1 */
+        "pop  %%rbx\n" /* restore RBX */
+#else /* defined(__x86_64__) */
+        "xchgl %%ebx, %1\n" /* stash the caller's EBX in operand 1 */
+        "cpuid\n"
+        "xchgl %%ebx, %1\n" /* swap back: operand 1 gets the result, EBX gets its old value */
+#endif /* defined(__x86_64__) */
+        : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
+        : "a" (InfoType));
+#else /* defined(__PIC__) */
+    __asm__ __volatile__( /* non-PIC variant: EBX can be named directly as an output */
+        "cpuid"
+        : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
+        : "a" (InfoType));
+#endif /* defined(__PIC__)*/
+}
+#endif /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */
+static int have_simd(void) /* x86: returns nonzero when SSE2 may be used; the CPUID probe result is cached */
+{
+#ifdef MINIMP3_ONLY_SIMD
+    return 1; /* SIMD-only build: no runtime probe */
+#else /* MINIMP3_ONLY_SIMD */
+    static int g_have_simd; /* cached result stored as (value + 1); 0 means "not probed yet" */
+    int CPUInfo[4];
+#ifdef MINIMP3_TEST
+    static int g_counter;
+    if (g_counter++ > 100) /* test builds: report "no SIMD" after the first 101 calls */
+        return 0;
+#endif /* MINIMP3_TEST */
+    if (g_have_simd)
+        goto end;
+    minimp3_cpuid(CPUInfo, 0); /* leaf 0: CPUInfo[0] is compared against 0 to decide whether leaf 1 is queried */
+    g_have_simd = 1; /* default cached value: probed, no SIMD */
+    if (CPUInfo[0] > 0)
+    {
+        minimp3_cpuid(CPUInfo, 1);
+        g_have_simd = (CPUInfo[3] & (1 << 26)) + 1; /* SSE2 */
+    }
+end:
+    return g_have_simd - 1; /* 0 without SSE2, otherwise the raw bit (1 << 26) rather than 1 */
+#endif /* MINIMP3_ONLY_SIMD */
+}
+#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64)
+#include <arm_neon.h>
+#define HAVE_SSE 0
+#define HAVE_SIMD 1
+#define VSTORE vst1q_f32
+#define VLD vld1q_f32
+#define VSET vmovq_n_f32
+#define VADD vaddq_f32
+#define VSUB vsubq_f32
+#define VMUL vmulq_f32
+#define VMAC(a, x, y) vmlaq_f32(a, x, y)
+#define VMSB(a, x, y) vmlsq_f32(a, x, y)
+#define VMUL_S(x, s)  vmulq_f32(x, vmovq_n_f32(s))
+#define VREV(x) vcombine_f32(vget_high_f32(vrev64q_f32(x)), vget_low_f32(vrev64q_f32(x)))
+typedef float32x4_t f4;
+static int have_simd() /* NEON build: SIMD is reported as always available */
+{   /* TODO: detect neon for !MINIMP3_ONLY_SIMD */
+    return 1;
+}
+#else /* SIMD checks... */
+#define HAVE_SSE 0
+#define HAVE_SIMD 0
+#ifdef MINIMP3_ONLY_SIMD
+#error MINIMP3_ONLY_SIMD used, but SSE/NEON not enabled
+#endif /* MINIMP3_ONLY_SIMD */
+#endif /* SIMD checks... */
+#else /* !defined(MINIMP3_NO_SIMD) */
+#define HAVE_SIMD 0
+#endif /* !defined(MINIMP3_NO_SIMD) */
+
+#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64)
+#define HAVE_ARMV6 1
+static __inline__ __attribute__((always_inline)) int32_t minimp3_clip_int16_arm(int32_t a) /* returns a saturated to the signed 16-bit range via the ARM SSAT instruction */
+{
+    int32_t x = 0;
+    __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a)); /* x = signed saturate of a to 16 bits */
+    return x;
+}
+#else
+#define HAVE_ARMV6 0
+#endif
+
+typedef struct /* bit reader over a byte buffer; set up by bs_init and consumed by get_bits */
+{
+    const uint8_t *buf; /* start of the data */
+    int pos, limit; /* both in bits: current read position and total bit count (bytes*8) */
+} bs_t;
+
+typedef struct /* Layer 1/2 scale information */
+{
+    float scf[3*64]; /* NOTE(review): presumably 3 scalefactors per band, as L12_read_scalefactors writes three values per band - confirm */
+    uint8_t total_bands, stereo_bands, bitalloc[64], scfcod[64]; /* total_bands and stereo_bands are set by L12_subband_alloc_table */
+} L12_scale_info;
+
+typedef struct /* one entry of the subband allocation tables returned by L12_subband_alloc_table */
+{
+    uint8_t tab_offset, code_tab_width, band_count; /* NOTE(review): consumed by L12_read_scale_info, whose body is outside this view */
+} L12_subband_alloc_t;
+
+typedef struct /* NOTE(review): presumably Layer 3 per-granule side information (name-based; the code that fills it is outside this view) */
+{
+    const uint8_t *sfbtab;
+    uint16_t part_23_length, big_values, scalefac_compress;
+    uint8_t global_gain, block_type, mixed_block_flag, n_long_sfb, n_short_sfb;
+    uint8_t table_select[3], region_count[3], subblock_gain[3];
+    uint8_t preflag, scalefac_scale, count1_table, scfsi;
+} L3_gr_info_t;
+
+typedef struct /* working buffers for the decoder; NOTE(review): presumably per-frame scratch, usage is outside this view */
+{
+    bs_t bs;
+    uint8_t maindata[MAX_BITRESERVOIR_BYTES + MAX_L3_FRAME_PAYLOAD_BYTES]; /* 511 + 2304 bytes */
+    L3_gr_info_t gr_info[4];
+    float grbuf[2][576], scf[40], syn[18 + 15][2*32];
+    uint8_t ist_pos[2][39];
+} mp3dec_scratch_t;
+
+static void bs_init(bs_t *bs, const uint8_t *data, int bytes) /* points the bit reader at data and rewinds it to bit 0 */
+{
+    bs->buf   = data;
+    bs->pos   = 0;
+    bs->limit = bytes*8; /* limit is kept in bits */
+}
+
+static uint32_t get_bits(bs_t *bs, int n) /* returns the next n bits, most significant bit first, and advances the read position by n */
+{
+    uint32_t next, cache = 0, s = bs->pos & 7; /* s = bit offset inside the first byte */
+    int shl = n + s;
+    const uint8_t *p = bs->buf + (bs->pos >> 3); /* byte that holds the first requested bit */
+    if ((bs->pos += n) > bs->limit) /* pos is advanced even when the read runs past the end; such reads yield 0 */
+        return 0;
+    next = *p++ & (255 >> s); /* mask off the bits of the first byte that were already consumed */
+    while ((shl -= 8) > 0) /* whole bytes that precede the final byte */
+    {
+        cache |= next << shl;
+        next = *p++;
+    }
+    return cache | (next >> -shl); /* shl <= 0 here: drop the unused low bits of the final byte */
+}
+
+static int hdr_valid(const uint8_t *h) /* nonzero when the header bytes at h pass the sync and field checks below */
+{
+    return h[0] == 0xff &&
+        ((h[1] & 0xF0) == 0xf0 || (h[1] & 0xFE) == 0xe2) && /* two accepted sync/version patterns in h[1] */
+        (HDR_GET_LAYER(h) != 0) && /* layer field 0 is rejected */
+        (HDR_GET_BITRATE(h) != 15) && /* bitrate index 15 is rejected */
+        (HDR_GET_SAMPLE_RATE(h) != 3); /* sample rate index 3 is rejected */
+}
+
+static int hdr_compare(const uint8_t *h1, const uint8_t *h2) /* nonzero when h2 is a valid header that matches h1 in the fields compared below */
+{
+    return hdr_valid(h2) &&
+        ((h1[1] ^ h2[1]) & 0xFE) == 0 && /* byte 1 equal except for its lowest bit */
+        ((h1[2] ^ h2[2]) & 0x0C) == 0 && /* same sample rate index */
+        !(HDR_IS_FREE_FORMAT(h1) ^ HDR_IS_FREE_FORMAT(h2)); /* both free-format or both not */
+}
+
+static unsigned hdr_bitrate_kbps(const uint8_t *h) /* bitrate in kbps looked up from the header fields; 0 for bitrate index 0 */
+{
+    static const uint8_t halfrate[2][3][15] = { /* indexed as [is MPEG1][layer field - 1][bitrate index]; values are kbps/2 */
+        { { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,16,24,28,32,40,48,56,64,72,80,88,96,112,128 } },
+        { { 0,16,20,24,28,32,40,48,56,64,80,96,112,128,160 }, { 0,16,24,28,32,40,48,56,64,80,96,112,128,160,192 }, { 0,16,32,48,64,80,96,112,128,144,160,176,192,208,224 } },
+    };
+    return 2*halfrate[!!HDR_TEST_MPEG1(h)][HDR_GET_LAYER(h) - 1][HDR_GET_BITRATE(h)]; /* assumes a header accepted by hdr_valid (layer != 0, bitrate index != 15) */
+}
+
+static unsigned hdr_sample_rate_hz(const uint8_t *h) /* sample rate in Hz derived from the header fields */
+{
+    static const unsigned g_hz[3] = { 44100, 48000, 32000 }; /* base rates; index 3 is out of range and is rejected by hdr_valid */
+    return g_hz[HDR_GET_SAMPLE_RATE(h)] >> (int)!HDR_TEST_MPEG1(h) >> (int)!HDR_TEST_NOT_MPEG25(h); /* halved once when the MPEG1 bit is clear, and once more when the NOT_MPEG25 bit is clear */
+}
+
+static unsigned hdr_frame_samples(const uint8_t *h) /* samples per frame: 384 for layer 1, else 576 when HDR_IS_FRAME_576 holds, else 1152 */
+{
+    return HDR_IS_LAYER_1(h) ? 384 : (1152 >> (int)HDR_IS_FRAME_576(h));
+}
+
+static int hdr_frame_bytes(const uint8_t *h, int free_format_size) /* frame size in bytes computed from the header; falls back to free_format_size when that is 0 */
+{
+    int frame_bytes = hdr_frame_samples(h)*hdr_bitrate_kbps(h)*125/hdr_sample_rate_hz(h); /* 125 = 1000/8 converts kbit/s to bytes/s */
+    if (HDR_IS_LAYER_1(h))
+    {
+        frame_bytes &= ~3; /* slot align */
+    }
+    return frame_bytes ? frame_bytes : free_format_size; /* bitrate index 0 gives 0 bytes, so the caller-supplied size is used */
+}
+
+static int hdr_padding(const uint8_t *h) /* extra bytes added when the padding bit is set: 4 for layer 1, 1 otherwise; 0 when clear */
+{
+    return HDR_TEST_PADDING(h) ? (HDR_IS_LAYER_1(h) ? 4 : 1) : 0;
+}
+
+#ifndef MINIMP3_ONLY_MP3
+static const L12_subband_alloc_t *L12_subband_alloc_table(const uint8_t *hdr, L12_scale_info *sci) /* picks the allocation table for this header and stores the band counts into sci */
+{
+    const L12_subband_alloc_t *alloc;
+    int mode = HDR_GET_STEREO_MODE(hdr);
+    int nbands, stereo_bands = (mode == MODE_MONO) ? 0 : (mode == MODE_JOINT_STEREO) ? (HDR_GET_STEREO_MODE_EXT(hdr) << 2) + 4 : 32; /* mono: 0; joint stereo: 4, 8, 12 or 16 from the mode extension; otherwise 32 */
+
+    if (HDR_IS_LAYER_1(hdr))
+    {
+        static const L12_subband_alloc_t g_alloc_L1[] = { { 76, 4, 32 } };
+        alloc = g_alloc_L1;
+        nbands = 32;
+    } else if (!HDR_TEST_MPEG1(hdr)) /* not layer 1 and MPEG1 bit clear */
+    {
+        static const L12_subband_alloc_t g_alloc_L2M2[] = { { 60, 4, 4 }, { 44, 3, 7 }, { 44, 2, 19 } };
+        alloc = g_alloc_L2M2;
+        nbands = 30;
+    } else /* not layer 1 and MPEG1 bit set: table depends on bitrate and sample rate */
+    {
+        static const L12_subband_alloc_t g_alloc_L2M1[] = { { 0, 4, 3 }, { 16, 4, 8 }, { 32, 3, 12 }, { 40, 2, 7 } };
+        int sample_rate_idx = HDR_GET_SAMPLE_RATE(hdr);
+        unsigned kbps = hdr_bitrate_kbps(hdr) >> (int)(mode != MODE_MONO); /* halved for every mode other than mono */
+        if (!kbps) /* free-format */
+        {
+            kbps = 192;
+        }
+
+        alloc = g_alloc_L2M1;
+        nbands = 27;
+        if (kbps < 56)
+        {
+            static const L12_subband_alloc_t g_alloc_L2M1_lowrate[] = { { 44, 4, 2 }, { 44, 3, 10 } };
+            alloc = g_alloc_L2M1_lowrate;
+            nbands = sample_rate_idx == 2 ? 12 : 8; /* index 2 is 32000 Hz in hdr_sample_rate_hz's table */
+        } else if (kbps >= 96 && sample_rate_idx != 1) /* index 1 is 48000 Hz in hdr_sample_rate_hz's table */
+        {
+            nbands = 30;
+        }
+    }
+
+    sci->total_bands = (uint8_t)nbands;
+    sci->stereo_bands = (uint8_t)MINIMP3_MIN(stereo_bands, nbands); /* never more stereo bands than total bands */
+
+    return alloc;
+}
+
+static void L12_read_scalefactors(bs_t *bs, uint8_t *pba, uint8_t *scfcod, int bands, float *scf)
+{ /* For each of the `bands` entries, writes three floats to scf, reading a fresh 6-bit value from bs only for the slots selected by scfcod[i] (none when the entry's pba value is 0). */
+    static const float g_deq_L12[18*3] = { /* 18 groups of three values produced by DQ */
+#define DQ(x) 9.53674316e-07f/x, 7.56931807e-07f/x, 6.00777173e-07f/x /* approx. 2^-20, 2^-20 * 2^(-1/3), 2^-20 * 2^(-2/3), each divided by x */
+        DQ(3),DQ(7),DQ(15),DQ(31),DQ(63),DQ(127),DQ(255),DQ(511),DQ(1023),DQ(2047),DQ(4095),DQ(8191),DQ(16383),DQ(32767),DQ(65535),DQ(3),DQ(5),DQ(9)
+    };
+    int i, m;
+    for (i = 0; i < bands; i++)
+    {
+        float s = 0; /* value repeated into slots that do not read a new one; stays 0 when ba is 0 */
+        int ba = *pba++;
+        int mask = ba ? 4 + ((19 >> scfcod[i]) & 3) : 0; /* 3-bit read mask: scfcod 0 -> 7, 1 -> 5, 2 -> 4, 3 -> 6, 6 -> 4; bit 4 is always set for a non-zero ba */
+        for (m = 4; m; m >>= 1) /* three slots, tested with m = 4, 2, 1 */
+        {
+            if (mask & m)
+            {
+                int b = get_bits(bs, 6);
+                s = g_deq_L12[ba*3 - 6 + b % 3]*(1 << 21 >> b/3); /* group (ba - 2), sub-entry b % 3, scaled by 2^(21 - b/3); NOTE(review): the index is negative for ba == 1, presumably never passed in -- confirm */
+            }
+            *scf++ = s;
+        }
+    }
+}
+
+static void L12_read_scale_info(const uint8_t *hdr, bs_t *bs, L12_scale_info *sci)
+{ /* Fills *sci from the bitstream: band counts, the bitalloc[] pairs, the scfcod[] selectors and, via L12_read_scalefactors, the scf[] values. */
+    static const uint8_t g_bitalloc_code_tab[] = { /* several lookup tables stored back to back; a descriptor's tab_offset selects where one starts */
+        0,17, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16,
+        0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,16,
+        0,17,18, 3,19,4,5,16,
+        0,17,18,16,
+        0,17,18,19, 4,5,6, 7,8, 9,10,11,12,13,14,15,
+        0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,14,
+        0, 2, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16
+    };
+    const L12_subband_alloc_t *subband_alloc = L12_subband_alloc_table(hdr, sci); /* also sets sci->total_bands and sci->stereo_bands */
+
+    int i, k = 0, ba_bits = 0;
+    const uint8_t *ba_code_tab = g_bitalloc_code_tab;
+
+    for (i = 0; i < sci->total_bands; i++)
+    {
+        uint8_t ba;
+        if (i == k) /* band i starts the next descriptor's run of band_count bands */
+        {
+            k += subband_alloc->band_count;
+            ba_bits = subband_alloc->code_tab_width;
+            ba_code_tab = g_bitalloc_code_tab + subband_alloc->tab_offset;
+            subband_alloc++;
+        }
+        ba = ba_code_tab[get_bits(bs, ba_bits)];
+        sci->bitalloc[2*i] = ba; /* even slot of the pair for band i */
+        if (i < sci->stereo_bands)
+        {
+            ba = ba_code_tab[get_bits(bs, ba_bits)]; /* a separate code is read for the odd slot only below stereo_bands */
+        }
+        sci->bitalloc[2*i + 1] = sci->stereo_bands ? ba : 0; /* odd slot: own code, or a copy of the even slot at/above stereo_bands, or 0 when stereo_bands is 0 */
+    }
+
+    for (i = 0; i < 2*sci->total_bands; i++)
+    {
+        sci->scfcod[i] = sci->bitalloc[i] ? HDR_IS_LAYER_1(hdr) ? 2 : get_bits(bs, 2) : 6; /* 6 for an empty slot; fixed 2 for Layer 1; otherwise a 2-bit value from the stream */
+    }
+
+    L12_read_scalefactors(bs, sci->bitalloc, sci->scfcod, sci->total_bands*2, sci->scf);
+
+    for (i = sci->stereo_bands; i < sci->total_bands; i++)
+    {
+        sci->bitalloc[2*i + 1] = 0; /* the copied odd slots were needed only while reading scalefactors; zeroed so L12_dequantize_granule reads no sample bits for them */
+    }
+}
+
+static int L12_dequantize_granule(float *grbuf, bs_t *bs, L12_scale_info *sci, int group_size)
+{ /* Reads 4 groups of group_size raw sample values per non-empty bitalloc slot into grbuf (scalefactors are not applied here) and returns group_size*4. */
+    int i, j, k, choff = 576; /* choff alternates between +576 and -558 so consecutive slots land in the two 576-float halves */
+    for (j = 0; j < 4; j++)
+    {
+        float *dst = grbuf + group_size*j; /* group j starts group_size floats after group j-1 */
+        for (i = 0; i < 2*sci->total_bands; i++)
+        {
+            int ba = sci->bitalloc[i];
+            if (ba != 0) /* slots with ba == 0 consume no bits and leave dst untouched */
+            {
+                if (ba < 17)
+                {
+                    int half = (1 << (ba - 1)) - 1; /* offset that centres the ba-bit code around zero */
+                    for (k = 0; k < group_size; k++)
+                    {
+                        dst[k] = (float)((int)get_bits(bs, ba) - half);
+                    }
+                } else
+                {
+                    unsigned mod = (2 << (ba - 17)) + 1;    /* 3, 5, 9 */
+                    unsigned code = get_bits(bs, mod + 2 - (mod >> 3));  /* 5, 7, 10 bits: one code packing group_size base-mod digits */
+                    for (k = 0; k < group_size; k++, code /= mod)
+                    {
+                        dst[k] = (float)((int)(code % mod - mod/2)); /* lowest digit first, centred by subtracting mod/2 */
+                    }
+                }
+            }
+            dst += choff;
+            choff = 18 - choff; /* +576 then -558: a net advance of 18 floats per pair of slots */
+        }
+    }
+    return group_size*4;
+}
+
+static void L12_apply_scf_384(L12_scale_info *sci, const float *scf, float *dst)
+{ /* Scales 12 samples per band in both 576-float halves of dst, after copying the bands at/above stereo_bands from the first half into the second. */
+    int i, k;
+    memcpy(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float)); /* 18 floats per band; bands stereo_bands..total_bands-1 of the second half become copies of the first half */
+    for (i = 0; i < sci->total_bands; i++, dst += 18, scf += 6) /* each band advances dst by 18 floats and scf by 6 entries */
+    {
+        for (k = 0; k < 12; k++)
+        {
+            dst[k + 0]   *= scf[0]; /* first half uses scf[0] of the band's six entries */
+            dst[k + 576] *= scf[3]; /* second half uses scf[3] */
+        }
+    }
+}
+#endif /* MINIMP3_ONLY_MP3 */
+
+static int L3_read_side_info(bs_t *bs, L3_gr_info_t *gr, const uint8_t *hdr)
+{ /* Parses the side information from bs into consecutive gr[] entries; returns main_data_begin, or -1 when a field is rejected or the declared part_23 lengths do not fit. */
+    static const uint8_t g_scf_long[8][23] = { /* band-width rows selected by sr_idx for the default (long) layout; each row ends with 0 */
+        { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
+        { 12,12,12,12,12,12,16,20,24,28,32,40,48,56,64,76,90,2,2,2,2,2,0 },
+        { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
+        { 6,6,6,6,6,6,8,10,12,14,16,18,22,26,32,38,46,54,62,70,76,36,0 },
+        { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
+        { 4,4,4,4,4,4,6,6,8,8,10,12,16,20,24,28,34,42,50,54,76,158,0 },
+        { 4,4,4,4,4,4,6,6,6,8,10,12,16,18,22,28,34,40,46,54,54,192,0 },
+        { 4,4,4,4,4,4,6,6,8,10,12,16,20,24,30,38,46,56,68,84,102,26,0 }
+    };
+    static const uint8_t g_scf_short[8][40] = { /* rows used when block_type is SHORT_BLOCK_TYPE without mixed_block_flag; widths come in groups of three */
+        { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
+        { 8,8,8,8,8,8,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 },
+        { 4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 },
+        { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 },
+        { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
+        { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 },
+        { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 },
+        { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 }
+    };
+    static const uint8_t g_scf_mixed[8][40] = { /* rows used when block_type is SHORT_BLOCK_TYPE with mixed_block_flag set; unlisted trailing entries are zero-initialised */
+        { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
+        { 12,12,12,4,4,4,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 },
+        { 6,6,6,6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 },
+        { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 },
+        { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
+        { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 },
+        { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 },
+        { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 }
+    };
+
+    unsigned tables, scfsi = 0;
+    int main_data_begin, part_23_sum = 0;
+    int sr_idx = HDR_GET_MY_SAMPLE_RATE(hdr); sr_idx -= (sr_idx != 0); /* table row: 0 stays 0, any other value is reduced by one */
+    int gr_count = HDR_IS_MONO(hdr) ? 1 : 2; /* number of gr[] entries to fill; doubled below for MPEG1 */
+
+    if (HDR_TEST_MPEG1(hdr))
+    {
+        gr_count *= 2;
+        main_data_begin = get_bits(bs, 9);
+        scfsi = get_bits(bs, 7 + gr_count); /* 9 bits when gr_count is 2, 11 bits when it is 4 */
+    } else
+    {
+        main_data_begin = get_bits(bs, 8 + gr_count) >> gr_count; /* the low gr_count bits read here are discarded */
+    }
+
+    do
+    {
+        if (HDR_IS_MONO(hdr))
+        {
+            scfsi <<= 4; /* extra shift per entry for mono, on top of the shift at the end of the loop body */
+        }
+        gr->part_23_length = (uint16_t)get_bits(bs, 12);
+        part_23_sum += gr->part_23_length; /* accumulated for the size check after the loop */
+        gr->big_values = (uint16_t)get_bits(bs,  9);
+        if (gr->big_values > 288)
+        {
+            return -1; /* reject: more than 288 big-value pairs */
+        }
+        gr->global_gain = (uint8_t)get_bits(bs, 8);
+        gr->scalefac_compress = (uint16_t)get_bits(bs, HDR_TEST_MPEG1(hdr) ? 4 : 9);
+        gr->sfbtab = g_scf_long[sr_idx]; /* defaults: long layout, 22 long bands, no short bands */
+        gr->n_long_sfb  = 22;
+        gr->n_short_sfb = 0;
+        if (get_bits(bs, 1)) /* 1-bit flag: an explicit block_type and subblock gains follow */
+        {
+            gr->block_type = (uint8_t)get_bits(bs, 2);
+            if (!gr->block_type)
+            {
+                return -1; /* reject: block_type 0 is not accepted in this branch */
+            }
+            gr->mixed_block_flag = (uint8_t)get_bits(bs, 1);
+            gr->region_count[0] = 7; /* region counts are fixed in this branch rather than read from the stream */
+            gr->region_count[1] = 255;
+            if (gr->block_type == SHORT_BLOCK_TYPE)
+            {
+                scfsi &= 0x0F0F; /* clears bits 4..7 and 12..15 (and everything above) of the running scfsi word */
+                if (!gr->mixed_block_flag)
+                {
+                    gr->region_count[0] = 8;
+                    gr->sfbtab = g_scf_short[sr_idx];
+                    gr->n_long_sfb = 0;
+                    gr->n_short_sfb = 39;
+                } else
+                {
+                    gr->sfbtab = g_scf_mixed[sr_idx];
+                    gr->n_long_sfb = HDR_TEST_MPEG1(hdr) ? 8 : 6;
+                    gr->n_short_sfb = 30;
+                }
+            }
+            tables = get_bits(bs, 10); /* only two 5-bit table selectors are present here */
+            tables <<= 5; /* align with the 15-bit layout below, leaving table_select[2] as 0 */
+            gr->subblock_gain[0] = (uint8_t)get_bits(bs, 3);
+            gr->subblock_gain[1] = (uint8_t)get_bits(bs, 3);
+            gr->subblock_gain[2] = (uint8_t)get_bits(bs, 3);
+        } else
+        {
+            gr->block_type = 0;
+            gr->mixed_block_flag = 0;
+            tables = get_bits(bs, 15); /* three 5-bit table selectors */
+            gr->region_count[0] = (uint8_t)get_bits(bs, 4);
+            gr->region_count[1] = (uint8_t)get_bits(bs, 3);
+            gr->region_count[2] = 255;
+        }
+        gr->table_select[0] = (uint8_t)(tables >> 10);
+        gr->table_select[1] = (uint8_t)((tables >> 5) & 31);
+        gr->table_select[2] = (uint8_t)((tables) & 31);
+        gr->preflag = HDR_TEST_MPEG1(hdr) ? get_bits(bs, 1) : (gr->scalefac_compress >= 500); /* read from the stream for MPEG1, otherwise derived from scalefac_compress */
+        gr->scalefac_scale = (uint8_t)get_bits(bs, 1);
+        gr->count1_table = (uint8_t)get_bits(bs, 1);
+        gr->scfsi = (uint8_t)((scfsi >> 12) & 15); /* bits 12..15 of the running word belong to this entry */
+        scfsi <<= 4; /* move the next entry's four bits into position */
+        gr++;
+    } while(--gr_count);
+
+    if (part_23_sum + bs->pos > bs->limit + main_data_begin*8) /* declared lengths exceed what remains in bs plus main_data_begin bytes */
+    {
+        return -1;
+    }
+
+    return main_data_begin;
+}
+
+static void L3_read_scalefactors(uint8_t *scf, uint8_t *ist_pos, const uint8_t *scf_size, const uint8_t *scf_count, bs_t *bitbuf, int scfsi)
+{ /* Fills scf (and updates ist_pos) partition by partition: up to 4 partitions of scf_count[i] entries, each entry scf_size[i] bits wide, then appends three zero entries to scf. */
+    int i, k;
+    for (i = 0; i < 4 && scf_count[i]; i++, scfsi *= 2) /* stops at the first zero count; doubling scfsi brings the next partition's flag to bit 3 */
+    {
+        int cnt = scf_count[i];
+        if (scfsi & 8)
+        {
+            memcpy(scf, ist_pos, cnt); /* flag set: reuse the values already in ist_pos, read nothing from the stream */
+        } else
+        {
+            int bits = scf_size[i];
+            if (!bits)
+            {
+                memset(scf, 0, cnt); /* zero-width partition: all entries are 0 in both arrays */
+                memset(ist_pos, 0, cnt);
+            } else
+            {
+                int max_scf = (scfsi < 0) ? (1 << bits) - 1 : -1; /* a negative scfsi enables the all-ones check; -1 otherwise, which no read value can equal */
+                for (k = 0; k < cnt; k++)
+                {
+                    int s = get_bits(bitbuf, bits);
+                    ist_pos[k] = (s == max_scf ? -1 : s); /* the all-ones value is stored as -1, i.e. 255 in the uint8_t array */
+                    scf[k] = s;
+                }
+            }
+        }
+        ist_pos += cnt;
+        scf += cnt;
+    }
+    scf[0] = scf[1] = scf[2] = 0; /* three zero entries follow the last partition, so the caller's buffer needs room for them */
+}
+
+static float L3_ldexp_q2(float y, int exp_q2)
+{ /* Returns y scaled by 2^(-exp_q2/4), applied in steps of at most 120 quarter-units. NOTE(review): a negative exp_q2 would give a negative shift count below; presumably callers pass exp_q2 >= 0 -- confirm */
+    static const float g_expfrac[4] = { 9.31322575e-10f,7.83145814e-10f,6.58544508e-10f,5.53767716e-10f }; /* approx. 2^-30 * 2^(-k/4) for k = 0..3 */
+    int e;
+    do
+    {
+        e = MINIMP3_MIN(30*4, exp_q2); /* portion handled this iteration, capped so the shift below stays within 30 bits */
+        y *= g_expfrac[e & 3]*(1 << 30 >> (e >> 2)); /* 2^-30 * 2^(-(e & 3)/4) * 2^(30 - (e >> 2)) = 2^(-e/4) */
+    } while ((exp_q2 -= e) > 0);
+    return y;
+}
+
+static void L3_decode_scalefactors(const uint8_t *hdr, uint8_t *ist_pos, bs_t *bs, const L3_gr_info_t *gr, float *scf, int ch)
+{ /* Reads the integer scalefactors for one gr entry (channel index ch) and writes one float multiplier per band, n_long_sfb + n_short_sfb in total, into scf. */
+    static const uint8_t g_scf_partitions[3][28] = { /* per-partition entry counts, in groups of four; row chosen by block layout, group by the offset computed below */
+        { 6,5,5, 5,6,5,5,5,6,5, 7,3,11,10,0,0, 7, 7, 7,0, 6, 6,6,3, 8, 8,5,0 },
+        { 8,9,6,12,6,9,9,9,6,9,12,6,15,18,0,0, 6,15,12,0, 6,12,9,6, 6,18,9,0 },
+        { 9,9,6,12,9,9,9,9,9,9,12,6,18,18,0,0,12,12,12,0,12, 9,9,6,15,12,9,0 }
+    };
+    const uint8_t *scf_partition = g_scf_partitions[!!gr->n_short_sfb + !gr->n_long_sfb]; /* row 0: no short bands; row 1: both long and short bands; row 2: no long bands */
+    uint8_t scf_size[4], iscf[40];
+    int i, scf_shift = gr->scalefac_scale + 1, gain_exp, scfsi = gr->scfsi;
+    float gain;
+
+    if (HDR_TEST_MPEG1(hdr))
+    {
+        static const uint8_t g_scfc_decode[16] = { 0,1,2,3, 12,5,6,7, 9,10,11,13, 14,15,18,19 }; /* scalefac_compress -> two packed bit widths */
+        int part = g_scfc_decode[gr->scalefac_compress];
+        scf_size[1] = scf_size[0] = (uint8_t)(part >> 2); /* width for partitions 0 and 1 */
+        scf_size[3] = scf_size[2] = (uint8_t)(part & 3); /* width for partitions 2 and 3 */
+    } else
+    {
+        static const uint8_t g_mod[6*4] = { 5,5,4,4,5,5,4,1,4,3,1,1,5,6,6,1,4,4,4,1,4,3,1,1 }; /* six rows of four radices; rows 3..5 are used when ist is 1 */
+        int k, modprod, sfc, ist = HDR_TEST_I_STEREO(hdr) && ch; /* ist is 1 only for a non-zero ch with the I_STEREO header test set */
+        sfc = gr->scalefac_compress >> ist;
+        for (k = ist*3*4; sfc >= 0; sfc -= modprod, k += 4) /* try successive g_mod rows, subtracting each row's product, until sfc goes negative */
+        {
+            for (modprod = 1, i = 3; i >= 0; i--)
+            {
+                scf_size[i] = (uint8_t)(sfc / modprod % g_mod[k + i]); /* mixed-radix digit i of sfc, least significant at i = 3 */
+                modprod *= g_mod[k + i];
+            }
+        }
+        scf_partition += k; /* k is now 4 past the row whose digits were kept; that offset selects the matching group of counts */
+        scfsi = -16; /* negative with bit 3 clear: every partition is read from the stream and the all-ones check in L3_read_scalefactors is enabled */
+    }
+    L3_read_scalefactors(iscf, ist_pos, scf_size, scf_partition, bs, scfsi);
+
+    if (gr->n_short_sfb)
+    {
+        int sh = 3 - scf_shift;
+        for (i = 0; i < gr->n_short_sfb; i += 3) /* short bands come in triples, one entry per subblock_gain index */
+        {
+            iscf[gr->n_long_sfb + i + 0] += gr->subblock_gain[0] << sh;
+            iscf[gr->n_long_sfb + i + 1] += gr->subblock_gain[1] << sh;
+            iscf[gr->n_long_sfb + i + 2] += gr->subblock_gain[2] << sh;
+        }
+    } else if (gr->preflag)
+    {
+        static const uint8_t g_preamp[10] = { 1,1,1,1,2,2,3,3,3,2 };
+        for (i = 0; i < 10; i++)
+        {
+            iscf[11 + i] += g_preamp[i]; /* fixed additions to bands 11..20 */
+        }
+    }
+
+    gain_exp = gr->global_gain + BITS_DEQUANTIZER_OUT*4 - 210 - (HDR_IS_MS_STEREO(hdr) ? 2 : 0);
+    gain = L3_ldexp_q2(1 << (MAX_SCFI/4),  MAX_SCFI - gain_exp); /* 2^(MAX_SCFI/4) * 2^(-(MAX_SCFI - gain_exp)/4) */
+    for (i = 0; i < (int)(gr->n_long_sfb + gr->n_short_sfb); i++)
+    {
+        scf[i] = L3_ldexp_q2(gain, iscf[i] << scf_shift); /* each band: gain scaled down by 2^(-(iscf[i] << scf_shift)/4) */
+    }
+}
+
+static const float g_pow43[129 + 16] = { /* x^(4/3) lookup: the first 16 entries hold -(i^(4/3)) for i = 0..15, the remaining 129 hold i^(4/3) for i = 0..128, so index 16 + i is the positive value */
+    0,-1,-2.519842f,-4.326749f,-6.349604f,-8.549880f,-10.902724f,-13.390518f,-16.000000f,-18.720754f,-21.544347f,-24.463781f,-27.473142f,-30.567351f,-33.741992f,-36.993181f,
+    0,1,2.519842f,4.326749f,6.349604f,8.549880f,10.902724f,13.390518f,16.000000f,18.720754f,21.544347f,24.463781f,27.473142f,30.567351f,33.741992f,36.993181f,40.317474f,43.711787f,47.173345f,50.699631f,54.288352f,57.937408f,61.644865f,65.408941f,69.227979f,73.100443f,77.024898f,81.000000f,85.024491f,89.097188f,93.216975f,97.382800f,101.593667f,105.848633f,110.146801f,114.487321f,118.869381f,123.292209f,127.755065f,132.257246f,136.798076f,141.376907f,145.993119f,150.646117f,155.335327f,160.060199f,164.820202f,169.614826f,174.443577f,179.305980f,184.201575f,189.129918f,194.090580f,199.083145f,204.107210f,209.162385f,214.248292f,219.364564f,224.510845f,229.686789f,234.892058f,240.126328f,245.389280f,250.680604f,256.000000f,261.347174f,266.721841f,272.123723f,277.552547f,283.008049f,288.489971f,293.998060f,299.532071f,305.091761f,310.676898f,316.287249f,321.922592f,327.582707f,333.267377f,338.976394f,344.709550f,350.466646f,356.247482f,362.051866f,367.879608f,373.730522f,379.604427f,385.501143f,391.420496f,397.362314f,403.326427f,409.312672f,415.320884f,421.350905f,427.402579f,433.475750f,439.570269f,445.685987f,451.822757f,457.980436f,464.158883f,470.357960f,476.577530f,482.817459f,489.077615f,495.357868f,501.658090f,507.978156f,514.317941f,520.677324f,527.056184f,533.454404f,539.871867f,546.308458f,552.764065f,559.238575f,565.731879f,572.243870f,578.774440f,585.323483f,591.890898f,598.476581f,605.080431f,611.702349f,618.342238f,625.000000f,631.675540f,638.368763f,645.079578f
+};
+
+static float L3_pow_43(int x)
+{ /* Approximates x^(4/3): a direct g_pow43 lookup for x < 129, otherwise a lookup at a reduced argument corrected by a quadratic term and rescaled. */
+    float frac;
+    int sign, mult = 256; /* 256 = 64^(4/3): scale used when x itself is reduced by 64 */
+
+    if (x < 129)
+    {
+        return g_pow43[16 + x]; /* exact table entry; NOTE(review): x as low as -16 would still index inside the table, but callers in view pass non-negative values -- confirm */
+    }
+
+    if (x < 1024)
+    {
+        mult = 16; /* x is pre-multiplied by 8 below, so the overall reduction is by 8 and 8^(4/3) = 16 */
+        x <<= 3;
+    }
+
+    sign = 2*x & 64; /* 64 when the low six bits of x are >= 32, i.e. round to the nearest multiple of 64 */
+    frac = (float)((x & 63) - sign) / ((x & ~63) + sign); /* signed remainder relative to the rounded value */
+    return g_pow43[16 + ((x + sign) >> 6)]*(1.f + frac*((4.f/3) + frac*(2.f/9)))*mult; /* 1 + (4/3)f + (2/9)f^2 is the second-order expansion of (1 + f)^(4/3) */
+}
+
+static void L3_huffman(float *dst, bs_t *bs, const L3_gr_info_t *gr_info, const float *scf, int layer3gr_limit)
+{ /* Decodes the coded values of one gr entry from bs into dst, multiplying by the per-band factors in scf, and finally sets bs->pos to layer3gr_limit. */
+    static const int16_t tabs[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        785,785,785,785,784,784,784,784,513,513,513,513,513,513,513,513,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,
+        -255,1313,1298,1282,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,290,288,
+        -255,1313,1298,1282,769,769,769,769,529,529,529,529,529,529,529,529,528,528,528,528,528,528,528,528,512,512,512,512,512,512,512,512,290,288,
+        -253,-318,-351,-367,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,819,818,547,547,275,275,275,275,561,560,515,546,289,274,288,258,
+        -254,-287,1329,1299,1314,1312,1057,1057,1042,1042,1026,1026,784,784,784,784,529,529,529,529,529,529,529,529,769,769,769,769,768,768,768,768,563,560,306,306,291,259,
+        -252,-413,-477,-542,1298,-575,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-383,-399,1107,1092,1106,1061,849,849,789,789,1104,1091,773,773,1076,1075,341,340,325,309,834,804,577,577,532,532,516,516,832,818,803,816,561,561,531,531,515,546,289,289,288,258,
+        -252,-429,-493,-559,1057,1057,1042,1042,529,529,529,529,529,529,529,529,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,-382,1077,-415,1106,1061,1104,849,849,789,789,1091,1076,1029,1075,834,834,597,581,340,340,339,324,804,833,532,532,832,772,818,803,817,787,816,771,290,290,290,290,288,258,
+        -253,-349,-414,-447,-463,1329,1299,-479,1314,1312,1057,1057,1042,1042,1026,1026,785,785,785,785,784,784,784,784,769,769,769,769,768,768,768,768,-319,851,821,-335,836,850,805,849,341,340,325,336,533,533,579,579,564,564,773,832,578,548,563,516,321,276,306,291,304,259,
+        -251,-572,-733,-830,-863,-879,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,1396,1351,1381,1366,1395,1335,1380,-559,1334,1138,1138,1063,1063,1350,1392,1031,1031,1062,1062,1364,1363,1120,1120,1333,1348,881,881,881,881,375,374,359,373,343,358,341,325,791,791,1123,1122,-703,1105,1045,-719,865,865,790,790,774,774,1104,1029,338,293,323,308,-799,-815,833,788,772,818,803,816,322,292,307,320,561,531,515,546,289,274,288,258,
+        -251,-525,-605,-685,-765,-831,-846,1298,1057,1057,1312,1282,785,785,785,785,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,1399,1398,1383,1367,1382,1396,1351,-511,1381,1366,1139,1139,1079,1079,1124,1124,1364,1349,1363,1333,882,882,882,882,807,807,807,807,1094,1094,1136,1136,373,341,535,535,881,775,867,822,774,-591,324,338,-671,849,550,550,866,864,609,609,293,336,534,534,789,835,773,-751,834,804,308,307,833,788,832,772,562,562,547,547,305,275,560,515,290,290,
+        -252,-397,-477,-557,-622,-653,-719,-735,-750,1329,1299,1314,1057,1057,1042,1042,1312,1282,1024,1024,785,785,785,785,784,784,784,784,769,769,769,769,-383,1127,1141,1111,1126,1140,1095,1110,869,869,883,883,1079,1109,882,882,375,374,807,868,838,881,791,-463,867,822,368,263,852,837,836,-543,610,610,550,550,352,336,534,534,865,774,851,821,850,805,593,533,579,564,773,832,578,578,548,548,577,577,307,276,306,291,516,560,259,259,
+        -250,-2107,-2507,-2764,-2909,-2974,-3007,-3023,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-767,-1052,-1213,-1277,-1358,-1405,-1469,-1535,-1550,-1582,-1614,-1647,-1662,-1694,-1726,-1759,-1774,-1807,-1822,-1854,-1886,1565,-1919,-1935,-1951,-1967,1731,1730,1580,1717,-1983,1729,1564,-1999,1548,-2015,-2031,1715,1595,-2047,1714,-2063,1610,-2079,1609,-2095,1323,1323,1457,1457,1307,1307,1712,1547,1641,1700,1699,1594,1685,1625,1442,1442,1322,1322,-780,-973,-910,1279,1278,1277,1262,1276,1261,1275,1215,1260,1229,-959,974,974,989,989,-943,735,478,478,495,463,506,414,-1039,1003,958,1017,927,942,987,957,431,476,1272,1167,1228,-1183,1256,-1199,895,895,941,941,1242,1227,1212,1135,1014,1014,490,489,503,487,910,1013,985,925,863,894,970,955,1012,847,-1343,831,755,755,984,909,428,366,754,559,-1391,752,486,457,924,997,698,698,983,893,740,740,908,877,739,739,667,667,953,938,497,287,271,271,683,606,590,712,726,574,302,302,738,736,481,286,526,725,605,711,636,724,696,651,589,681,666,710,364,467,573,695,466,466,301,465,379,379,709,604,665,679,316,316,634,633,436,436,464,269,424,394,452,332,438,363,347,408,393,448,331,422,362,407,392,421,346,406,391,376,375,359,1441,1306,-2367,1290,-2383,1337,-2399,-2415,1426,1321,-2431,1411,1336,-2447,-2463,-2479,1169,1169,1049,1049,1424,1289,1412,1352,1319,-2495,1154,1154,1064,1064,1153,1153,416,390,360,404,403,389,344,374,373,343,358,372,327,357,342,311,356,326,1395,1394,1137,1137,1047,1047,1365,1392,1287,1379,1334,1364,1349,1378,1318,1363,792,792,792,792,1152,1152,1032,1032,1121,1121,1046,1046,1120,1120,1030,1030,-2895,1106,1061,1104,849,849,789,789,1091,1076,1029,1090,1060,1075,833,833,309,324,532,532,832,772,818,803,561,561,531,560,515,546,289,274,288,258,
+        -250,-1179,-1579,-1836,-1996,-2124,-2253,-2333,-2413,-2477,-2542,-2574,-2607,-2622,-2655,1314,1313,1298,1312,1282,785,785,785,785,1040,1040,1025,1025,768,768,768,768,-766,-798,-830,-862,-895,-911,-927,-943,-959,-975,-991,-1007,-1023,-1039,-1055,-1070,1724,1647,-1103,-1119,1631,1767,1662,1738,1708,1723,-1135,1780,1615,1779,1599,1677,1646,1778,1583,-1151,1777,1567,1737,1692,1765,1722,1707,1630,1751,1661,1764,1614,1736,1676,1763,1750,1645,1598,1721,1691,1762,1706,1582,1761,1566,-1167,1749,1629,767,766,751,765,494,494,735,764,719,749,734,763,447,447,748,718,477,506,431,491,446,476,461,505,415,430,475,445,504,399,460,489,414,503,383,474,429,459,502,502,746,752,488,398,501,473,413,472,486,271,480,270,-1439,-1455,1357,-1471,-1487,-1503,1341,1325,-1519,1489,1463,1403,1309,-1535,1372,1448,1418,1476,1356,1462,1387,-1551,1475,1340,1447,1402,1386,-1567,1068,1068,1474,1461,455,380,468,440,395,425,410,454,364,467,466,464,453,269,409,448,268,432,1371,1473,1432,1417,1308,1460,1355,1446,1459,1431,1083,1083,1401,1416,1458,1445,1067,1067,1370,1457,1051,1051,1291,1430,1385,1444,1354,1415,1400,1443,1082,1082,1173,1113,1186,1066,1185,1050,-1967,1158,1128,1172,1097,1171,1081,-1983,1157,1112,416,266,375,400,1170,1142,1127,1065,793,793,1169,1033,1156,1096,1141,1111,1155,1080,1126,1140,898,898,808,808,897,897,792,792,1095,1152,1032,1125,1110,1139,1079,1124,882,807,838,881,853,791,-2319,867,368,263,822,852,837,866,806,865,-2399,851,352,262,534,534,821,836,594,594,549,549,593,593,533,533,848,773,579,579,564,578,548,563,276,276,577,576,306,291,516,560,305,305,275,259,
+        -251,-892,-2058,-2620,-2828,-2957,-3023,-3039,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,-559,1530,-575,-591,1528,1527,1407,1526,1391,1023,1023,1023,1023,1525,1375,1268,1268,1103,1103,1087,1087,1039,1039,1523,-604,815,815,815,815,510,495,509,479,508,463,507,447,431,505,415,399,-734,-782,1262,-815,1259,1244,-831,1258,1228,-847,-863,1196,-879,1253,987,987,748,-767,493,493,462,477,414,414,686,669,478,446,461,445,474,429,487,458,412,471,1266,1264,1009,1009,799,799,-1019,-1276,-1452,-1581,-1677,-1757,-1821,-1886,-1933,-1997,1257,1257,1483,1468,1512,1422,1497,1406,1467,1496,1421,1510,1134,1134,1225,1225,1466,1451,1374,1405,1252,1252,1358,1480,1164,1164,1251,1251,1238,1238,1389,1465,-1407,1054,1101,-1423,1207,-1439,830,830,1248,1038,1237,1117,1223,1148,1236,1208,411,426,395,410,379,269,1193,1222,1132,1235,1221,1116,976,976,1192,1162,1177,1220,1131,1191,963,963,-1647,961,780,-1663,558,558,994,993,437,408,393,407,829,978,813,797,947,-1743,721,721,377,392,844,950,828,890,706,706,812,859,796,960,948,843,934,874,571,571,-1919,690,555,689,421,346,539,539,944,779,918,873,932,842,903,888,570,570,931,917,674,674,-2575,1562,-2591,1609,-2607,1654,1322,1322,1441,1441,1696,1546,1683,1593,1669,1624,1426,1426,1321,1321,1639,1680,1425,1425,1305,1305,1545,1668,1608,1623,1667,1592,1638,1666,1320,1320,1652,1607,1409,1409,1304,1304,1288,1288,1664,1637,1395,1395,1335,1335,1622,1636,1394,1394,1319,1319,1606,1621,1392,1392,1137,1137,1137,1137,345,390,360,375,404,373,1047,-2751,-2767,-2783,1062,1121,1046,-2799,1077,-2815,1106,1061,789,789,1105,1104,263,355,310,340,325,354,352,262,339,324,1091,1076,1029,1090,1060,1075,833,833,788,788,1088,1028,818,818,803,803,561,561,531,531,816,771,546,546,289,274,288,258,
+        -253,-317,-381,-446,-478,-509,1279,1279,-811,-1179,-1451,-1756,-1900,-2028,-2189,-2253,-2333,-2414,-2445,-2511,-2526,1313,1298,-2559,1041,1041,1040,1040,1025,1025,1024,1024,1022,1007,1021,991,1020,975,1019,959,687,687,1018,1017,671,671,655,655,1016,1015,639,639,758,758,623,623,757,607,756,591,755,575,754,559,543,543,1009,783,-575,-621,-685,-749,496,-590,750,749,734,748,974,989,1003,958,988,973,1002,942,987,957,972,1001,926,986,941,971,956,1000,910,985,925,999,894,970,-1071,-1087,-1102,1390,-1135,1436,1509,1451,1374,-1151,1405,1358,1480,1420,-1167,1507,1494,1389,1342,1465,1435,1450,1326,1505,1310,1493,1373,1479,1404,1492,1464,1419,428,443,472,397,736,526,464,464,486,457,442,471,484,482,1357,1449,1434,1478,1388,1491,1341,1490,1325,1489,1463,1403,1309,1477,1372,1448,1418,1433,1476,1356,1462,1387,-1439,1475,1340,1447,1402,1474,1324,1461,1371,1473,269,448,1432,1417,1308,1460,-1711,1459,-1727,1441,1099,1099,1446,1386,1431,1401,-1743,1289,1083,1083,1160,1160,1458,1445,1067,1067,1370,1457,1307,1430,1129,1129,1098,1098,268,432,267,416,266,400,-1887,1144,1187,1082,1173,1113,1186,1066,1050,1158,1128,1143,1172,1097,1171,1081,420,391,1157,1112,1170,1142,1127,1065,1169,1049,1156,1096,1141,1111,1155,1080,1126,1154,1064,1153,1140,1095,1048,-2159,1125,1110,1137,-2175,823,823,1139,1138,807,807,384,264,368,263,868,838,853,791,867,822,852,837,866,806,865,790,-2319,851,821,836,352,262,850,805,849,-2399,533,533,835,820,336,261,578,548,563,577,532,532,832,772,562,562,547,547,305,275,560,515,290,290,288,258 };
+    static const uint8_t tab32[] = { 130,162,193,209,44,28,76,140,9,9,9,9,9,9,9,9,190,254,222,238,126,94,157,157,109,61,173,205 }; /* quadruple codebook used when count1_table is 0 */
+    static const uint8_t tab33[] = { 252,236,220,204,188,172,156,140,124,108,92,76,60,44,28,12 }; /* quadruple codebook used when count1_table is non-zero */
+    static const int16_t tabindex[2*16] = { 0,32,64,98,0,132,180,218,292,364,426,538,648,746,0,1126,1460,1460,1460,1460,1460,1460,1460,1460,1842,1842,1842,1842,1842,1842,1842,1842 }; /* start offset inside tabs for each table_select value */
+    static const uint8_t g_linbits[] =  { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,6,8,10,13,4,5,6,7,8,9,11,13 }; /* number of extra bits that follow an escape value (15), per table_select value */
+
+#define PEEK_BITS(n)  (bs_cache >> (32 - n)) /* top n bits of the cache, without consuming them */
+#define FLUSH_BITS(n) { bs_cache <<= (n); bs_sh += (n); } /* consume n bits; bs_sh counts how far the cache has drained */
+#define CHECK_BITS    while (bs_sh >= 0) { bs_cache |= (uint32_t)*bs_next_ptr++ << bs_sh; bs_sh -= 8; } /* refill the cache one byte at a time while at least 8 bits are free */
+#define BSPOS         ((bs_next_ptr - bs->buf)*8 - 24 + bs_sh) /* current bit position within bs->buf, reconstructed from the read pointer and bs_sh */
+
+    float one = 0.0f; /* factor of the current band; reloaded from scf whenever a new band starts */
+    int ireg = 0, big_val_cnt = gr_info->big_values; /* big_val_cnt counts the value pairs still to decode in the pair-coded part */
+    const uint8_t *sfb = gr_info->sfbtab;
+    const uint8_t *bs_next_ptr = bs->buf + bs->pos/8;
+    uint32_t bs_cache = (((bs_next_ptr[0]*256u + bs_next_ptr[1])*256u + bs_next_ptr[2])*256u + bs_next_ptr[3]) << (bs->pos & 7); /* preload 4 bytes big-endian and drop the bits of the first byte that lie before bs->pos */
+    int pairs_to_decode, np, bs_sh = (bs->pos & 7) - 8;
+    bs_next_ptr += 4;
+
+    while (big_val_cnt > 0) /* one iteration per region; each region has its own table_select and region_count entry */
+    {
+        int tab_num = gr_info->table_select[ireg];
+        int sfb_cnt = gr_info->region_count[ireg++];
+        const int16_t *codebook = tabs + tabindex[tab_num];
+        int linbits = g_linbits[tab_num];
+        if (linbits) /* tables with escape bits take this path; the else-branch is the same loop without the escape handling */
+        {
+            do
+            {
+                np = *sfb++ / 2; /* number of value pairs in this band */
+                pairs_to_decode = MINIMP3_MIN(big_val_cnt, np);
+                one = *scf++;
+                do
+                {
+                    int j, w = 5;
+                    int leaf = codebook[PEEK_BITS(w)]; /* first lookup uses 5 bits */
+                    while (leaf < 0) /* a negative entry links to a sub-table: low 3 bits give the next lookup width, leaf >> 3 the (negated) sub-table offset */
+                    {
+                        FLUSH_BITS(w);
+                        w = leaf & 7;
+                        leaf = codebook[PEEK_BITS(w) - (leaf >> 3)];
+                    }
+                    FLUSH_BITS(leaf >> 8); /* bits above the low byte give the number of code bits to consume */
+
+                    for (j = 0; j < 2; j++, dst++, leaf >>= 4) /* two values per leaf, one per nibble, low nibble first */
+                    {
+                        int lsb = leaf & 0x0F;
+                        if (lsb == 15) /* escape: the magnitude is 15 plus the next linbits bits */
+                        {
+                            lsb += PEEK_BITS(linbits);
+                            FLUSH_BITS(linbits);
+                            CHECK_BITS;
+                            *dst = one*L3_pow_43(lsb)*((int32_t)bs_cache < 0 ? -1: 1); /* the top cache bit is the sign */
+                        } else
+                        {
+                            *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one; /* a set top cache bit selects the negative half of g_pow43 */
+                        }
+                        FLUSH_BITS(lsb ? 1 : 0); /* a sign bit is consumed only for non-zero values */
+                    }
+                    CHECK_BITS;
+                } while (--pairs_to_decode);
+            } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0); /* leave the region when the pairs run out or its band count is used up */
+        } else
+        {
+            do
+            {
+                np = *sfb++ / 2;
+                pairs_to_decode = MINIMP3_MIN(big_val_cnt, np);
+                one = *scf++;
+                do
+                {
+                    int j, w = 5;
+                    int leaf = codebook[PEEK_BITS(w)];
+                    while (leaf < 0)
+                    {
+                        FLUSH_BITS(w);
+                        w = leaf & 7;
+                        leaf = codebook[PEEK_BITS(w) - (leaf >> 3)];
+                    }
+                    FLUSH_BITS(leaf >> 8);
+
+                    for (j = 0; j < 2; j++, dst++, leaf >>= 4)
+                    {
+                        int lsb = leaf & 0x0F;
+                        *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one;
+                        FLUSH_BITS(lsb ? 1 : 0);
+                    }
+                    CHECK_BITS;
+                } while (--pairs_to_decode);
+            } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0);
+        }
+    }
+
+    for (np = 1 - big_val_cnt;; dst += 4) /* quadruple-coded part: big_val_cnt is <= 0 here and -big_val_cnt pairs remain in the current band; +1 because RELOAD_SCALEFACTOR pre-decrements */
+    {
+        const uint8_t *codebook_count1 = (gr_info->count1_table) ? tab33 : tab32;
+        int leaf = codebook_count1[PEEK_BITS(4)];
+        if (!(leaf & 8)) /* entries without bit 3 need a second lookup using (leaf & 3) further bits */
+        {
+            leaf = codebook_count1[(leaf >> 3) + (bs_cache << 4 >> (32 - (leaf & 3)))];
+        }
+        FLUSH_BITS(leaf & 7); /* low 3 bits give the code length */
+        if (BSPOS > layer3gr_limit) /* the code just read extends past the limit: stop without storing it */
+        {
+            break;
+        }
+#define RELOAD_SCALEFACTOR  if (!--np) { np = *sfb++/2; if (!np) break; one = *scf++; } /* on a band boundary load the next width and factor; a zero width ends the loop */
+#define DEQ_COUNT1(s) if (leaf & (128 >> s)) { dst[s] = ((int32_t)bs_cache < 0) ? -one : one; FLUSH_BITS(1) } /* a set leaf bit means dst[s] is +/-one, sign taken from the next stream bit */
+        RELOAD_SCALEFACTOR;
+        DEQ_COUNT1(0);
+        DEQ_COUNT1(1);
+        RELOAD_SCALEFACTOR;
+        DEQ_COUNT1(2);
+        DEQ_COUNT1(3);
+        CHECK_BITS;
+    }
+
+    bs->pos = layer3gr_limit; /* the reader always resumes at the limit, wherever decoding actually stopped */
+}
+
+static void L3_midside_stereo(float *left, int n)
+{ /* In place, for i in [0, n): left[i] becomes left[i] + right[i] and right[i] becomes left[i] - right[i], where right is left + 576. */
+    int i = 0;
+    float *right = left + 576; /* the second channel sits 576 floats after the first */
+#if HAVE_SIMD
+    if (have_simd()) for (; i < n - 3; i += 4) /* vector path: four samples per step; the scalar loop below finishes the remainder */
+    {
+        f4 vl = VLD(left + i);
+        f4 vr = VLD(right + i);
+        VSTORE(left + i, VADD(vl, vr));
+        VSTORE(right + i, VSUB(vl, vr));
+    }
+#endif /* HAVE_SIMD */
+    for (; i < n; i++) /* scalar path: continues from wherever the vector loop stopped (or from 0) */
+    {
+        float a = left[i];
+        float b = right[i];
+        left[i] = a + b;
+        right[i] = a - b;
+    }
+}
+
+static void L3_intensity_stereo_band(float *left, int n, float kl, float kr)
+{ /* For n samples, derives both channels from left: the sample 576 floats later becomes left[i]*kr and left[i] itself becomes left[i]*kl. */
+    int i;
+    for (i = 0; i < n; i++)
+    {
+        left[i + 576] = left[i]*kr; /* written first, while left[i] still holds the unscaled value */
+        left[i] = left[i]*kl;
+    }
+}
+
+static void L3_stereo_top_band(const float *right, const uint8_t *sfb, int nbands, int max_band[3]) /* max_band[r] = highest band i < nbands with i % 3 == r that holds a non-zero sample; -1 if none */
+{
+    int i, k;
+
+    max_band[0] = max_band[1] = max_band[2] = -1;
+
+    for (i = 0; i < nbands; i++)
+    {
+        for (k = 0; k < sfb[i]; k += 2) /* sfb[i] is the width of band i; samples are tested in pairs */
+        {
+            if (right[k] != 0 || right[k + 1] != 0) /* NOTE(review): reads right[k + 1], so band widths are presumably even - confirm */
+            {
+                max_band[i % 3] = i;
+                break; /* one non-zero pair is enough for this band */
+            }
+        }
+        right += sfb[i]; /* advance to the first sample of the next band */
+    }
+}
+
+static void L3_stereo_process(float *left, const uint8_t *ist_pos, const uint8_t *sfb, const uint8_t *hdr, int max_band[3], int mpeg2_sh) /* per band: intensity stereo, mid/side, or untouched; stops at the first zero-width entry of sfb[] */
+{
+    static const float g_pan[7*2] = { 0,1,0.21132487f,0.78867513f,0.36602540f,0.63397460f,0.5f,0.5f,0.63397460f,0.36602540f,0.78867513f,0.21132487f,1,0 }; /* (kl, kr) pairs indexed by ipos 0..6, used on the MPEG1 path */
+    unsigned i, max_pos = HDR_TEST_MPEG1(hdr) ? 7 : 64; /* positions >= max_pos do not select intensity stereo */
+
+    for (i = 0; sfb[i]; i++)
+    {
+        unsigned ipos = ist_pos[i];
+        if ((int)i > max_band[i % 3] && ipos < max_pos) /* only bands above the last non-zero band recorded for this i % 3 */
+        {
+            float kl, kr, s = HDR_TEST_MS_STEREO(hdr) ? 1.41421356f : 1; /* both gains get an extra 1.41421356 factor when the MS flag is also set */
+            if (HDR_TEST_MPEG1(hdr))
+            {
+                kl = g_pan[2*ipos];
+                kr = g_pan[2*ipos + 1];
+            } else
+            {
+                kl = 1;
+                kr = L3_ldexp_q2(1, (ipos + 1) >> 1 << mpeg2_sh);
+                if (ipos & 1) /* odd positions swap the roles of the two gains */
+                {
+                    kl = kr;
+                    kr = 1;
+                }
+            }
+            L3_intensity_stereo_band(left, sfb[i], kl*s, kr*s);
+        } else if (HDR_TEST_MS_STEREO(hdr)) /* not an intensity band: apply mid/side when flagged */
+        {
+            L3_midside_stereo(left, sfb[i]);
+        }
+        left += sfb[i]; /* advance to the next band */
+    }
+}
+
+static void L3_intensity_stereo(float *left, uint8_t *ist_pos, const L3_gr_info_t *gr, const uint8_t *hdr) /* finds the top non-zero bands of the channel at left + 576, fills the last ist_pos entries, then runs L3_stereo_process */
+{
+    int max_band[3], n_sfb = gr->n_long_sfb + gr->n_short_sfb;
+    int i, max_blocks = gr->n_short_sfb ? 3 : 1; /* three trailing entries when short bands are present, otherwise one */
+
+    L3_stereo_top_band(left + 576, gr->sfbtab, n_sfb, max_band);
+    if (gr->n_long_sfb)
+    {
+        max_band[0] = max_band[1] = max_band[2] = MINIMP3_MAX(MINIMP3_MAX(max_band[0], max_band[1]), max_band[2]); /* with long bands all three slots share the overall maximum */
+    }
+    for (i = 0; i < max_blocks; i++)
+    {
+        int default_pos = HDR_TEST_MPEG1(hdr) ? 3 : 0;
+        int itop = n_sfb - max_blocks + i;
+        int prev = itop - max_blocks;
+        ist_pos[itop] = max_band[i] >= prev ? default_pos : ist_pos[prev]; /* default when non-zero data reaches band prev, else reuse the position of band prev */
+    }
+    L3_stereo_process(left, ist_pos, gr->sfbtab, hdr, max_band, gr[1].scalefac_compress & 1); /* the low bit of the next gr_info entry's scalefac_compress becomes mpeg2_sh */
+}
+
+static void L3_reorder(float *grbuf, float *scratch, const uint8_t *sfb) /* interleaves three consecutive len-sample runs (a0 b0 c0 a1 b1 c1 ...) via scratch, then copies the result back over grbuf */
+{
+    int i, len;
+    float *src = grbuf, *dst = scratch;
+
+    for (;0 != (len = *sfb); sfb += 3, src += 2*len) /* a zero width ends the walk; sfb steps 3 entries; src skips the two runs already consumed */
+    {
+        for (i = 0; i < len; i++, src++)
+        {
+            *dst++ = src[0*len];
+            *dst++ = src[1*len];
+            *dst++ = src[2*len];
+        }
+    }
+    memcpy(grbuf, scratch, (dst - scratch)*sizeof(float)); /* copies exactly the number of floats written to scratch */
+}
+
+static void L3_antialias(float *grbuf, int nbands) /* 8-tap butterfly across each of nbands boundaries between consecutive 18-sample blocks, in place */
+{
+    static const float g_aa[2][8] = { /* the two butterfly coefficients for each of the 8 taps */
+        {0.85749293f,0.88174200f,0.94962865f,0.98331459f,0.99551782f,0.99916056f,0.99989920f,0.99999316f},
+        {0.51449576f,0.47173197f,0.31337745f,0.18191320f,0.09457419f,0.04096558f,0.01419856f,0.00369997f}
+    };
+
+    for (; nbands > 0; nbands--, grbuf += 18)
+    {
+        int i = 0;
+#if HAVE_SIMD
+        if (have_simd()) for (; i < 8; i += 4)
+        {
+            f4 vu = VLD(grbuf + 18 + i);
+            f4 vd = VLD(grbuf + 14 - i); /* loads grbuf[14 - i .. 17 - i], the same samples the scalar loop reaches via 17 - i */
+            f4 vc0 = VLD(g_aa[0] + i);
+            f4 vc1 = VLD(g_aa[1] + i);
+            vd = VREV(vd); /* presumably reverses lane order to match the mirrored scalar indexing - confirm against the VREV definition */
+            VSTORE(grbuf + 18 + i, VSUB(VMUL(vu, vc0), VMUL(vd, vc1)));
+            vd = VADD(VMUL(vu, vc1), VMUL(vd, vc0));
+            VSTORE(grbuf + 14 - i, VREV(vd));
+        }
+#endif /* HAVE_SIMD */
+#ifndef MINIMP3_ONLY_SIMD
+        for(; i < 8; i++)
+        {
+            float u = grbuf[18 + i]; /* sample i of the upper block */
+            float d = grbuf[17 - i]; /* mirrored sample at the end of the lower block */
+            grbuf[18 + i] = u*g_aa[0][i] - d*g_aa[1][i];
+            grbuf[17 - i] = u*g_aa[1][i] + d*g_aa[0][i];
+        }
+#endif /* MINIMP3_ONLY_SIMD */
+    }
+}
+
+static void L3_dct3_9(float *y) /* in-place 9-point transform of y[0..8]; the name suggests a DCT-III */
+{
+    float s0, s1, s2, s3, s4, s5, s6, s7, s8, t0, t2, t4;
+
+    s0 = y[0]; s2 = y[2]; s4 = y[4]; s6 = y[6]; s8 = y[8]; /* even-indexed inputs are processed first */
+    t0 = s0 + s6*0.5f;
+    s0 -= s6;
+    t4 = (s4 + s2)*0.93969262f;
+    t2 = (s8 + s2)*0.76604444f;
+    s6 = (s4 - s8)*0.17364818f;
+    s4 += s8 - s2;
+
+    s2 = s0 - s4*0.5f;
+    y[4] = s4 + s0; /* y[4] is final here; the other outputs are written at the end */
+    s8 = t0 - t2 + s6;
+    s0 = t0 - t4 + t2;
+    s4 = t0 + t4 - s6;
+
+    s1 = y[1]; s3 = y[3]; s5 = y[5]; s7 = y[7]; /* odd-indexed inputs; the t0/t2/t4 temporaries are reused */
+
+    s3 *= 0.86602540f;
+    t0 = (s5 + s1)*0.98480775f;
+    t4 = (s5 - s7)*0.34202014f;
+    t2 = (s1 + s7)*0.64278761f;
+    s1 = (s1 - s5 - s7)*0.86602540f;
+
+    s5 = t0 - s3 - t2;
+    s7 = t4 - s3 - t0;
+    s3 = t4 + s3 - t2;
+
+    y[0] = s4 - s7; /* each remaining output is a sum or difference of one even-part and one odd-part term */
+    y[1] = s2 + s1;
+    y[2] = s0 - s3;
+    y[3] = s8 + s5;
+    y[5] = s8 - s5;
+    y[6] = s0 + s3;
+    y[7] = s2 - s1;
+    y[8] = s4 + s7;
+}
+
+static void L3_imdct36(float *grbuf, float *overlap, const float *window, int nbands) /* per 18-sample band: transform, then windowed overlap-add with overlap[0..8]; output in place, overlap updated for the next call */
+{
+    int i, j;
+    static const float g_twid9[18] = { /* entries [0..8] and [9..17] are used as twiddle pairs below */
+        0.73727734f,0.79335334f,0.84339145f,0.88701083f,0.92387953f,0.95371695f,0.97629601f,0.99144486f,0.99904822f,0.67559021f,0.60876143f,0.53729961f,0.46174861f,0.38268343f,0.30070580f,0.21643961f,0.13052619f,0.04361938f
+    };
+
+    for (j = 0; j < nbands; j++, grbuf += 18, overlap += 9) /* 18 samples and 9 overlap values per band */
+    {
+        float co[9], si[9]; /* inputs (and then outputs) of the two 9-point transforms */
+        co[0] = -grbuf[0];
+        si[0] = grbuf[17];
+        for (i = 0; i < 4; i++)
+        {
+            si[8 - 2*i] =   grbuf[4*i + 1] - grbuf[4*i + 2];
+            co[1 + 2*i] =   grbuf[4*i + 1] + grbuf[4*i + 2];
+            si[7 - 2*i] =   grbuf[4*i + 4] - grbuf[4*i + 3];
+            co[2 + 2*i] = -(grbuf[4*i + 3] + grbuf[4*i + 4]);
+        }
+        L3_dct3_9(co);
+        L3_dct3_9(si);
+
+        si[1] = -si[1]; /* odd-indexed si outputs are negated */
+        si[3] = -si[3];
+        si[5] = -si[5];
+        si[7] = -si[7];
+
+        i = 0;
+
+#if HAVE_SIMD
+        if (have_simd()) for (; i < 8; i += 4) /* SIMD covers i = 0..7; i = 8 is handled by the scalar loop */
+        {
+            f4 vovl = VLD(overlap + i);
+            f4 vc = VLD(co + i);
+            f4 vs = VLD(si + i);
+            f4 vr0 = VLD(g_twid9 + i);
+            f4 vr1 = VLD(g_twid9 + 9 + i);
+            f4 vw0 = VLD(window + i);
+            f4 vw1 = VLD(window + 9 + i);
+            f4 vsum = VADD(VMUL(vc, vr1), VMUL(vs, vr0));
+            VSTORE(overlap + i, VSUB(VMUL(vc, vr0), VMUL(vs, vr1)));
+            VSTORE(grbuf + i, VSUB(VMUL(vovl, vw0), VMUL(vsum, vw1)));
+            vsum = VADD(VMUL(vovl, vw1), VMUL(vsum, vw0));
+            VSTORE(grbuf + 14 - i, VREV(vsum)); /* vector counterpart of the scalar grbuf[17 - i] store */
+        }
+#endif /* HAVE_SIMD */
+        for (; i < 9; i++)
+        {
+            float ovl  = overlap[i]; /* saved before overlap[i] is overwritten two lines below */
+            float sum  = co[i]*g_twid9[9 + i] + si[i]*g_twid9[0 + i];
+            overlap[i] = co[i]*g_twid9[0 + i] - si[i]*g_twid9[9 + i];
+            grbuf[i]      = ovl*window[0 + i] - sum*window[9 + i];
+            grbuf[17 - i] = ovl*window[9 + i] + sum*window[0 + i];
+        }
+    }
+}
+
+static void L3_idct3(float x0, float x1, float x2, float *dst) /* 3-point transform: writes dst[0..2] from x0, x1, x2 */
+{
+    float m1 = x1*0.86602540f;
+    float a1 = x0 - x2*0.5f;
+    dst[1] = x0 + x2; /* the middle output does not depend on x1 */
+    dst[0] = a1 + m1;
+    dst[2] = a1 - m1;
+}
+
+static void L3_imdct12(float *x, float *dst, float *overlap) /* reads x at stride 3 (x[0], x[3], ..., x[15]), writes dst[0..5] and updates overlap[0..2] */
+{
+    static const float g_twid3[6] = { 0.79335334f,0.92387953f,0.99144486f, 0.60876143f,0.38268343f,0.13052619f }; /* used as twiddle pairs ([i], [3 + i]) and, index-reversed, as the output weights */
+    float co[3], si[3];
+    int i;
+
+    L3_idct3(-x[0], x[6] + x[3], x[12] + x[9], co);
+    L3_idct3(x[15], x[12] - x[9], x[6] - x[3], si);
+    si[1] = -si[1];
+
+    for (i = 0; i < 3; i++)
+    {
+        float ovl  = overlap[i]; /* saved before overlap[i] is overwritten */
+        float sum  = co[i]*g_twid3[3 + i] + si[i]*g_twid3[0 + i];
+        overlap[i] = co[i]*g_twid3[0 + i] - si[i]*g_twid3[3 + i];
+        dst[i]     = ovl*g_twid3[2 - i] - sum*g_twid3[5 - i];
+        dst[5 - i] = ovl*g_twid3[5 - i] + sum*g_twid3[2 - i];
+    }
+}
+
+static void L3_imdct_short(float *grbuf, float *overlap, int nbands) /* per 18-sample band: three 12-point transforms on the interleaved inputs tmp + 0, tmp + 1, tmp + 2 */
+{
+    for (;nbands > 0; nbands--, overlap += 9, grbuf += 18)
+    {
+        float tmp[18];
+        memcpy(tmp, grbuf, sizeof(tmp)); /* grbuf is overwritten below, so the transforms read from a copy */
+        memcpy(grbuf, overlap, 6*sizeof(float)); /* the first 6 outputs are the saved overlap, unchanged */
+        L3_imdct12(tmp, grbuf + 6, overlap + 6);
+        L3_imdct12(tmp + 1, grbuf + 12, overlap + 6);
+        L3_imdct12(tmp + 2, overlap, overlap + 6); /* the third transform writes into overlap[0..5] for the next call */
+    }
+}
+
+static void L3_change_sign(float *grbuf) /* negates the odd-indexed samples of every odd 18-sample band (bands 1, 3, ..., 31) */
+{
+    int b, i;
+    for (b = 0, grbuf += 18; b < 32; b += 2, grbuf += 36) /* starts at band 1 and steps two bands at a time */
+        for (i = 1; i < 18; i += 2)
+            grbuf[i] = -grbuf[i];
+}
+
+static void L3_imdct_gr(float *grbuf, float *overlap, unsigned block_type, unsigned n_long_bands) /* long transform on the first n_long_bands bands, then short or long transform on the remaining 32 - n_long_bands */
+{
+    static const float g_mdct_window[2][18] = { /* row 0 is the default window; row 1 is selected when block_type == STOP_BLOCK_TYPE */
+        { 0.99904822f,0.99144486f,0.97629601f,0.95371695f,0.92387953f,0.88701083f,0.84339145f,0.79335334f,0.73727734f,0.04361938f,0.13052619f,0.21643961f,0.30070580f,0.38268343f,0.46174861f,0.53729961f,0.60876143f,0.67559021f },
+        { 1,1,1,1,1,1,0.99144486f,0.92387953f,0.79335334f,0,0,0,0,0,0,0.13052619f,0.38268343f,0.60876143f }
+    };
+    if (n_long_bands)
+    {
+        L3_imdct36(grbuf, overlap, g_mdct_window[0], n_long_bands); /* leading long bands always use window row 0 */
+        grbuf += 18*n_long_bands;
+        overlap += 9*n_long_bands;
+    }
+    if (block_type == SHORT_BLOCK_TYPE)
+        L3_imdct_short(grbuf, overlap, 32 - n_long_bands);
+    else
+        L3_imdct36(grbuf, overlap, g_mdct_window[block_type == STOP_BLOCK_TYPE], 32 - n_long_bands);
+}
+
+static void L3_save_reservoir(mp3dec_t *h, mp3dec_scratch_t *s) /* keeps the unread tail of s->maindata (at most MAX_BITRESERVOIR_BYTES) in h->reserv_buf */
+{
+    int pos = (s->bs.pos + 7)/8u; /* read position rounded up to a whole byte */
+    int remains = s->bs.limit/8u - pos;
+    if (remains > MAX_BITRESERVOIR_BYTES)
+    {
+        pos += remains - MAX_BITRESERVOIR_BYTES; /* drop the oldest bytes, keep the newest MAX_BITRESERVOIR_BYTES */
+        remains = MAX_BITRESERVOIR_BYTES;
+    }
+    if (remains > 0)
+    {
+        memmove(h->reserv_buf, s->maindata + pos, remains);
+    }
+    h->reserv = remains; /* stored as-is even when it is <= 0 */
+}
+
+static int L3_restore_reservoir(mp3dec_t *h, bs_t *bs, mp3dec_scratch_t *s, int main_data_begin) /* builds s->maindata from saved reservoir bytes plus the rest of this frame; returns non-zero when the reservoir held at least main_data_begin bytes */
+{
+    int frame_bytes = (bs->limit - bs->pos)/8; /* bytes of the current frame not yet consumed */
+    int bytes_have = MINIMP3_MIN(h->reserv, main_data_begin);
+    memcpy(s->maindata, h->reserv_buf + MINIMP3_MAX(0, h->reserv - main_data_begin), MINIMP3_MIN(h->reserv, main_data_begin)); /* takes the last bytes_have bytes of the reservoir */
+    memcpy(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes);
+    bs_init(&s->bs, s->maindata, bytes_have + frame_bytes);
+    return h->reserv >= main_data_begin;
+}
+
+static void L3_decode(mp3dec_t *h, mp3dec_scratch_t *s, L3_gr_info_t *gr_info, int nch) /* one granule: scalefactors + Huffman per channel, stereo processing, then reorder/antialias/IMDCT per channel */
+{
+    int ch;
+
+    for (ch = 0; ch < nch; ch++)
+    {
+        int layer3gr_limit = s->bs.pos + gr_info[ch].part_23_length; /* bit position where this channel's part_23_length bits end */
+        L3_decode_scalefactors(h->header, s->ist_pos[ch], &s->bs, gr_info + ch, s->scf, ch);
+        L3_huffman(s->grbuf[ch], &s->bs, gr_info + ch, s->scf, layer3gr_limit);
+    }
+
+    if (HDR_TEST_I_STEREO(h->header))
+    {
+        L3_intensity_stereo(s->grbuf[0], s->ist_pos[1], gr_info, h->header);
+    } else if (HDR_IS_MS_STEREO(h->header))
+    {
+        L3_midside_stereo(s->grbuf[0], 576); /* mid/side over the whole 576-sample granule */
+    }
+
+    for (ch = 0; ch < nch; ch++, gr_info++)
+    {
+        int aa_bands = 31;
+        int n_long_bands = (gr_info->mixed_block_flag ? 2 : 0) << (int)(HDR_GET_MY_SAMPLE_RATE(h->header) == 2); /* 0 without mixed blocks; otherwise 2, doubled when HDR_GET_MY_SAMPLE_RATE == 2 */
+
+        if (gr_info->n_short_sfb)
+        {
+            aa_bands = n_long_bands - 1; /* antialias only the boundaries between long bands */
+            L3_reorder(s->grbuf[ch] + n_long_bands*18, s->syn[0], gr_info->sfbtab + gr_info->n_long_sfb); /* s->syn[0] serves as scratch space here */
+        }
+
+        L3_antialias(s->grbuf[ch], aa_bands);
+        L3_imdct_gr(s->grbuf[ch], h->mdct_overlap[ch], gr_info->block_type, n_long_bands);
+        L3_change_sign(s->grbuf[ch]);
+    }
+}
+
+static void mp3d_DCT_II(float *grbuf, int n) /* in place, for each column k in [0, n): transforms the 32 values grbuf[k + 18*j], j = 0..31 */
+{
+    static const float g_sec[24] = { /* three scale factors per first-stage step i (indices 3*i + 0..2) */
+        10.19000816f,0.50060302f,0.50241929f,3.40760851f,0.50547093f,0.52249861f,2.05778098f,0.51544732f,0.56694406f,1.48416460f,0.53104258f,0.64682180f,1.16943991f,0.55310392f,0.78815460f,0.97256821f,0.58293498f,1.06067765f,0.83934963f,0.62250412f,1.72244716f,0.74453628f,0.67480832f,5.10114861f
+    };
+    int i, k = 0;
+#if HAVE_SIMD
+    if (have_simd()) for (; k < n; k += 4) /* four columns per iteration, one per vector lane */
+    {
+        f4 t[4][8], *x;
+        float *y = grbuf + k;
+
+        for (x = t[0], i = 0; i < 8; i++, x++) /* first stage: fold the 32 inputs into four groups of 8 */
+        {
+            f4 x0 = VLD(&y[i*18]);
+            f4 x1 = VLD(&y[(15 - i)*18]);
+            f4 x2 = VLD(&y[(16 + i)*18]);
+            f4 x3 = VLD(&y[(31 - i)*18]);
+            f4 t0 = VADD(x0, x3);
+            f4 t1 = VADD(x1, x2);
+            f4 t2 = VMUL_S(VSUB(x1, x2), g_sec[3*i + 0]);
+            f4 t3 = VMUL_S(VSUB(x0, x3), g_sec[3*i + 1]);
+            x[0] = VADD(t0, t1);
+            x[8] = VMUL_S(VSUB(t0, t1), g_sec[3*i + 2]);
+            x[16] = VADD(t3, t2);
+            x[24] = VMUL_S(VSUB(t3, t2), g_sec[3*i + 2]);
+        }
+        for (x = t[0], i = 0; i < 4; i++, x += 8) /* second stage: the same 8-point network on each group */
+        {
+            f4 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt;
+            xt = VSUB(x0, x7); x0 = VADD(x0, x7);
+            x7 = VSUB(x1, x6); x1 = VADD(x1, x6);
+            x6 = VSUB(x2, x5); x2 = VADD(x2, x5);
+            x5 = VSUB(x3, x4); x3 = VADD(x3, x4);
+            x4 = VSUB(x0, x3); x0 = VADD(x0, x3);
+            x3 = VSUB(x1, x2); x1 = VADD(x1, x2);
+            x[0] = VADD(x0, x1);
+            x[4] = VMUL_S(VSUB(x0, x1), 0.70710677f);
+            x5 = VADD(x5, x6);
+            x6 = VMUL_S(VADD(x6, x7), 0.70710677f);
+            x7 = VADD(x7, xt);
+            x3 = VMUL_S(VADD(x3, x4), 0.70710677f);
+            x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); /* rotate by PI/8 */
+            x7 = VADD(x7, VMUL_S(x5, 0.382683432f));
+            x5 = VSUB(x5, VMUL_S(x7, 0.198912367f));
+            x0 = VSUB(xt, x6); xt = VADD(xt, x6);
+            x[1] = VMUL_S(VADD(xt, x7), 0.50979561f);
+            x[2] = VMUL_S(VADD(x4, x3), 0.54119611f);
+            x[3] = VMUL_S(VSUB(x0, x5), 0.60134488f);
+            x[5] = VMUL_S(VADD(x0, x5), 0.89997619f);
+            x[6] = VMUL_S(VSUB(x4, x3), 1.30656302f);
+            x[7] = VMUL_S(VSUB(xt, x7), 2.56291556f);
+        }
+
+        if (k > n - 3) /* at most two columns remain: VSAVE2 stores only the low two lanes */
+        {
+#if HAVE_SSE
+#define VSAVE2(i, v) _mm_storel_pi((__m64 *)(void*)&y[i*18], v)
+#else /* HAVE_SSE */
+#define VSAVE2(i, v) vst1_f32((float32_t *)&y[i*18],  vget_low_f32(v))
+#endif /* HAVE_SSE */
+            for (i = 0; i < 7; i++, y += 4*18)
+            {
+                f4 s = VADD(t[3][i], t[3][i + 1]);
+                VSAVE2(0, t[0][i]);
+                VSAVE2(1, VADD(t[2][i], s));
+                VSAVE2(2, VADD(t[1][i], t[1][i + 1]));
+                VSAVE2(3, VADD(t[2][1 + i], s));
+            }
+            VSAVE2(0, t[0][7]);
+            VSAVE2(1, VADD(t[2][7], t[3][7]));
+            VSAVE2(2, t[1][7]);
+            VSAVE2(3, t[3][7]);
+        } else
+        {
+#define VSAVE4(i, v) VSTORE(&y[i*18], v)
+            for (i = 0; i < 7; i++, y += 4*18) /* same output pattern as above, storing all four lanes */
+            {
+                f4 s = VADD(t[3][i], t[3][i + 1]);
+                VSAVE4(0, t[0][i]);
+                VSAVE4(1, VADD(t[2][i], s));
+                VSAVE4(2, VADD(t[1][i], t[1][i + 1]));
+                VSAVE4(3, VADD(t[2][1 + i], s));
+            }
+            VSAVE4(0, t[0][7]);
+            VSAVE4(1, VADD(t[2][7], t[3][7]));
+            VSAVE4(2, t[1][7]);
+            VSAVE4(3, t[3][7]);
+        }
+    } else
+#endif /* HAVE_SIMD */
+#ifdef MINIMP3_ONLY_SIMD
+    {}
+#else /* MINIMP3_ONLY_SIMD */
+    for (; k < n; k++) /* scalar path: one column per iteration, same two stages as the SIMD path */
+    {
+        float t[4][8], *x, *y = grbuf + k;
+
+        for (x = t[0], i = 0; i < 8; i++, x++) /* first stage: fold the 32 inputs into four groups of 8 */
+        {
+            float x0 = y[i*18];
+            float x1 = y[(15 - i)*18];
+            float x2 = y[(16 + i)*18];
+            float x3 = y[(31 - i)*18];
+            float t0 = x0 + x3;
+            float t1 = x1 + x2;
+            float t2 = (x1 - x2)*g_sec[3*i + 0];
+            float t3 = (x0 - x3)*g_sec[3*i + 1];
+            x[0] = t0 + t1;
+            x[8] = (t0 - t1)*g_sec[3*i + 2];
+            x[16] = t3 + t2;
+            x[24] = (t3 - t2)*g_sec[3*i + 2];
+        }
+        for (x = t[0], i = 0; i < 4; i++, x += 8) /* second stage: the same 8-point network on each group */
+        {
+            float x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt;
+            xt = x0 - x7; x0 += x7;
+            x7 = x1 - x6; x1 += x6;
+            x6 = x2 - x5; x2 += x5;
+            x5 = x3 - x4; x3 += x4;
+            x4 = x0 - x3; x0 += x3;
+            x3 = x1 - x2; x1 += x2;
+            x[0] = x0 + x1;
+            x[4] = (x0 - x1)*0.70710677f;
+            x5 =  x5 + x6;
+            x6 = (x6 + x7)*0.70710677f;
+            x7 =  x7 + xt;
+            x3 = (x3 + x4)*0.70710677f;
+            x5 -= x7*0.198912367f;  /* rotate by PI/8 */
+            x7 += x5*0.382683432f;
+            x5 -= x7*0.198912367f;
+            x0 = xt - x6; xt += x6;
+            x[1] = (xt + x7)*0.50979561f;
+            x[2] = (x4 + x3)*0.54119611f;
+            x[3] = (x0 - x5)*0.60134488f;
+            x[5] = (x0 + x5)*0.89997619f;
+            x[6] = (x4 - x3)*1.30656302f;
+            x[7] = (xt - x7)*2.56291556f;
+
+        }
+        for (i = 0; i < 7; i++, y += 4*18) /* write back four outputs per step, combining neighbouring entries of t[1] and t[3] */
+        {
+            y[0*18] = t[0][i];
+            y[1*18] = t[2][i] + t[3][i] + t[3][i + 1];
+            y[2*18] = t[1][i] + t[1][i + 1];
+            y[3*18] = t[2][i + 1] + t[3][i] + t[3][i + 1];
+        }
+        y[0*18] = t[0][7]; /* last four outputs have no i + 1 neighbour */
+        y[1*18] = t[2][7] + t[3][7];
+        y[2*18] = t[1][7];
+        y[3*18] = t[3][7];
+    }
+#endif /* MINIMP3_ONLY_SIMD */
+}
+
+#ifndef MINIMP3_FLOAT_OUTPUT
+static int16_t mp3d_scale_pcm(float sample) /* int16 build: converts one sample to a saturated, rounded int16_t */
+{
+#if HAVE_ARMV6
+    int32_t s32 = (int32_t)(sample + .5f);
+    s32 -= (s32 < 0);
+    int16_t s = (int16_t)minimp3_clip_int16_arm(s32);
+#else
+    if (sample >=  32766.5) return (int16_t) 32767; /* saturate before the narrowing cast */
+    if (sample <= -32767.5) return (int16_t)-32768;
+    int16_t s = (int16_t)(sample + .5f); /* NOTE(review): the cast truncates toward zero, so inputs in (-1.5, -0.5) yield 0 rather than -1 - confirm this is intended */
+    s -= (s < 0);   /* away from zero, to be compliant */
+#endif
+    return s;
+}
+#else /* MINIMP3_FLOAT_OUTPUT */
+static float mp3d_scale_pcm(float sample) /* float build: scales the sample by 1/32768, no clipping */
+{
+    return sample*(1.f/32768.f);
+}
+#endif /* MINIMP3_FLOAT_OUTPUT */
+
+static void mp3d_synth_pair(mp3d_sample_t *pcm, int nch, const float *z) /* computes two output samples from z[] read at stride 64: pcm[0] and pcm[16*nch] */
+{
+    float a;
+    a  = (z[14*64] - z[    0]) * 29; /* first sample: 15 rows z[0*64]..z[14*64], paired symmetrically around row 7 */
+    a += (z[ 1*64] + z[13*64]) * 213;
+    a += (z[12*64] - z[ 2*64]) * 459;
+    a += (z[ 3*64] + z[11*64]) * 2037;
+    a += (z[10*64] - z[ 4*64]) * 5153;
+    a += (z[ 5*64] + z[ 9*64]) * 6574;
+    a += (z[ 8*64] - z[ 6*64]) * 37489;
+    a +=  z[ 7*64]             * 75038;
+    pcm[0] = mp3d_scale_pcm(a);
+
+    z += 2; /* the second sample reads the entries two floats further on */
+    a  = z[14*64] * 104; /* second sample: only the even rows 0..14 contribute */
+    a += z[12*64] * 1567;
+    a += z[10*64] * 9727;
+    a += z[ 8*64] * 64019;
+    a += z[ 6*64] * -9975;
+    a += z[ 4*64] * -45;
+    a += z[ 2*64] * 146;
+    a += z[ 0*64] * -5;
+    pcm[16*nch] = mp3d_scale_pcm(a);
+}
+
+static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins) /* windowed synthesis for two input rows: writes output indices 0..63 per channel (stride nch) to dstl/dstr, using and extending the history in lins */
+{
+    int i;
+    float *xr = xl + 576*(nch - 1); /* with nch == 1, xr aliases xl and dstr aliases dstl */
+    mp3d_sample_t *dstr = dstl + (nch - 1);
+
+    static const float g_win[] = { /* 15 rows of 16 coefficients; one row is consumed per loop iteration below */
+        -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992,
+        -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856,
+        -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630,
+        -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313,
+        -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908,
+        -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415,
+        -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835,
+        -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169,
+        -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420,
+        -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590,
+        -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679,
+        -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692,
+        -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629,
+        -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494,
+        -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290
+    };
+    float *zlin = lins + 15*64; /* zlin starts 15 rows of 64 floats past lins */
+    const float *w = g_win;
+
+    zlin[4*15]     = xl[18*16]; /* each group of 4 zlin floats holds left, right, left, right values */
+    zlin[4*15 + 1] = xr[18*16];
+    zlin[4*15 + 2] = xl[0];
+    zlin[4*15 + 3] = xr[0];
+
+    zlin[4*31]     = xl[1 + 18*16];
+    zlin[4*31 + 1] = xr[1 + 18*16];
+    zlin[4*31 + 2] = xl[1];
+    zlin[4*31 + 3] = xr[1];
+
+    mp3d_synth_pair(dstr, nch, lins + 4*15 + 1); /* these four calls produce output indices 0, 16, 32 and 48 of each channel */
+    mp3d_synth_pair(dstr + 32*nch, nch, lins + 4*15 + 64 + 1);
+    mp3d_synth_pair(dstl, nch, lins + 4*15);
+    mp3d_synth_pair(dstl + 32*nch, nch, lins + 4*15 + 64);
+
+#if HAVE_SIMD
+    if (have_simd()) for (i = 14; i >= 0; i--) /* each iteration produces 8 outputs: 4 per channel */
+    {
+#define VLOAD(k) f4 w0 = VSET(*w++); f4 w1 = VSET(*w++); f4 vz = VLD(&zlin[4*i - 64*k]); f4 vy = VLD(&zlin[4*i - 64*(15 - k)]);
+#define V0(k) { VLOAD(k) b =         VADD(VMUL(vz, w1), VMUL(vy, w0)) ; a =         VSUB(VMUL(vz, w0), VMUL(vy, w1));  }
+#define V1(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vz, w0), VMUL(vy, w1))); }
+#define V2(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vy, w1), VMUL(vz, w0))); }
+        f4 a, b;
+        zlin[4*i]     = xl[18*(31 - i)]; /* feed this iteration's inputs into zlin before accumulating */
+        zlin[4*i + 1] = xr[18*(31 - i)];
+        zlin[4*i + 2] = xl[1 + 18*(31 - i)];
+        zlin[4*i + 3] = xr[1 + 18*(31 - i)];
+        zlin[4*i + 64] = xl[1 + 18*(1 + i)];
+        zlin[4*i + 64 + 1] = xr[1 + 18*(1 + i)];
+        zlin[4*i - 64 + 2] = xl[18*(1 + i)];
+        zlin[4*i - 64 + 3] = xr[18*(1 + i)];
+
+        V0(0) V2(1) V1(2) V2(3) V1(4) V2(5) V1(6) V2(7) /* 8 steps, each consuming two coefficients from w */
+
+        {
+#ifndef MINIMP3_FLOAT_OUTPUT
+#if HAVE_SSE
+            static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
+            static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
+            __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), /* clamp to the int16 range, convert, pack a into lanes 0..3 and b into lanes 4..7 */
+                                           _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min)));
+            dstr[(15 - i)*nch] = _mm_extract_epi16(pcm8, 1);
+            dstr[(17 + i)*nch] = _mm_extract_epi16(pcm8, 5);
+            dstl[(15 - i)*nch] = _mm_extract_epi16(pcm8, 0);
+            dstl[(17 + i)*nch] = _mm_extract_epi16(pcm8, 4);
+            dstr[(47 - i)*nch] = _mm_extract_epi16(pcm8, 3);
+            dstr[(49 + i)*nch] = _mm_extract_epi16(pcm8, 7);
+            dstl[(47 - i)*nch] = _mm_extract_epi16(pcm8, 2);
+            dstl[(49 + i)*nch] = _mm_extract_epi16(pcm8, 6);
+#else /* HAVE_SSE */
+            int16x4_t pcma, pcmb;
+            a = VADD(a, VSET(0.5f));
+            b = VADD(b, VSET(0.5f));
+            pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); /* adds the all-ones compare mask (-1) for negative lanes, then narrows with saturation */
+            pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0)))));
+            vst1_lane_s16(dstr + (15 - i)*nch, pcma, 1);
+            vst1_lane_s16(dstr + (17 + i)*nch, pcmb, 1);
+            vst1_lane_s16(dstl + (15 - i)*nch, pcma, 0);
+            vst1_lane_s16(dstl + (17 + i)*nch, pcmb, 0);
+            vst1_lane_s16(dstr + (47 - i)*nch, pcma, 3);
+            vst1_lane_s16(dstr + (49 + i)*nch, pcmb, 3);
+            vst1_lane_s16(dstl + (47 - i)*nch, pcma, 2);
+            vst1_lane_s16(dstl + (49 + i)*nch, pcmb, 2);
+#endif /* HAVE_SSE */
+
+#else /* MINIMP3_FLOAT_OUTPUT */
+
+            static const f4 g_scale = { 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f };
+            a = VMUL(a, g_scale); /* float output: scale by 1/32768, no clipping */
+            b = VMUL(b, g_scale);
+#if HAVE_SSE
+            _mm_store_ss(dstr + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
+            _mm_store_ss(dstr + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
+            _mm_store_ss(dstl + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)));
+            _mm_store_ss(dstl + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 0, 0)));
+            _mm_store_ss(dstr + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));
+            _mm_store_ss(dstr + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3)));
+            _mm_store_ss(dstl + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
+            _mm_store_ss(dstl + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 2, 2)));
+#else /* HAVE_SSE */
+            vst1q_lane_f32(dstr + (15 - i)*nch, a, 1);
+            vst1q_lane_f32(dstr + (17 + i)*nch, b, 1);
+            vst1q_lane_f32(dstl + (15 - i)*nch, a, 0);
+            vst1q_lane_f32(dstl + (17 + i)*nch, b, 0);
+            vst1q_lane_f32(dstr + (47 - i)*nch, a, 3);
+            vst1q_lane_f32(dstr + (49 + i)*nch, b, 3);
+            vst1q_lane_f32(dstl + (47 - i)*nch, a, 2);
+            vst1q_lane_f32(dstl + (49 + i)*nch, b, 2);
+#endif /* HAVE_SSE */
+#endif /* MINIMP3_FLOAT_OUTPUT */
+        }
+    } else
+#endif /* HAVE_SIMD */
+#ifdef MINIMP3_ONLY_SIMD
+    {}
+#else /* MINIMP3_ONLY_SIMD */
+    for (i = 14; i >= 0; i--) /* scalar path: same accumulation, with a[] and b[] as 4-element arrays */
+    {
+#define LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64];
+#define S0(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j]  = vz[j]*w1 + vy[j]*w0, a[j]  = vz[j]*w0 - vy[j]*w1; }
+#define S1(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vz[j]*w0 - vy[j]*w1; }
+#define S2(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vy[j]*w1 - vz[j]*w0; }
+        float a[4], b[4];
+
+        zlin[4*i]     = xl[18*(31 - i)]; /* same eight zlin slots as the SIMD path (4*(i + 16) == 4*i + 64) */
+        zlin[4*i + 1] = xr[18*(31 - i)];
+        zlin[4*i + 2] = xl[1 + 18*(31 - i)];
+        zlin[4*i + 3] = xr[1 + 18*(31 - i)];
+        zlin[4*(i + 16)]   = xl[1 + 18*(1 + i)];
+        zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)];
+        zlin[4*(i - 16) + 2] = xl[18*(1 + i)];
+        zlin[4*(i - 16) + 3] = xr[18*(1 + i)];
+
+        S0(0) S2(1) S1(2) S2(3) S1(4) S2(5) S1(6) S2(7) /* 8 steps, each consuming two coefficients from w */
+
+        dstr[(15 - i)*nch] = mp3d_scale_pcm(a[1]); /* odd elements go to dstr, even elements to dstl */
+        dstr[(17 + i)*nch] = mp3d_scale_pcm(b[1]);
+        dstl[(15 - i)*nch] = mp3d_scale_pcm(a[0]);
+        dstl[(17 + i)*nch] = mp3d_scale_pcm(b[0]);
+        dstr[(47 - i)*nch] = mp3d_scale_pcm(a[3]);
+        dstr[(49 + i)*nch] = mp3d_scale_pcm(b[3]);
+        dstl[(47 - i)*nch] = mp3d_scale_pcm(a[2]);
+        dstl[(49 + i)*nch] = mp3d_scale_pcm(b[2]);
+    }
+#endif /* MINIMP3_ONLY_SIMD */
+}
+
+static void mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, mp3d_sample_t *pcm, float *lins) /* DCT per channel, then synthesis two rows at a time; qmf_state carries 15*64 floats of history between calls */
+{
+    int i;
+    for (i = 0; i < nch; i++)
+    {
+        mp3d_DCT_II(grbuf + 576*i, nbands); /* each channel occupies 576 floats of grbuf */
+    }
+
+    memcpy(lins, qmf_state, sizeof(float)*15*64); /* lins starts with the history saved by the previous call */
+
+    for (i = 0; i < nbands; i += 2)
+    {
+        mp3d_synth(grbuf + i, pcm + 32*nch*i, nch, lins + i*64); /* pcm advances 64*nch samples per call, lins by two rows */
+    }
+#ifndef MINIMP3_NONSTANDARD_BUT_LOGICAL
+    if (nch == 1)
+    {
+        for (i = 0; i < 15*64; i += 2) /* mono: only the even-indexed entries are copied back */
+        {
+            qmf_state[i] = lins[nbands*64 + i];
+        }
+    } else
+#endif /* MINIMP3_NONSTANDARD_BUT_LOGICAL */
+    {
+        memcpy(qmf_state, lins + nbands*64, sizeof(float)*15*64); /* save the last 15 rows as history for the next call */
+    }
+}
+
+static int mp3d_match_frame(const uint8_t *hdr, int mp3_bytes, int frame_bytes) /* returns 1 when the following frame headers (up to MAX_FRAME_SYNC_MATCHES) compare equal to hdr, else 0 */
+{
+    int i, nmatch;
+    for (i = 0, nmatch = 0; nmatch < MAX_FRAME_SYNC_MATCHES; nmatch++)
+    {
+        i += hdr_frame_bytes(hdr + i, frame_bytes) + hdr_padding(hdr + i); /* hop to where the next header is expected */
+        if (i + HDR_SIZE > mp3_bytes)
+            return nmatch > 0; /* out of data: accept only if at least one later header already matched */
+        if (!hdr_compare(hdr, hdr + i))
+            return 0;
+    }
+    return 1;
+}
+
+static int mp3d_find_frame(const uint8_t *mp3, int mp3_bytes, int *free_format_bytes, int *ptr_frame_bytes) /* scans for a frame start: returns its offset and sets *ptr_frame_bytes; returns mp3_bytes with *ptr_frame_bytes = 0 when none is found */
+{
+    int i, k;
+    for (i = 0; i < mp3_bytes - HDR_SIZE; i++, mp3++) /* mp3 advances together with i, so it always points at the current candidate */
+    {
+        if (hdr_valid(mp3))
+        {
+            int frame_bytes = hdr_frame_bytes(mp3, *free_format_bytes);
+            int frame_and_padding = frame_bytes + hdr_padding(mp3);
+
+            for (k = HDR_SIZE; !frame_bytes && k < MAX_FREE_FORMAT_FRAME_SIZE && i + 2*k < mp3_bytes - HDR_SIZE; k++) /* runs only while frame_bytes == 0: searches for a frame length k */
+            {
+                if (hdr_compare(mp3, mp3 + k))
+                {
+                    int fb = k - hdr_padding(mp3);
+                    int nextfb = fb + hdr_padding(mp3 + k);
+                    if (i + k + nextfb + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + k + nextfb))
+                        continue; /* the header after the next frame does not confirm this k */
+                    frame_and_padding = k;
+                    frame_bytes = fb;
+                    *free_format_bytes = fb;
+                }
+            }
+            if ((frame_bytes && i + frame_and_padding <= mp3_bytes && /* accept when the following frames match ... */
+                mp3d_match_frame(mp3, mp3_bytes - i, frame_bytes)) ||
+                (!i && frame_and_padding == mp3_bytes)) /* ... or when a single frame at offset 0 fills the whole buffer */
+            {
+                *ptr_frame_bytes = frame_and_padding;
+                return i;
+            }
+            *free_format_bytes = 0; /* candidate rejected: discard any free-format length guessed for it */
+        }
+    }
+    *ptr_frame_bytes = 0;
+    return mp3_bytes;
+}
+
+void mp3dec_init(mp3dec_t *dec) /* clears only header[0]; mp3dec_decode_frame reuses the previous header only when header[0] == 0xff */
+{
+    dec->header[0] = 0;
+}
+
+int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info) /* decodes one frame into pcm and fills info; returns the frame's sample count, or 0 when nothing was decoded; with pcm == NULL only info is filled */
+{
+    int i = 0, igr, frame_size = 0, success = 1;
+    const uint8_t *hdr;
+    bs_t bs_frame[1];
+    mp3dec_scratch_t scratch;
+
+    if (mp3_bytes > 4 && dec->header[0] == 0xff && hdr_compare(dec->header, mp3)) /* fast path: the buffer starts with a header matching the previous frame */
+    {
+        frame_size = hdr_frame_bytes(mp3, dec->free_format_bytes) + hdr_padding(mp3);
+        if (frame_size != mp3_bytes && (frame_size + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + frame_size))) /* unless the frame fills the buffer exactly, the next header must confirm it */
+        {
+            frame_size = 0;
+        }
+    }
+    if (!frame_size)
+    {
+        memset(dec, 0, sizeof(mp3dec_t)); /* resync: all decoder state is discarded before searching */
+        i = mp3d_find_frame(mp3, mp3_bytes, &dec->free_format_bytes, &frame_size);
+        if (!frame_size || i + frame_size > mp3_bytes)
+        {
+            info->frame_bytes = i; /* only frame_bytes is set on this path; the other info fields are not written */
+            return 0;
+        }
+    }
+
+    hdr = mp3 + i;
+    memcpy(dec->header, hdr, HDR_SIZE); /* remembered for the fast path of the next call */
+    info->frame_bytes = i + frame_size; /* skipped bytes plus the frame itself */
+    info->frame_offset = i;
+    info->channels = HDR_IS_MONO(hdr) ? 1 : 2;
+    info->hz = hdr_sample_rate_hz(hdr);
+    info->layer = 4 - HDR_GET_LAYER(hdr);
+    info->bitrate_kbps = hdr_bitrate_kbps(hdr);
+
+    if (!pcm)
+    {
+        return hdr_frame_samples(hdr); /* no output buffer: report the sample count without decoding */
+    }
+
+    bs_init(bs_frame, hdr + HDR_SIZE, frame_size - HDR_SIZE);
+    if (HDR_IS_CRC(hdr))
+    {
+        get_bits(bs_frame, 16); /* the 16 CRC bits are read and discarded */
+    }
+
+    if (info->layer == 3)
+    {
+        int main_data_begin = L3_read_side_info(bs_frame, scratch.gr_info, hdr);
+        if (main_data_begin < 0 || bs_frame->pos > bs_frame->limit)
+        {
+            mp3dec_init(dec);
+            return 0;
+        }
+        success = L3_restore_reservoir(dec, bs_frame, &scratch, main_data_begin);
+        if (success)
+        {
+            for (igr = 0; igr < (HDR_TEST_MPEG1(hdr) ? 2 : 1); igr++, pcm += 576*info->channels) /* two granules for MPEG1, otherwise one; 576 samples per channel each */
+            {
+                memset(scratch.grbuf[0], 0, 576*2*sizeof(float)); /* clears 576*2 floats starting at grbuf[0] */
+                L3_decode(dec, &scratch, scratch.gr_info + igr*info->channels, info->channels);
+                mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 18, info->channels, pcm, scratch.syn[0]);
+            }
+        }
+        L3_save_reservoir(dec, &scratch); /* runs even when success == 0 and decoding was skipped */
+    } else
+    {
+#ifdef MINIMP3_ONLY_MP3
+        return 0;
+#else /* MINIMP3_ONLY_MP3 */
+        L12_scale_info sci[1];
+        L12_read_scale_info(hdr, bs_frame, sci);
+
+        memset(scratch.grbuf[0], 0, 576*2*sizeof(float));
+        for (i = 0, igr = 0; igr < 3; igr++)
+        {
+            if (12 == (i += L12_dequantize_granule(scratch.grbuf[0] + i, bs_frame, sci, info->layer | 1))) /* i accumulates the return values; synthesize once it reaches 12 */
+            {
+                i = 0;
+                L12_apply_scf_384(sci, sci->scf + igr, scratch.grbuf[0]);
+                mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 12, info->channels, pcm, scratch.syn[0]);
+                memset(scratch.grbuf[0], 0, 576*2*sizeof(float));
+                pcm += 384*info->channels;
+            }
+            if (bs_frame->pos > bs_frame->limit) /* read past the end of the frame: reset and report failure */
+            {
+                mp3dec_init(dec);
+                return 0;
+            }
+        }
+#endif /* MINIMP3_ONLY_MP3 */
+    }
+    return success*hdr_frame_samples(dec->header); /* 0 when the reservoir was too short, otherwise the frame's sample count */
+}
+
+#ifdef MINIMP3_FLOAT_OUTPUT
+void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples) /* converts num_samples floats to int16: multiply by 32768, round, saturate */
+{
+    int i = 0;
+#if HAVE_SIMD
+    int aligned_count = num_samples & ~7; /* largest multiple of 8 not exceeding num_samples; the tail goes to the scalar loop */
+    for(; i < aligned_count; i += 8) /* NOTE(review): unlike the other SIMD paths in this file, this loop is not guarded by have_simd() - confirm intended */
+    {
+        static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f };
+        f4 a = VMUL(VLD(&in[i  ]), g_scale);
+        f4 b = VMUL(VLD(&in[i+4]), g_scale);
+#if HAVE_SSE
+        static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
+        static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
+        __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)), /* clamp, convert, then pack a into lanes 0..3 and b into lanes 4..7 */
+                                       _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min)));
+        out[i  ] = _mm_extract_epi16(pcm8, 0);
+        out[i+1] = _mm_extract_epi16(pcm8, 1);
+        out[i+2] = _mm_extract_epi16(pcm8, 2);
+        out[i+3] = _mm_extract_epi16(pcm8, 3);
+        out[i+4] = _mm_extract_epi16(pcm8, 4);
+        out[i+5] = _mm_extract_epi16(pcm8, 5);
+        out[i+6] = _mm_extract_epi16(pcm8, 6);
+        out[i+7] = _mm_extract_epi16(pcm8, 7);
+#else /* HAVE_SSE */
+        int16x4_t pcma, pcmb;
+        a = VADD(a, VSET(0.5f));
+        b = VADD(b, VSET(0.5f));
+        pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0))))); /* adds the all-ones compare mask (-1) for negative lanes, then narrows with saturation */
+        pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0)))));
+        vst1_lane_s16(out+i  , pcma, 0);
+        vst1_lane_s16(out+i+1, pcma, 1);
+        vst1_lane_s16(out+i+2, pcma, 2);
+        vst1_lane_s16(out+i+3, pcma, 3);
+        vst1_lane_s16(out+i+4, pcmb, 0);
+        vst1_lane_s16(out+i+5, pcmb, 1);
+        vst1_lane_s16(out+i+6, pcmb, 2);
+        vst1_lane_s16(out+i+7, pcmb, 3);
+#endif /* HAVE_SSE */
+    }
+#endif /* HAVE_SIMD */
+    for(; i < num_samples; i++) /* scalar path; also finishes whatever the SIMD loop left over */
+    {
+        float sample = in[i] * 32768.0f;
+        if (sample >=  32766.5) /* saturate before the narrowing cast */
+            out[i] = (int16_t) 32767;
+        else if (sample <= -32767.5)
+            out[i] = (int16_t)-32768;
+        else
+        {
+            int16_t s = (int16_t)(sample + .5f); /* NOTE(review): the cast truncates toward zero, so inputs in (-1.5, -0.5) yield 0 rather than -1 - confirm this is intended */
+            s -= (s < 0);   /* away from zero, to be compliant */
+            out[i] = s;
+        }
+    }
+}
+#endif /* MINIMP3_FLOAT_OUTPUT */
+#endif /* MINIMP3_IMPLEMENTATION && !_MINIMP3_IMPLEMENTATION_GUARD */
diff --git a/thirdparty/minimp3/minimp3_ex.h b/thirdparty/minimp3/minimp3_ex.h
new file mode 100644
index 0000000000..e29dd15b2e
--- /dev/null
+++ b/thirdparty/minimp3/minimp3_ex.h
@@ -0,0 +1,1394 @@
+#ifndef MINIMP3_EXT_H
+#define MINIMP3_EXT_H
+/*
+    https://github.com/lieff/minimp3
+    To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide.
+    This software is distributed without any warranty.
+    See <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#include "minimp3.h"
+
+/* flags for mp3dec_ex_open_* functions */
+#define MP3D_SEEK_TO_BYTE   0      /* mp3dec_ex_seek seeks to a byte offset in the stream */
+#define MP3D_SEEK_TO_SAMPLE 1      /* mp3dec_ex_seek seeks precisely to a sample using the index (created during the duration calculation scan or when mp3dec_ex_seek is called) */
+#define MP3D_DO_NOT_SCAN    2      /* do not scan the whole stream for duration if no vbrtag is found; mp3dec_ex_t::samples is then filled only if mp3dec_ex_t::vbr_tag_found == 1 */
+#ifdef MINIMP3_ALLOW_MONO_STEREO_TRANSITION
+#define MP3D_ALLOW_MONO_STEREO_TRANSITION  4 /* extra flag bit, defined only in builds that set MINIMP3_ALLOW_MONO_STEREO_TRANSITION */
+#define MP3D_FLAGS_MASK 7          /* all valid flag bits; mp3dec_ex_open_buf rejects flags outside this mask with MP3D_E_PARAM */
+#else
+#define MP3D_FLAGS_MASK 3          /* all valid flag bits; mp3dec_ex_open_buf rejects flags outside this mask with MP3D_E_PARAM */
+#endif
+
+/* compile-time config */
+#define MINIMP3_PREDECODE_FRAMES 2 /* frames to pre-decode and skip after seek (to fill internal structures) */
+/*#define MINIMP3_SEEK_IDX_LINEAR_SEARCH*/ /* define to use linear index search instead of binary search on seek */
+#define MINIMP3_IO_SIZE (128*1024) /* io buffer size for streaming functions, must be greater than MINIMP3_BUF_SIZE */
+#define MINIMP3_BUF_SIZE (16*1024) /* buffer which can hold at least 10 consecutive mp3 frames (~16KB) in the worst case */
+/*#define MINIMP3_SCAN_LIMIT (256*1024)*/ /* how many bytes will be scanned while searching for the first valid mp3 frame, to prevent a stall on large non-mp3 files */
+#define MINIMP3_ENABLE_RING 0      /* WIP: enable the hardware "magic" ring buffer if available, to reduce input buffer memmove(s) in callback IO mode */
+
+/* return error codes */
+#define MP3D_E_PARAM   -1  /* invalid argument: NULL pointer, bad buffer size or unknown flag bits */
+#define MP3D_E_MEMORY  -2  /* malloc/realloc failed */
+#define MP3D_E_IOERROR -3  /* seek callback failed, or the read callback returned an unexpected byte count */
+#define MP3D_E_USER    -4  /* can be used to stop processing from callbacks without indicating specific error */
+#define MP3D_E_DECODE  -5  /* decode error which can't be safely skipped, such as sample rate, layer and channels change */
+
+typedef struct
+{   /* output of mp3dec_load_buf/mp3dec_load_cb: one fully decoded stream */
+    mp3d_sample_t *buffer; /* decoded PCM, allocated by mp3dec_load_cb with malloc/realloc */
+    size_t samples; /* channels included, byte size = samples*sizeof(mp3d_sample_t) */
+    int channels, hz, layer, avg_bitrate_kbps; /* channels/hz/layer are taken from the first frame (channels becomes 0 on a mono-stereo transition when that build option is on); avg_bitrate_kbps is the mean over decoded frames */
+} mp3dec_file_info_t;
+
+typedef struct
+{   /* a read-only byte range: start pointer plus length */
+    const uint8_t *buffer; /* first byte of the range */
+    size_t size; /* length of the range in bytes */
+} mp3dec_map_info_t;
+
+typedef struct
+{   /* one seek-index entry, filled by mp3dec_load_index */
+    uint64_t sample; /* running sample count (channels included) at the start of this frame */
+    uint64_t offset; /* byte offset of the frame in the stream */
+} mp3dec_frame_t;
+
+typedef struct
+{   /* growable array of index entries used for sample-accurate seeking */
+    mp3dec_frame_t *frames; /* realloc'ed storage, NULL until the first frame is indexed */
+    size_t num_frames, capacity; /* entries in use / entries allocated (mp3dec_load_index starts at 4096 and doubles) */
+} mp3dec_index_t;
+
+typedef size_t (*MP3D_READ_CB)(void *buf, size_t size, void *user_data); /* fills buf with up to size bytes and returns the count; callers treat a short count as end of stream and a count above size as an I/O error */
+typedef int (*MP3D_SEEK_CB)(uint64_t position, void *user_data); /* moves the stream to a byte position; any non-zero return is treated as failure */
+
+typedef struct
+{   /* user-supplied stream callbacks for the *_cb functions */
+    MP3D_READ_CB read; /* reads bytes from the current stream position */
+    void *read_data; /* passed back to read as user_data */
+    MP3D_SEEK_CB seek; /* repositions the stream */
+    void *seek_data; /* passed back to seek as user_data */
+} mp3dec_io_t;
+
+typedef struct
+{   /* state of one streaming decoder opened with mp3dec_ex_open_*() */
+    mp3dec_t mp3d; /* low-level frame decoder state, re-initialized by every seek */
+    mp3dec_map_info_t file; /* buffer mode: the caller's mp3 data; callback mode: the buffer that input is read into */
+    mp3dec_io_t *io; /* NULL selects buffer mode, non-NULL selects callback mode */
+    mp3dec_index_t index; /* seek index, one entry per scanned frame */
+    uint64_t offset, samples, detected_samples, cur_sample, start_offset, end_offset; /* byte offsets and sample counts (channels included); detected_samples is the length derived from a VBR tag, start_offset the first frame after that tag */
+    mp3dec_frame_info_t info; /* header info of the first frame seen while scanning */
+    mp3d_sample_t buffer[MINIMP3_MAX_SAMPLES_PER_FRAME]; /* PCM output of a single decoded frame */
+    size_t input_consumed, input_filled; /* consumed/filled byte counts of the input buffer; both reset by seek */
+    int is_file, flags, vbr_tag_found, indexes_built; /* flags holds the MP3D_* open flags; indexes_built is non-zero once the frame index has been built */
+    int free_format_bytes; /* captured from the first frame and reused for frame-size computation while seeking */
+    int buffer_samples, buffer_consumed, to_skip, start_delay; /* to_skip: samples still to discard after a seek; start_delay: leading delay from the VBR tag, channels included */
+    int last_error; /* sticky error code: reads return 0 while it is set, seek clears it */
+} mp3dec_ex_t;
+
+typedef int (*MP3D_ITERATE_CB)(void *user_data, const uint8_t *frame, int frame_size, int free_format_bytes, size_t buf_size, uint64_t offset, mp3dec_frame_info_t *info); /* called once per frame found; buf_size counts the bytes available from frame onwards, offset is the frame's position; a non-zero return stops iteration and is returned to the caller */
+typedef int (*MP3D_PROGRESS_CB)(void *user_data, size_t file_size, uint64_t offset, mp3dec_frame_info_t *info); /* called after each decoded frame during mp3dec_load_*; a non-zero return stops decoding and becomes the load result */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* detect mp3/mpa format: 0 means the data looks like mp3/mpa, MP3D_E_USER means it does not, other MP3D_E_* codes report failures */
+int mp3dec_detect_buf(const uint8_t *buf, size_t buf_size);
+int mp3dec_detect_cb(mp3dec_io_t *io, uint8_t *buf, size_t buf_size); /* with io set, buf is scratch space of at least MINIMP3_BUF_SIZE bytes */
+/* decode whole buffer block: info->buffer receives PCM allocated with malloc/realloc */
+int mp3dec_load_buf(mp3dec_t *dec, const uint8_t *buf, size_t buf_size, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data);
+int mp3dec_load_cb(mp3dec_t *dec, mp3dec_io_t *io, uint8_t *buf, size_t buf_size, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data);
+/* iterate through frames: callback runs once per frame; a non-zero callback result stops the walk and is returned */
+int mp3dec_iterate_buf(const uint8_t *buf, size_t buf_size, MP3D_ITERATE_CB callback, void *user_data);
+int mp3dec_iterate_cb(mp3dec_io_t *io, uint8_t *buf, size_t buf_size, MP3D_ITERATE_CB callback, void *user_data);
+/* streaming decoder with seeking capability */
+int mp3dec_ex_open_buf(mp3dec_ex_t *dec, const uint8_t *buf, size_t buf_size, int flags); /* flags: combination of the MP3D_* open flags */
+int mp3dec_ex_open_cb(mp3dec_ex_t *dec, mp3dec_io_t *io, int flags);
+void mp3dec_ex_close(mp3dec_ex_t *dec);
+int mp3dec_ex_seek(mp3dec_ex_t *dec, uint64_t position); /* position is a byte offset, or a sample position when MP3D_SEEK_TO_SAMPLE was given at open */
+size_t mp3dec_ex_read_frame(mp3dec_ex_t *dec, mp3d_sample_t **buf, mp3dec_frame_info_t *frame_info, size_t max_samples);
+size_t mp3dec_ex_read(mp3dec_ex_t *dec, mp3d_sample_t *buf, size_t samples);
+#ifndef MINIMP3_NO_STDIO
+/* stdio versions of file detect, load, iterate and stream */
+int mp3dec_detect(const char *file_name);
+int mp3dec_load(mp3dec_t *dec, const char *file_name, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data);
+int mp3dec_iterate(const char *file_name, MP3D_ITERATE_CB callback, void *user_data);
+int mp3dec_ex_open(mp3dec_ex_t *dec, const char *file_name, int flags);
+#ifdef _WIN32
+int mp3dec_detect_w(const wchar_t *file_name); /* wide-character file name variants of the four functions above */
+int mp3dec_load_w(mp3dec_t *dec, const wchar_t *file_name, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data);
+int mp3dec_iterate_w(const wchar_t *file_name, MP3D_ITERATE_CB callback, void *user_data);
+int mp3dec_ex_open_w(mp3dec_ex_t *dec, const wchar_t *file_name, int flags);
+#endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*MINIMP3_EXT_H*/
+
+#ifdef MINIMP3_IMPLEMENTATION
+#include <limits.h>
+
+static void mp3dec_skip_id3v1(const uint8_t *buf, size_t *pbuf_size)
+{   /* shrinks *pbuf_size so that trailing ID3v1 ("TAG"), extended ID3v1 ("TAG+") and APE ("APETAGEX") tags are excluded; buf itself is not modified */
+    size_t buf_size = *pbuf_size;
+#ifndef MINIMP3_NOSKIP_ID3V1
+    if (buf_size >= 128 && !memcmp(buf + buf_size - 128, "TAG", 3))
+    {   /* 128-byte ID3v1 block at the very end */
+        buf_size -= 128;
+        if (buf_size >= 227 && !memcmp(buf + buf_size - 227, "TAG+", 4))
+            buf_size -= 227; /* 227-byte extended block placed right before it */
+    }
+#endif
+#ifndef MINIMP3_NOSKIP_APEV2
+    if (buf_size > 32 && !memcmp(buf + buf_size - 32, "APETAGEX", 8))
+    {   /* 32-byte APE footer */
+        buf_size -= 32;
+        const uint8_t *tag = buf + buf_size + 8 + 4; /* skip the 8-byte magic and the next 4 bytes; a little-endian size follows */
+        uint32_t tag_size = ((uint32_t)tag[3] << 24) | ((uint32_t)tag[2] << 16) | ((uint32_t)tag[1] << 8) | tag[0]; /* widen each byte first: tag[3] << 24 on a promoted int overflows for bytes >= 0x80 */
+        if (buf_size >= tag_size)
+            buf_size -= tag_size; /* only trust a size that fits in what is left */
+    }
+#endif
+    *pbuf_size = buf_size;
+}
+
+static size_t mp3dec_skip_id3v2(const uint8_t *buf, size_t buf_size)
+{   /* returns the total byte length of a leading ID3v2 tag (10-byte header, body and optional 10-byte footer), or 0 when no valid header is found */
+#define MINIMP3_ID3_DETECT_SIZE 10
+    size_t tag_bytes = 0;
+#ifndef MINIMP3_NOSKIP_ID3V2
+    int sync_safe = buf_size >= MINIMP3_ID3_DETECT_SIZE && !((buf[6] | buf[7] | buf[8] | buf[9]) & 0x80); /* all four size bytes must have bit 7 clear */
+    if (sync_safe && !memcmp(buf, "ID3", 3) && !(buf[5] & 15))
+    {   /* size is 4 x 7 bits, most significant byte first */
+        tag_bytes = ((buf[6] & 0x7f) << 21) | ((buf[7] & 0x7f) << 14) | ((buf[8] & 0x7f) << 7) | (buf[9] & 0x7f);
+        tag_bytes += (buf[5] & 16) ? 20 : 10; /* header always counts; flag bit 4 adds a footer */
+    }
+#endif
+    return tag_bytes;
+}
+
+static void mp3dec_skip_id3(const uint8_t **pbuf, size_t *pbuf_size)
+{   /* advances *pbuf past a leading ID3v2 tag and shrinks *pbuf_size so that neither it nor any trailing tag is counted */
+    const uint8_t *data = *pbuf;
+    size_t remaining = *pbuf_size;
+    size_t lead = mp3dec_skip_id3v2(data, remaining);
+
+    /* the declared tag length may exceed the buffer: clamp it so the pointer never moves past the end */
+    if (lead > remaining)
+        lead = remaining;
+    data      += lead;
+    remaining -= lead;
+
+    mp3dec_skip_id3v1(data, &remaining); /* now trim tags from the tail */
+    *pbuf = data;
+    *pbuf_size = remaining;
+}
+
+static int mp3dec_check_vbrtag(const uint8_t *frame, int frame_size, uint32_t *frames, int *delay, int *padding)
+{   /* returns 1 if a Xing/Info tag with a frame count was parsed (*frames, *delay and *padding are set), -1 if the tag carries no frame count, 0 if the frame holds no usable tag */
+    static const char g_xing_tag[4] = { 'X', 'i', 'n', 'g' };
+    static const char g_info_tag[4] = { 'I', 'n', 'f', 'o' };
+#define FRAMES_FLAG     1
+#define BYTES_FLAG      2
+#define TOC_FLAG        4
+#define VBR_SCALE_FLAG  8
+    /* Side info offsets after header:
+    /                Mono  Stereo
+    /  MPEG1          17     32
+    /  MPEG2 & 2.5     9     17*/
+    bs_t bs[1];
+    L3_gr_info_t gr_info[4];
+    bs_init(bs, frame + HDR_SIZE, frame_size - HDR_SIZE);
+    if (HDR_IS_CRC(frame))
+        get_bits(bs, 16);
+    if (L3_read_side_info(bs, gr_info, frame) < 0)
+        return 0; /* side info corrupted */
+
+    const uint8_t *tag = frame + HDR_SIZE + bs->pos/8; /* the tag is looked for right after the side info */
+    const uint8_t *end = frame + frame_size;
+    if (end - tag < 8 || (memcmp(g_xing_tag, tag, 4) && memcmp(g_info_tag, tag, 4)))
+        return 0; /* no room for magic + flags inside the frame, or not a Xing/Info tag */
+    int flags = tag[7];
+    if (!((flags & FRAMES_FLAG)))
+        return -1;
+    if (end - tag < 12)
+        return 0; /* frame count would lie outside the frame */
+    tag += 8;
+    *frames = ((uint32_t)tag[0] << 24) | ((uint32_t)tag[1] << 16) | ((uint32_t)tag[2] << 8) | tag[3]; /* widen before shifting: tag[0] << 24 on a promoted int overflows for bytes >= 0x80 */
+    /* step over the frame count and whichever optional fields the flags announce */
+    tag += 4 + ((flags & BYTES_FLAG) ? 4 : 0) + ((flags & TOC_FLAG) ? 100 : 0) + ((flags & VBR_SCALE_FLAG) ? 4 : 0);
+    *delay = *padding = 0;
+    if (tag >= end)
+        return 0; /* optional fields run past the frame: handled like the truncated extension below instead of reading outside the frame */
+    if (*tag)
+    {   /* extension, LAME, Lavc, etc. Should be the same structure. */
+        tag += 21;
+        if (tag - frame + 14 >= frame_size)
+            return 0;
+        *delay   = ((tag[0] << 4) | (tag[1] >> 4)) + (528 + 1);
+        *padding = (((tag[1] & 0xF) << 8) | tag[2]) - (528 + 1);
+    }
+    return 1;
+}
+
+int mp3dec_detect_buf(const uint8_t *buf, size_t buf_size)
+{   /* memory-buffer front end: runs mp3dec_detect_cb on buf with no I/O callbacks */
+    return mp3dec_detect_cb(NULL, (uint8_t *)buf, buf_size);
+}
+
+int mp3dec_detect_cb(mp3dec_io_t *io, uint8_t *buf, size_t buf_size)
+{   /* returns 0 when the input looks like mp3/mpa, MP3D_E_USER when it does not, MP3D_E_PARAM or MP3D_E_IOERROR on failure; with io set, buf is scratch space */
+    if (!buf || (size_t)-1 == buf_size || (io && buf_size < MINIMP3_BUF_SIZE))
+        return MP3D_E_PARAM;
+    size_t filled = buf_size; /* buffer mode: all of buf is data; callback mode overwrites this below */
+    if (io)
+    {   /* read just enough to recognize an ID3v2 header */
+        if (io->seek(0, io->seek_data))
+            return MP3D_E_IOERROR;
+        filled = io->read(buf, MINIMP3_ID3_DETECT_SIZE, io->read_data);
+        if (filled > MINIMP3_ID3_DETECT_SIZE)
+            return MP3D_E_IOERROR;
+    }
+    if (filled < MINIMP3_ID3_DETECT_SIZE)
+        return MP3D_E_USER; /* too small, can't be mp3/mpa */
+    if (mp3dec_skip_id3v2(buf, filled))
+        return 0; /* id3v2 tag is enough evidence */
+    if (io)
+    {   /* no ID3v2: pull in the rest of the buffer and look for frames */
+        size_t readed = io->read(buf + MINIMP3_ID3_DETECT_SIZE, buf_size - MINIMP3_ID3_DETECT_SIZE, io->read_data);
+        if (readed > (buf_size - MINIMP3_ID3_DETECT_SIZE))
+            return MP3D_E_IOERROR;
+        filled += readed;
+        if (filled < MINIMP3_BUF_SIZE)
+            mp3dec_skip_id3v1(buf, &filled);
+    } else
+    {
+        mp3dec_skip_id3v1(buf, &filled);
+        if (filled > MINIMP3_BUF_SIZE)
+            filled = MINIMP3_BUF_SIZE; /* only the first MINIMP3_BUF_SIZE bytes are examined */
+    }
+    int free_format_bytes = 0, frame_size = 0; /* zeroed like every other mp3d_find_frame call in this file, so the helper never receives indeterminate values */
+    mp3d_find_frame(buf, filled, &free_format_bytes, &frame_size);
+    if (frame_size)
+        return 0; /* MAX_FRAME_SYNC_MATCHES consecutive frames found */
+    return MP3D_E_USER;
+}
+
+int mp3dec_load_buf(mp3dec_t *dec, const uint8_t *buf, size_t buf_size, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{   /* memory-buffer front end: decodes buf through mp3dec_load_cb with no I/O callbacks */
+    return mp3dec_load_cb(dec, NULL, (uint8_t *)buf, buf_size, info, progress_cb, user_data);
+}
+
+int mp3dec_load_cb(mp3dec_t *dec, mp3dec_io_t *io, uint8_t *buf, size_t buf_size, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{   /* decodes a whole stream into info->buffer and returns 0, an MP3D_E_* code, or the progress callback's non-zero result; with io == NULL buf holds the mp3 data, otherwise buf is scratch space refilled through io */
+    if (!dec || !buf || !info || (size_t)-1 == buf_size || (io && buf_size < MINIMP3_BUF_SIZE))
+        return MP3D_E_PARAM;
+    uint64_t detected_samples = 0; /* stream length derived from a VBR tag, 0 when unknown */
+    size_t orig_buf_size = buf_size;
+    mp3dec_frame_info_t frame_info;
+    memset(info, 0, sizeof(*info));
+    memset(&frame_info, 0, sizeof(frame_info));
+    size_t filled = 0, consumed = 0; /* callback mode only: valid bytes in buf, and how many of them are already used */
+    int eof = 0, ret = 0, to_skip = 0, samples;
+    if (io)
+    {   /* skip id3 */
+        if (io->seek(0, io->seek_data))
+            return MP3D_E_IOERROR;
+        filled = io->read(buf, MINIMP3_ID3_DETECT_SIZE, io->read_data);
+        if (filled > MINIMP3_ID3_DETECT_SIZE)
+            return MP3D_E_IOERROR;
+        if (MINIMP3_ID3_DETECT_SIZE != filled)
+            return 0;
+        size_t id3v2size = mp3dec_skip_id3v2(buf, filled);
+        if (id3v2size)
+        {   /* jump over the tag and start reading after it */
+            if (io->seek(id3v2size, io->seek_data))
+                return MP3D_E_IOERROR;
+            filled = io->read(buf, buf_size, io->read_data);
+            if (filled > buf_size)
+                return MP3D_E_IOERROR;
+        } else
+        {   /* no tag: the bytes already read are stream data, append the rest */
+            size_t readed = io->read(buf + MINIMP3_ID3_DETECT_SIZE, buf_size - MINIMP3_ID3_DETECT_SIZE, io->read_data);
+            if (readed > (buf_size - MINIMP3_ID3_DETECT_SIZE))
+                return MP3D_E_IOERROR;
+            filled += readed;
+        }
+        if (filled < MINIMP3_BUF_SIZE)
+            mp3dec_skip_id3v1(buf, &filled);
+    } else
+    {
+        mp3dec_skip_id3((const uint8_t **)&buf, &buf_size);
+        if (!buf_size)
+            return 0;
+    }
+    /* try to make allocation size assumption by first frame or vbr tag */
+    mp3dec_init(dec);
+    do
+    {
+        uint32_t frames;
+        int i, delay, padding, free_format_bytes = 0, frame_size = 0;
+        const uint8_t *hdr;
+        if (io)
+        {
+            if (!eof && filled - consumed < MINIMP3_BUF_SIZE)
+            {   /* keep minimum 10 consecutive mp3 frames (~16KB) worst case */
+                memmove(buf, buf + consumed, filled - consumed);
+                filled -= consumed;
+                consumed = 0;
+                size_t readed = io->read(buf + filled, buf_size - filled, io->read_data);
+                if (readed > (buf_size - filled))
+                    return MP3D_E_IOERROR;
+                if (readed != (buf_size - filled))
+                    eof = 1;
+                filled += readed;
+                if (eof)
+                    mp3dec_skip_id3v1(buf, &filled);
+            }
+            i = mp3d_find_frame(buf + consumed, filled - consumed, &free_format_bytes, &frame_size);
+            consumed += i;
+            hdr = buf + consumed;
+        } else
+        {
+            i = mp3d_find_frame(buf, buf_size, &free_format_bytes, &frame_size);
+            buf      += i;
+            buf_size -= i;
+            hdr = buf;
+        }
+        if (i && !frame_size)
+            continue; /* bytes were skipped without finding a frame: search again */
+        if (!frame_size)
+            return 0; /* no frame at all: nothing to decode */
+        frame_info.channels = HDR_IS_MONO(hdr) ? 1 : 2;
+        frame_info.hz = hdr_sample_rate_hz(hdr);
+        frame_info.layer = 4 - HDR_GET_LAYER(hdr);
+        frame_info.bitrate_kbps = hdr_bitrate_kbps(hdr);
+        frame_info.frame_bytes = frame_size;
+        samples = hdr_frame_samples(hdr)*frame_info.channels;
+        if (3 != frame_info.layer)
+            break; /* the VBR tag is only looked for in layer 3 frames */
+        int vbr = mp3dec_check_vbrtag(hdr, frame_size, &frames, &delay, &padding); /* own name: must not shadow the function-level ret */
+        if (vbr > 0)
+        {   /* tag gives the frame count: derive the exact output length, minus delay and padding */
+            padding *= frame_info.channels;
+            to_skip = delay*frame_info.channels;
+            detected_samples = samples*(uint64_t)frames;
+            if (detected_samples >= (uint64_t)to_skip)
+                detected_samples -= to_skip;
+            if (padding > 0 && detected_samples >= (uint64_t)padding)
+                detected_samples -= padding;
+            if (!detected_samples)
+                return 0;
+        }
+        if (vbr)
+        {   /* the tag frame carries no audio: step over it */
+            if (io)
+            {
+                consumed += frame_size;
+            } else
+            {
+                buf      += frame_size;
+                buf_size -= frame_size;
+            }
+        }
+        break;
+    } while(1);
+    size_t allocated = MINIMP3_MAX_SAMPLES_PER_FRAME*sizeof(mp3d_sample_t);
+    if (detected_samples)
+        allocated += detected_samples*sizeof(mp3d_sample_t);
+    else
+        allocated += (buf_size/frame_info.frame_bytes)*samples*sizeof(mp3d_sample_t);
+    info->buffer = (mp3d_sample_t*)malloc(allocated);
+    if (!info->buffer)
+        return MP3D_E_MEMORY;
+    info->channels = frame_info.channels; /* save info */
+    info->hz       = frame_info.hz;
+    info->layer    = frame_info.layer;
+    /* decode all frames */
+    size_t avg_bitrate_kbps = 0, frames = 0;
+    do
+    {
+        if ((allocated - info->samples*sizeof(mp3d_sample_t)) < MINIMP3_MAX_SAMPLES_PER_FRAME*sizeof(mp3d_sample_t))
+        {   /* always keep room for one more full frame of output */
+            allocated *= 2;
+            mp3d_sample_t *alloc_buf = (mp3d_sample_t*)realloc(info->buffer, allocated);
+            if (!alloc_buf)
+                return MP3D_E_MEMORY;
+            info->buffer = alloc_buf;
+        }
+        if (io)
+        {
+            if (!eof && filled - consumed < MINIMP3_BUF_SIZE)
+            {   /* keep minimum 10 consecutive mp3 frames (~16KB) worst case */
+                memmove(buf, buf + consumed, filled - consumed);
+                filled -= consumed;
+                consumed = 0;
+                size_t readed = io->read(buf + filled, buf_size - filled, io->read_data);
+                if (readed > (buf_size - filled))
+                {   /* same sanity check as the refill above: never let filled grow past buf_size */
+                    ret = MP3D_E_IOERROR;
+                    break;
+                }
+                if (readed != (buf_size - filled))
+                    eof = 1;
+                filled += readed;
+                if (eof)
+                    mp3dec_skip_id3v1(buf, &filled);
+            }
+            samples = mp3dec_decode_frame(dec, buf + consumed, filled - consumed, info->buffer + info->samples, &frame_info);
+            consumed += frame_info.frame_bytes;
+        } else
+        {
+            samples = mp3dec_decode_frame(dec, buf, MINIMP3_MIN(buf_size, (size_t)INT_MAX), info->buffer + info->samples, &frame_info);
+            buf      += frame_info.frame_bytes;
+            buf_size -= frame_info.frame_bytes;
+        }
+        if (samples)
+        {
+            if (info->hz != frame_info.hz || info->layer != frame_info.layer)
+            {
+                ret = MP3D_E_DECODE;
+                break;
+            }
+            if (info->channels && info->channels != frame_info.channels)
+            {
+#ifdef MINIMP3_ALLOW_MONO_STEREO_TRANSITION
+                info->channels = 0; /* mark file with mono-stereo transition */
+#else
+                ret = MP3D_E_DECODE;
+                break;
+#endif
+            }
+            samples *= frame_info.channels;
+            if (to_skip)
+            {   /* drop the leading delay: move what remains of this frame to the buffer start */
+                size_t skip = MINIMP3_MIN(samples, to_skip);
+                to_skip -= skip;
+                samples -= skip;
+                memmove(info->buffer, info->buffer + skip, samples*sizeof(mp3d_sample_t));
+            }
+            info->samples += samples;
+            avg_bitrate_kbps += frame_info.bitrate_kbps;
+            frames++;
+            if (progress_cb)
+            {
+                ret = progress_cb(user_data, orig_buf_size, orig_buf_size - buf_size, &frame_info);
+                if (ret)
+                    break;
+            }
+        }
+    } while (frame_info.frame_bytes);
+    if (detected_samples && info->samples > detected_samples)
+        info->samples = detected_samples; /* cut padding */
+    /* reallocate to normal buffer size */
+    if (allocated != info->samples*sizeof(mp3d_sample_t))
+    {
+        mp3d_sample_t *alloc_buf = (mp3d_sample_t*)realloc(info->buffer, info->samples*sizeof(mp3d_sample_t));
+        if (!alloc_buf && info->samples)
+            return MP3D_E_MEMORY;
+        info->buffer = alloc_buf;
+    }
+    if (frames)
+        info->avg_bitrate_kbps = avg_bitrate_kbps/frames;
+    return ret;
+}
+
+int mp3dec_iterate_buf(const uint8_t *buf, size_t buf_size, MP3D_ITERATE_CB callback, void *user_data)
+{   /* walks every frame of an in-memory stream and reports each one to callback; returns 0, MP3D_E_PARAM, or the first non-zero callback result */
+    const uint8_t *const base = buf; /* kept so frame offsets are reported relative to the caller's buffer */
+    mp3dec_frame_info_t frame_info;
+    if (!buf || !callback || (size_t)-1 == buf_size)
+        return MP3D_E_PARAM;
+    mp3dec_skip_id3(&buf, &buf_size); /* exclude leading and trailing tags from the walk */
+    if (0 == buf_size)
+        return 0;
+    memset(&frame_info, 0, sizeof(frame_info));
+    for (;;)
+    {
+        int free_format_bytes = 0, frame_size = 0;
+        int skipped = mp3d_find_frame(buf, buf_size, &free_format_bytes, &frame_size);
+        buf      += skipped;
+        buf_size -= skipped;
+        if (!frame_size)
+        {
+            if (skipped)
+                continue; /* bytes were skipped but no frame found yet: search again from here */
+            break; /* nothing skipped and no frame: the walk is over */
+        }
+        /* describe the frame from its header alone */
+        const uint8_t *hdr = buf;
+        frame_info.frame_bytes = frame_size;
+        frame_info.bitrate_kbps = hdr_bitrate_kbps(hdr);
+        frame_info.layer = 4 - HDR_GET_LAYER(hdr);
+        frame_info.hz = hdr_sample_rate_hz(hdr);
+        frame_info.channels = HDR_IS_MONO(hdr) ? 1 : 2;
+
+        int status = callback(user_data, hdr, frame_size, free_format_bytes, buf_size, hdr - base, &frame_info);
+        if (status)
+            return status; /* a non-zero callback result stops the walk and is passed through */
+        buf      += frame_size;
+        buf_size -= frame_size;
+    }
+    return 0;
+}
+
+int mp3dec_iterate_cb(mp3dec_io_t *io, uint8_t *buf, size_t buf_size, MP3D_ITERATE_CB callback, void *user_data)
+{   /* callback-I/O variant of mp3dec_iterate_buf: reads from the stream's current position into buf (scratch space of at least MINIMP3_BUF_SIZE bytes) and reports each frame found; returns 0, an MP3D_E_* code, or the first non-zero callback result */
+    if (!io || !buf || (size_t)-1 == buf_size || buf_size < MINIMP3_BUF_SIZE || !callback)
+        return MP3D_E_PARAM;
+    size_t filled = io->read(buf, MINIMP3_ID3_DETECT_SIZE, io->read_data), consumed = 0; /* filled: valid bytes in buf; consumed: bytes of them already processed */
+    uint64_t readed = 0; /* offset passed to the callback; counted from where this function started reading */
+    mp3dec_frame_info_t frame_info;
+    int eof = 0;
+    memset(&frame_info, 0, sizeof(frame_info));
+    if (filled > MINIMP3_ID3_DETECT_SIZE)
+        return MP3D_E_IOERROR;
+    if (MINIMP3_ID3_DETECT_SIZE != filled)
+        return 0; /* stream shorter than an ID3v2 header: nothing to iterate */
+    size_t id3v2size = mp3dec_skip_id3v2(buf, filled);
+    if (id3v2size)
+    {   /* jump over the tag and refill the buffer from the first byte after it */
+        if (io->seek(id3v2size, io->seek_data))
+            return MP3D_E_IOERROR;
+        filled = io->read(buf, buf_size, io->read_data);
+        if (filled > buf_size)
+            return MP3D_E_IOERROR;
+        readed += id3v2size;
+    } else
+    {   /* no tag: keep the bytes already read and append the rest of the buffer */
+        size_t readed = io->read(buf + MINIMP3_ID3_DETECT_SIZE, buf_size - MINIMP3_ID3_DETECT_SIZE, io->read_data); /* note: shadows the outer offset counter of the same name */
+        if (readed > (buf_size - MINIMP3_ID3_DETECT_SIZE))
+            return MP3D_E_IOERROR;
+        filled += readed;
+    }
+    if (filled < MINIMP3_BUF_SIZE)
+        mp3dec_skip_id3v1(buf, &filled); /* fewer than MINIMP3_BUF_SIZE bytes were obtained: trim trailing tags from them */
+    do
+    {
+        int free_format_bytes = 0, frame_size = 0, ret;
+        int i = mp3d_find_frame(buf + consumed, filled - consumed, &free_format_bytes, &frame_size);
+        if (i && !frame_size)
+        {   /* NOTE(review): this path neither advances the outer `readed` offset nor refills buf before retrying - confirm that is intended */
+            consumed += i;
+            continue;
+        }
+        if (!frame_size)
+            break;
+        const uint8_t *hdr = buf + consumed + i;
+        frame_info.channels = HDR_IS_MONO(hdr) ? 1 : 2;
+        frame_info.hz = hdr_sample_rate_hz(hdr);
+        frame_info.layer = 4 - HDR_GET_LAYER(hdr);
+        frame_info.bitrate_kbps = hdr_bitrate_kbps(hdr);
+        frame_info.frame_bytes = frame_size;
+
+        readed += i; /* account for bytes skipped before the frame */
+        if (callback)
+        {
+            if ((ret = callback(user_data, hdr, frame_size, free_format_bytes, filled - consumed, readed, &frame_info)))
+                return ret; /* a non-zero callback result stops the walk and is passed through */
+        }
+        readed += frame_size;
+        consumed += i + frame_size;
+        if (!eof && filled - consumed < MINIMP3_BUF_SIZE)
+        {   /* keep minimum 10 consecutive mp3 frames (~16KB) worst case */
+            memmove(buf, buf + consumed, filled - consumed);
+            filled -= consumed;
+            consumed = 0;
+            size_t readed = io->read(buf + filled, buf_size - filled, io->read_data); /* note: shadows the outer offset counter of the same name */
+            if (readed > (buf_size - filled))
+                return MP3D_E_IOERROR;
+            if (readed != (buf_size - filled))
+                eof = 1; /* a short read is taken as end of stream */
+            filled += readed;
+            if (eof)
+                mp3dec_skip_id3v1(buf, &filled);
+        }
+    } while (1);
+    return 0;
+}
+
+static int mp3dec_load_index(void *user_data, const uint8_t *frame, int frame_size, int free_format_bytes, size_t buf_size, uint64_t offset, mp3dec_frame_info_t *info)
+{   /* MP3D_ITERATE_CB used while opening/seeking: records stream geometry on the first frame, then appends one seek-index entry per frame; user_data is the mp3dec_ex_t being set up; returns 0 to continue, MP3D_E_USER to stop the scan, MP3D_E_MEMORY on allocation failure */
+    mp3dec_frame_t *idx_frame;
+    mp3dec_ex_t *dec = (mp3dec_ex_t *)user_data;
+    if (!dec->index.frames && !dec->start_offset)
+    {   /* detect VBR tag and try to avoid full scan */
+        uint32_t frames;
+        int delay, padding;
+        dec->info = *info;
+        dec->start_offset = dec->offset = offset;
+        dec->end_offset   = offset + buf_size;
+        dec->free_format_bytes = free_format_bytes; /* should not change */
+        if (3 == dec->info.layer)
+        {
+            int ret = mp3dec_check_vbrtag(frame, frame_size, &frames, &delay, &padding);
+            if (ret)
+                dec->start_offset = dec->offset = offset + frame_size; /* audio starts after the tag frame */
+            if (ret > 0)
+            {   /* frame count known: compute the stream length without scanning */
+                padding *= info->channels;
+                dec->start_delay = dec->to_skip = delay*info->channels;
+                dec->samples = hdr_frame_samples(frame)*info->channels*(uint64_t)frames;
+                if (dec->samples >= (uint64_t)dec->start_delay)
+                    dec->samples -= dec->start_delay;
+                if (padding > 0 && dec->samples >= (uint64_t)padding)
+                    dec->samples -= padding;
+                dec->detected_samples = dec->samples;
+                dec->vbr_tag_found = 1;
+                return MP3D_E_USER;
+            } else if (ret < 0)
+                return 0; /* tag without a frame count: skip this frame and keep scanning */
+        }
+    }
+    if (dec->flags & MP3D_DO_NOT_SCAN)
+        return MP3D_E_USER;
+    if (dec->index.num_frames + 1 > dec->index.capacity)
+    {   /* grow geometrically, starting at 4096 entries */
+        size_t new_capacity = dec->index.capacity ? dec->index.capacity*2 : 4096;
+        mp3dec_frame_t *alloc_buf = (mp3dec_frame_t *)realloc((void*)dec->index.frames, sizeof(mp3dec_frame_t)*new_capacity);
+        if (!alloc_buf)
+            return MP3D_E_MEMORY; /* frames and capacity are untouched, so the index still matches its allocation */
+        dec->index.frames = alloc_buf;
+        /* commit the new capacity only once realloc has succeeded; raising it first would
+           leave capacity larger than the real allocation after a failure */
+        dec->index.capacity = new_capacity;
+    }
+    idx_frame = &dec->index.frames[dec->index.num_frames++];
+    idx_frame->offset = offset;
+    idx_frame->sample = dec->samples;
+    if (!dec->buffer_samples && dec->index.num_frames < 256)
+    {   /* for some cutted mp3 frames, bit-reservoir not filled and decoding can't be started from first frames */
+        /* try to decode up to 255 first frames till samples starts to decode */
+        dec->buffer_samples = mp3dec_decode_frame(&dec->mp3d, frame, MINIMP3_MIN(buf_size, (size_t)INT_MAX), dec->buffer, info);
+        dec->samples += dec->buffer_samples*info->channels;
+    } else
+        dec->samples += hdr_frame_samples(frame)*info->channels;
+    return 0;
+}
+
+int mp3dec_ex_open_buf(mp3dec_ex_t *dec, const uint8_t *buf, size_t buf_size, int flags)
+{   /* opens a streaming decoder over an in-memory mp3 buffer; returns 0, MP3D_E_PARAM, or an error reported by the initial frame scan */
+    if (!dec || !buf || (size_t)-1 == buf_size || (flags & (~MP3D_FLAGS_MASK)))
+        return MP3D_E_PARAM;
+    memset(dec, 0, sizeof(*dec));
+    dec->flags       = flags;
+    dec->file.size   = buf_size;
+    dec->file.buffer = buf;
+    mp3dec_init(&dec->mp3d);
+    int scan_result = mp3dec_iterate_buf(buf, buf_size, mp3dec_load_index, dec);
+    if (0 != scan_result && MP3D_E_USER != scan_result)
+        return scan_result; /* MP3D_E_USER only means the scan stopped early on purpose */
+    mp3dec_init(&dec->mp3d); /* discard decoder state left behind by the scan */
+    dec->buffer_samples = 0;
+    dec->indexes_built = !dec->vbr_tag_found && !(flags & MP3D_DO_NOT_SCAN); /* an index exists only after a full scan */
+    dec->flags &= (~MP3D_DO_NOT_SCAN);
+    return 0;
+}
+
+#ifndef MINIMP3_SEEK_IDX_LINEAR_SEARCH
+static size_t mp3dec_idx_binary_search(mp3dec_index_t *idx, uint64_t position)
+{   /* returns the index of the last frame that starts at or before position;
+       returns 0 when the index is empty or every indexed frame starts later */
+    size_t lo = 0, hi = idx->num_frames;
+    /* Search the half-open range [lo, hi). A closed range needs "end = mid - 1",
+       which wraps to SIZE_MAX when mid is 0 and then indexes far outside frames[];
+       it would also read frames[0] of an empty index. */
+    while (lo < hi)
+    {
+        size_t mid = lo + (hi - lo)/2;
+        if (idx->frames[mid].sample == position)
+            return mid; /* exact hit */
+        if (idx->frames[mid].sample < position)
+            lo = mid + 1; /* frames[mid] starts before the target: the answer is mid or to its right */
+        else
+            hi = mid;     /* frames[mid] starts after the target: the answer is strictly to its left */
+    }
+    /* lo is now the number of frames that start before position,
+       so the wanted frame is the one just below it (or frame 0 if there is none) */
+    return lo ? lo - 1 : 0;
+}
+#endif
+
+int mp3dec_ex_seek(mp3dec_ex_t *dec, uint64_t position)
+{   /* repositions the streaming decoder: position is a byte offset, or a sample position when MP3D_SEEK_TO_SAMPLE is set in dec->flags; returns 0, MP3D_E_PARAM, MP3D_E_IOERROR, or an error from rebuilding the index */
+    size_t i;
+    if (!dec)
+        return MP3D_E_PARAM;
+    if (!(dec->flags & MP3D_SEEK_TO_SAMPLE))
+    {   /* byte seek: no index involved */
+        if (dec->io)
+        {
+            dec->offset = position;
+        } else
+        {
+            dec->offset = MINIMP3_MIN(position, dec->file.size); /* clamp to the end of the buffer */
+        }
+        dec->cur_sample = 0;
+        goto do_exit;
+    }
+    dec->cur_sample = position;
+    position += dec->start_delay; /* shift by the leading delay recorded when the stream was opened */
+    if (0 == position)
+    {   /* optimize seek to zero, no index needed */
+seek_zero:
+        dec->offset  = dec->start_offset;
+        dec->to_skip = 0;
+        goto do_exit;
+    }
+    if (!dec->indexes_built)
+    {   /* no index created yet (vbr tag used to calculate track length or MP3D_DO_NOT_SCAN open flag used) */
+        dec->indexes_built = 1;
+        dec->samples = 0; /* mp3dec_load_index uses samples as its running counter */
+        dec->buffer_samples = 0;
+        if (dec->io)
+        {
+            if (dec->io->seek(dec->start_offset, dec->io->seek_data))
+                return MP3D_E_IOERROR;
+            int ret = mp3dec_iterate_cb(dec->io, (uint8_t *)dec->file.buffer, dec->file.size, mp3dec_load_index, dec);
+            if (ret && MP3D_E_USER != ret)
+                return ret;
+        } else
+        {
+            int ret = mp3dec_iterate_buf(dec->file.buffer + dec->start_offset, dec->file.size - dec->start_offset, mp3dec_load_index, dec);
+            if (ret && MP3D_E_USER != ret)
+                return ret;
+        }
+        for (i = 0; i < dec->index.num_frames; i++)
+            dec->index.frames[i].offset += dec->start_offset; /* the scan started at start_offset: make the offsets absolute */
+        dec->samples = dec->detected_samples; /* restore the length that came from the VBR tag */
+    }
+    if (!dec->index.frames)
+        goto seek_zero; /* no frames in file - seek to zero */
+#ifdef MINIMP3_SEEK_IDX_LINEAR_SEARCH
+    for (i = 0; i < dec->index.num_frames; i++)
+    {
+        if (dec->index.frames[i].sample >= position)
+            break;
+    }
+#else
+    i = mp3dec_idx_binary_search(&dec->index, position);
+#endif
+    if (i)
+    {
+        int to_fill_bytes = 511; /* bytes of earlier frame data still wanted before decoding restarts */
+        int skip_frames = MINIMP3_PREDECODE_FRAMES
+#ifdef MINIMP3_SEEK_IDX_LINEAR_SEARCH
+         + ((dec->index.frames[i].sample == position) ? 0 : 1)
+#endif
+        ;
+        i -= MINIMP3_MIN(i, (size_t)skip_frames); /* step back, but never before frame 0 */
+        if (3 == dec->info.layer)
+        {
+            while (i && to_fill_bytes)
+            {   /* make sure bit-reservoir is filled when we start decoding */
+                bs_t bs[1];
+                L3_gr_info_t gr_info[4];
+                int frame_bytes, frame_size;
+                const uint8_t *hdr;
+                if (dec->io)
+                {   /* callback mode: load the previous frame into the input buffer */
+                    hdr = dec->file.buffer;
+                    if (dec->io->seek(dec->index.frames[i - 1].offset, dec->io->seek_data))
+                        return MP3D_E_IOERROR;
+                    size_t readed = dec->io->read((uint8_t *)hdr, HDR_SIZE, dec->io->read_data);
+                    if (readed != HDR_SIZE)
+                        return MP3D_E_IOERROR;
+                    frame_size = hdr_frame_bytes(hdr, dec->free_format_bytes) + hdr_padding(hdr);
+                    readed = dec->io->read((uint8_t *)hdr + HDR_SIZE, frame_size - HDR_SIZE, dec->io->read_data);
+                    if (readed != (size_t)(frame_size - HDR_SIZE))
+                        return MP3D_E_IOERROR;
+                    bs_init(bs, hdr + HDR_SIZE, frame_size - HDR_SIZE);
+                } else
+                {   /* buffer mode: the previous frame is addressed in place */
+                    hdr = dec->file.buffer + dec->index.frames[i - 1].offset;
+                    frame_size = hdr_frame_bytes(hdr, dec->free_format_bytes) + hdr_padding(hdr);
+                    bs_init(bs, hdr + HDR_SIZE, frame_size - HDR_SIZE);
+                }
+                if (HDR_IS_CRC(hdr))
+                    get_bits(bs, 16);
+                i--;
+                if (L3_read_side_info(bs, gr_info, hdr) < 0)
+                    break; /* frame not decodable, we can start from here */
+                frame_bytes = (bs->limit - bs->pos)/8; /* bytes left in this frame after the side info */
+                to_fill_bytes -= MINIMP3_MIN(to_fill_bytes, frame_bytes);
+            }
+        }
+    }
+    dec->offset = dec->index.frames[i].offset;
+    dec->to_skip = position - dec->index.frames[i].sample; /* samples to drop once decoding restarts at frame i */
+    while ((i + 1) < dec->index.num_frames && !dec->index.frames[i].sample && !dec->index.frames[i + 1].sample)
+    {   /* skip not decodable first frames */
+        const uint8_t *hdr;
+        if (dec->io)
+        {
+            hdr = dec->file.buffer;
+            if (dec->io->seek(dec->index.frames[i].offset, dec->io->seek_data))
+                return MP3D_E_IOERROR;
+            size_t readed = dec->io->read((uint8_t *)hdr, HDR_SIZE, dec->io->read_data);
+            if (readed != HDR_SIZE)
+                return MP3D_E_IOERROR;
+        } else
+            hdr = dec->file.buffer + dec->index.frames[i].offset;
+        dec->to_skip += hdr_frame_samples(hdr)*dec->info.channels;
+        i++;
+    }
+do_exit:
+    if (dec->io)
+    {   /* move the underlying stream to the chosen offset */
+        if (dec->io->seek(dec->offset, dec->io->seek_data))
+            return MP3D_E_IOERROR;
+    }
+    dec->buffer_samples  = 0; /* discard buffered output and input, clear the sticky error, reset the frame decoder */
+    dec->buffer_consumed = 0;
+    dec->input_consumed  = 0;
+    dec->input_filled    = 0;
+    dec->last_error      = 0;
+    mp3dec_init(&dec->mp3d);
+    return 0;
+}
+
+size_t mp3dec_ex_read_frame(mp3dec_ex_t *dec, mp3d_sample_t **buf, mp3dec_frame_info_t *frame_info, size_t max_samples)
+{   /* Decodes frames until samples are pending in dec->buffer, points *buf at them and returns how many (at most max_samples) are handed out; 0 means bad arguments, end of data or an error recorded in dec->last_error. */
+    if (!dec || !buf || !frame_info)
+    {
+        if (dec)
+            dec->last_error = MP3D_E_PARAM;
+        return 0;
+    }
+    if (dec->detected_samples && dec->cur_sample >= dec->detected_samples)
+        return 0; /* at end of stream */
+    if (dec->last_error)
+        return 0; /* error eof state, seek can reset it */
+    *buf = NULL; /* stays NULL on every "return 0" below */
+    uint64_t end_offset = dec->end_offset ? dec->end_offset : dec->file.size; /* only used by the in-memory branch */
+    int eof = 0; /* set once the read callback delivers fewer bytes than requested */
+    while (dec->buffer_consumed == dec->buffer_samples)
+    {   /* nothing pending in dec->buffer: decode the next frame */
+        const uint8_t *dec_buf;
+        if (dec->io)
+        {   /* callback I/O: dec->file.buffer is used as a staging buffer of dec->file.size bytes */
+            if (!eof && (dec->input_filled - dec->input_consumed) < MINIMP3_BUF_SIZE)
+            {   /* keep at least 10 consecutive mp3 frames (~16KB in the worst case) buffered */
+                memmove((uint8_t*)dec->file.buffer, (uint8_t*)dec->file.buffer + dec->input_consumed, dec->input_filled - dec->input_consumed);
+                dec->input_filled -= dec->input_consumed;
+                dec->input_consumed = 0;
+                size_t readed = dec->io->read((uint8_t*)dec->file.buffer + dec->input_filled, dec->file.size - dec->input_filled, dec->io->read_data);
+                if (readed > (dec->file.size - dec->input_filled))
+                {   /* callback claims more bytes than were requested: record an I/O error and ignore the data */
+                    dec->last_error = MP3D_E_IOERROR;
+                    readed = 0;
+                }
+                if (readed != (dec->file.size - dec->input_filled))
+                    eof = 1; /* short read: stop refilling from now on */
+                dec->input_filled += readed;
+                if (eof)
+                    mp3dec_skip_id3v1((uint8_t*)dec->file.buffer, &dec->input_filled); /* presumably trims a trailing ID3v1 tag from the byte count - confirm against the helper */
+            }
+            dec_buf = dec->file.buffer + dec->input_consumed;
+            if (!(dec->input_filled - dec->input_consumed))
+                return 0; /* no input bytes left */
+            dec->buffer_samples = mp3dec_decode_frame(&dec->mp3d, dec_buf, dec->input_filled - dec->input_consumed, dec->buffer, frame_info);
+            dec->input_consumed += frame_info->frame_bytes;
+        } else
+        {   /* no callbacks: decode straight from dec->file.buffer at dec->offset */
+            dec_buf = dec->file.buffer + dec->offset;
+            uint64_t buf_size = end_offset - dec->offset;
+            if (!buf_size)
+                return 0; /* no input bytes left */
+            dec->buffer_samples = mp3dec_decode_frame(&dec->mp3d, dec_buf, MINIMP3_MIN(buf_size, (uint64_t)INT_MAX), dec->buffer, frame_info);
+        }
+        dec->buffer_consumed = 0;
+        if (dec->info.hz != frame_info->hz || dec->info.layer != frame_info->layer)
+        {   /* sample rate or layer differs from dec->info: report a decode error */
+return_e_decode:
+            dec->last_error = MP3D_E_DECODE;
+            return 0;
+        }
+        if (dec->buffer_samples)
+        {
+            dec->buffer_samples *= frame_info->channels; /* multiply by the channel count; the counters below use these units */
+            if (dec->to_skip)
+            {   /* drop samples that are still owed, e.g. after a seek set dec->to_skip */
+                size_t skip = MINIMP3_MIN(dec->buffer_samples, dec->to_skip);
+                dec->buffer_consumed += skip;
+                dec->to_skip -= skip;
+            }
+            if (
+#ifdef MINIMP3_ALLOW_MONO_STEREO_TRANSITION
+                !(dec->flags & MP3D_ALLOW_MONO_STEREO_TRANSITION) &&
+#endif
+                dec->buffer_consumed != dec->buffer_samples && dec->info.channels != frame_info->channels)
+            {   /* channel count changed on a frame that still has samples to deliver */
+                goto return_e_decode;
+            }
+        } else if (dec->to_skip)
+        {   /* Because of the bit reservoir, decoding cannot always start at an arbitrary frame;
+               count the samples of such undecodable frames as skipped. */
+            int frame_samples = hdr_frame_samples(dec_buf)*frame_info->channels;
+            dec->to_skip -= MINIMP3_MIN(frame_samples, dec->to_skip);
+        }
+        dec->offset += frame_info->frame_bytes; /* done in both I/O modes */
+    }
+    size_t out_samples = MINIMP3_MIN((size_t)(dec->buffer_samples - dec->buffer_consumed), max_samples);
+    if (dec->detected_samples)
+    {   /* count decoded samples to properly cut padding */
+        if (dec->cur_sample + out_samples >= dec->detected_samples)
+            out_samples = dec->detected_samples - dec->cur_sample;
+    }
+    dec->cur_sample += out_samples;
+    *buf = dec->buffer + dec->buffer_consumed; /* points into the decoder's own buffer; nothing is copied here */
+    dec->buffer_consumed += out_samples;
+    return out_samples;
+}
+
+size_t mp3dec_ex_read(mp3dec_ex_t *dec, mp3d_sample_t *buf, size_t samples)
+{   /* Fills buf with up to `samples` samples by draining mp3dec_ex_read_frame(); returns how many were copied. */
+    size_t copied = 0;
+    mp3dec_frame_info_t scratch_info;
+
+    if (!dec)
+        return 0;
+    if (!buf)
+    {
+        dec->last_error = MP3D_E_PARAM;
+        return 0;
+    }
+    memset(&scratch_info, 0, sizeof(scratch_info));
+    while (copied < samples)
+    {
+        mp3d_sample_t *chunk = NULL;
+        const size_t got = mp3dec_ex_read_frame(dec, &chunk, &scratch_info, samples - copied);
+        if (!got)
+            break; /* frame reader delivered nothing more */
+        memcpy(buf + copied, chunk, got * sizeof(mp3d_sample_t));
+        copied += got;
+    }
+
+    return copied;
+}
+
+int mp3dec_ex_open_cb(mp3dec_ex_t *dec, mp3dec_io_t *io, int flags)
+{   /* Opens dec on user-supplied I/O callbacks. Returns 0 or an MP3D_E_* code; on failure after allocation dec is released and zeroed through mp3dec_ex_close(). */
+    if (!dec || !io || (flags & (~MP3D_FLAGS_MASK)))
+        return MP3D_E_PARAM;
+    memset(dec, 0, sizeof(*dec));
+#ifdef MINIMP3_HAVE_RING
+    int ring_ret; /* distinct name: `ret` is declared further down in this same scope */
+    if ((ring_ret = mp3dec_open_ring(&dec->file, MINIMP3_IO_SIZE)))
+        return ring_ret;
+#else
+    dec->file.size = MINIMP3_IO_SIZE;
+    dec->file.buffer = (const uint8_t*)malloc(dec->file.size);
+    if (!dec->file.buffer)
+        return MP3D_E_MEMORY;
+#endif
+    dec->flags = flags;
+    dec->io = io; /* from here on mp3dec_ex_close() treats dec->file as the staging buffer to release */
+    mp3dec_init(&dec->mp3d);
+    if (io->seek(0, io->seek_data))
+        return mp3dec_ex_close(dec), MP3D_E_IOERROR; /* do not leak the staging buffer */
+    int ret = mp3dec_iterate_cb(io, (uint8_t *)dec->file.buffer, dec->file.size, mp3dec_load_index, dec);
+    if (ret && MP3D_E_USER != ret)
+        return mp3dec_ex_close(dec), ret; /* also frees dec->index.frames if the scan allocated it */
+    if (dec->io->seek(dec->start_offset, dec->io->seek_data))
+        return mp3dec_ex_close(dec), MP3D_E_IOERROR;
+    mp3dec_init(&dec->mp3d);
+    dec->buffer_samples = 0;
+    dec->indexes_built = !(dec->vbr_tag_found || (flags & MP3D_DO_NOT_SCAN));
+    dec->flags &= (~MP3D_DO_NOT_SCAN);
+    return 0;
+}
+
+
+#ifndef MINIMP3_NO_STDIO
+
+#if defined(__linux__) || defined(__FreeBSD__)
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#if !defined(_GNU_SOURCE)
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+#if !defined(MAP_POPULATE) && defined(__linux__)
+#define MAP_POPULATE 0x08000
+#elif !defined(MAP_POPULATE)
+#define MAP_POPULATE 0
+#endif
+
+static void mp3dec_close_file(mp3dec_map_info_t *map_info)
+{   /* Unmaps the file view if one was established, then resets map_info to the empty state. */
+    if (!(NULL == map_info->buffer || MAP_FAILED == map_info->buffer))
+        munmap((void *)map_info->buffer, map_info->size);
+    map_info->size   = 0;
+    map_info->buffer = NULL;
+}
+
+static int mp3dec_open_file(const char *file_name, mp3dec_map_info_t *map_info)
+{   /* Maps file_name read-only into memory. Returns 0 with map_info filled in, MP3D_E_PARAM for a NULL name, or MP3D_E_IOERROR. */
+    if (!file_name)
+        return MP3D_E_PARAM;
+    int file;
+    struct stat st;
+    memset(map_info, 0, sizeof(*map_info));
+retry_open:
+    file = open(file_name, O_RDONLY);
+    if (file < 0 && (errno == EAGAIN || errno == EINTR))
+        goto retry_open;
+    if (file < 0 || fstat(file, &st) < 0)
+    {
+        if (file >= 0) /* open() itself may have failed: never hand -1 to close() */
+            close(file);
+        return MP3D_E_IOERROR;
+    }
+    map_info->size = st.st_size;
+retry_mmap:
+    map_info->buffer = (const uint8_t *)mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE | MAP_POPULATE, file, 0);
+    if (MAP_FAILED == map_info->buffer && (errno == EAGAIN || errno == EINTR))
+        goto retry_mmap;
+    close(file); /* an established mapping does not depend on the descriptor staying open */
+    if (MAP_FAILED == map_info->buffer)
+        return MP3D_E_IOERROR;
+    return 0;
+}
+
+#if MINIMP3_ENABLE_RING && defined(__linux__) && defined(_GNU_SOURCE)
+#define MINIMP3_HAVE_RING
+static void mp3dec_close_ring(mp3dec_map_info_t *map_info)
+{   /* Releases the double mapping created by mp3dec_open_ring() and resets map_info to the empty state. */
+#if defined(__linux__) && defined(_GNU_SOURCE)
+    if (map_info->buffer && MAP_FAILED != map_info->buffer)
+        munmap((void *)map_info->buffer, map_info->size*2); /* the reservation covers two views of map_info->size bytes each */
+#else
+    if (map_info->buffer)
+    {   /* shmat-based variant: detach both attachments */
+        shmdt(map_info->buffer);
+        shmdt(map_info->buffer + map_info->size);
+    }
+#endif
+    map_info->buffer = 0;
+    map_info->size   = 0;
+}
+
+static int mp3dec_open_ring(mp3dec_map_info_t *map_info, size_t size)
+{   /* Maps one page-rounded memory object twice, back to back, at map_info->buffer. Returns 0 or MP3D_E_MEMORY. */
+    int memfd, page_size;
+#if defined(__linux__) && defined(_GNU_SOURCE)
+    void *buffer;
+    int res;
+#endif
+    memset(map_info, 0, sizeof(*map_info));
+
+#ifdef _SC_PAGESIZE
+    page_size = sysconf(_SC_PAGESIZE);
+#else
+    page_size = getpagesize();
+#endif
+    map_info->size = (size + page_size - 1)/page_size*page_size; /* round the request up to a page multiple */
+
+#if defined(__linux__) && defined(_GNU_SOURCE)
+    memfd = memfd_create("mp3_ring", 0);
+    if (memfd < 0)
+        return MP3D_E_MEMORY;
+
+retry_ftruncate:
+    res = ftruncate(memfd, map_info->size);
+    if (res && (errno == EAGAIN || errno == EINTR))
+        goto retry_ftruncate;
+    if (res)
+        goto error;
+
+retry_mmap: /* reserve 2*size bytes of contiguous, inaccessible address space */
+    map_info->buffer = (const uint8_t *)mmap(NULL, map_info->size*2, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    if (MAP_FAILED == map_info->buffer && (errno == EAGAIN || errno == EINTR))
+        goto retry_mmap;
+    if (MAP_FAILED == map_info->buffer || !map_info->buffer)
+        goto error;
+retry_mmap2: /* first view of the memfd, placed over the lower half; test the fresh result, not the reservation */
+    buffer = mmap((void *)map_info->buffer, map_info->size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, memfd, 0);
+    if (MAP_FAILED == buffer && (errno == EAGAIN || errno == EINTR))
+        goto retry_mmap2;
+    if (MAP_FAILED == buffer || buffer != (void *)map_info->buffer)
+        goto error;
+retry_mmap3: /* second view of the same memfd over the upper half; offset is applied before the cast to void* */
+    buffer = mmap((void *)(map_info->buffer + map_info->size), map_info->size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, memfd, 0);
+    if (MAP_FAILED == buffer && (errno == EAGAIN || errno == EINTR))
+        goto retry_mmap3;
+    if (MAP_FAILED == buffer || buffer != (void *)(map_info->buffer + map_info->size))
+        goto error;
+
+    close(memfd);
+    return 0;
+error:
+    close(memfd);
+    mp3dec_close_ring(map_info); /* unmaps the reservation (if any) and zeroes map_info */
+    return MP3D_E_MEMORY;
+#else /* NOTE(review): not compiled while the enclosing #if requires __linux__ && _GNU_SOURCE */
+    memfd = shmget(IPC_PRIVATE, map_info->size, IPC_CREAT | 0700);
+    if (memfd < 0)
+        return MP3D_E_MEMORY;
+retry_mmap:
+    map_info->buffer = (const uint8_t *)mmap(NULL, map_info->size*2, PROT_NONE, MAP_PRIVATE, -1, 0);
+    if (MAP_FAILED == map_info->buffer && (errno == EAGAIN || errno == EINTR))
+        goto retry_mmap;
+    if (MAP_FAILED == map_info->buffer)
+        goto error;
+    if (map_info->buffer != shmat(memfd, map_info->buffer, 0))
+        goto error;
+    if ((map_info->buffer + map_info->size) != shmat(memfd, map_info->buffer + map_info->size, 0))
+        goto error;
+    if (shmctl(memfd, IPC_RMID, NULL) < 0)
+        return MP3D_E_MEMORY;
+    return 0;
+error:
+    shmctl(memfd, IPC_RMID, NULL);
+    mp3dec_close_ring(map_info);
+    return MP3D_E_MEMORY;
+#endif
+}
+#endif /*MINIMP3_ENABLE_RING*/
+#elif defined(_WIN32)
+#include <windows.h>
+
+static void mp3dec_close_file(mp3dec_map_info_t *map_info)
+{   /* Drops the mapped view, if there is one, and marks map_info as empty. */
+    if (NULL != map_info->buffer)
+        UnmapViewOfFile(map_info->buffer);
+    map_info->size   = 0;
+    map_info->buffer = NULL;
+}
+
+static int mp3dec_open_file_h(HANDLE file, mp3dec_map_info_t *map_info)
+{   /* Maps the whole of `file` read-only into map_info. The handle is closed on every path; returns 0 or MP3D_E_IOERROR. */
+    int status = MP3D_E_IOERROR;
+    HANDLE view_source = NULL;
+    LARGE_INTEGER file_len;
+
+    memset(map_info, 0, sizeof(*map_info));
+    file_len.LowPart = GetFileSize(file, (DWORD*)&file_len.HighPart);
+    if (INVALID_FILE_SIZE != file_len.LowPart || NO_ERROR == GetLastError())
+    {   /* size query succeeded; the last-error value is only consulted when the low dword equals INVALID_FILE_SIZE */
+        map_info->size = file_len.QuadPart;
+        view_source = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
+    }
+    if (view_source)
+    {
+        map_info->buffer = (const uint8_t*)MapViewOfFile(view_source, FILE_MAP_READ, 0, 0, file_len.QuadPart);
+        CloseHandle(view_source);
+        if (map_info->buffer)
+            status = 0;
+    }
+    if (status)
+        mp3dec_close_file(map_info); /* leaves buffer and size zeroed for the caller */
+
+    CloseHandle(file);
+    return status;
+}
+
+static int mp3dec_open_file(const char *file_name, mp3dec_map_info_t *map_info)
+{   /* Narrow-character path variant: opens file_name for shared reading and passes the handle to mp3dec_open_file_h(). */
+    if (NULL == file_name)
+        return MP3D_E_PARAM;
+    HANDLE opened = CreateFileA(file_name, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
+    if (opened != INVALID_HANDLE_VALUE)
+        return mp3dec_open_file_h(opened, map_info);
+    return MP3D_E_IOERROR;
+}
+
+static int mp3dec_open_file_w(const wchar_t *file_name, mp3dec_map_info_t *map_info)
+{   /* Wide-character path variant: opens file_name for shared reading and passes the handle to mp3dec_open_file_h(). */
+    if (NULL == file_name)
+        return MP3D_E_PARAM;
+    HANDLE opened = CreateFileW(file_name, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
+    if (opened != INVALID_HANDLE_VALUE)
+        return mp3dec_open_file_h(opened, map_info);
+    return MP3D_E_IOERROR;
+}
+#else
+#include <stdio.h>
+
+static void mp3dec_close_file(mp3dec_map_info_t *map_info)
+{   /* Frees the heap copy of the file (free(NULL) is a no-op) and resets map_info. */
+    void *heap_copy = (void *)map_info->buffer;
+    map_info->buffer = NULL;
+    map_info->size = 0;
+    free(heap_copy);
+}
+
+static int mp3dec_open_file(const char *file_name, mp3dec_map_info_t *map_info)
+{   /* stdio variant: reads the whole file into a malloc'ed buffer. Returns 0, MP3D_E_PARAM, MP3D_E_IOERROR or MP3D_E_MEMORY. */
+    FILE *stream;
+    long length = -1;
+    int status = MP3D_E_IOERROR;
+
+    if (NULL == file_name)
+        return MP3D_E_PARAM;
+    memset(map_info, 0, sizeof(*map_info));
+    stream = fopen(file_name, "rb");
+    if (NULL == stream)
+        return status;
+
+    /* measure the file by seeking to its end; the stream is rewound before reading */
+    if (0 == fseek(stream, 0, SEEK_END))
+        length = ftell(stream);
+    if (length >= 0)
+    {
+        map_info->size = (size_t)length;
+        if (0 == fseek(stream, 0, SEEK_SET))
+        {
+            map_info->buffer = (uint8_t *)malloc(map_info->size);
+            if (NULL == map_info->buffer)
+                status = MP3D_E_MEMORY;
+            else if (fread((void *)map_info->buffer, 1, map_info->size, stream) == map_info->size)
+                status = 0;
+        }
+    }
+    if (status)
+        mp3dec_close_file(map_info); /* frees a partially filled buffer and zeroes map_info */
+    fclose(stream);
+    return status;
+}
+#endif
+
+static int mp3dec_detect_mapinfo(mp3dec_map_info_t *map_info)
+{   /* Passes the mapped bytes to mp3dec_detect_buf(), then releases the mapping whatever the outcome. */
+    const int verdict = mp3dec_detect_buf(map_info->buffer, map_info->size);
+    mp3dec_close_file(map_info);
+    return verdict;
+}
+
+static int mp3dec_load_mapinfo(mp3dec_t *dec, mp3dec_map_info_t *map_info, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{   /* Passes the mapped bytes to mp3dec_load_buf(), then releases the mapping whatever the outcome. */
+    const int status = mp3dec_load_buf(dec, map_info->buffer, map_info->size, info, progress_cb, user_data);
+    mp3dec_close_file(map_info);
+    return status;
+}
+
+static int mp3dec_iterate_mapinfo(mp3dec_map_info_t *map_info, MP3D_ITERATE_CB callback, void *user_data)
+{   /* Passes the mapped bytes to mp3dec_iterate_buf(), then releases the mapping whatever the outcome. */
+    const int status = mp3dec_iterate_buf(map_info->buffer, map_info->size, callback, user_data);
+    mp3dec_close_file(map_info);
+    return status;
+}
+
+static int mp3dec_ex_open_mapinfo(mp3dec_ex_t *dec, int flags)
+{   /* Opens the decoder over the file already mapped into dec->file; on failure the decoder is closed again. */
+    const int status = mp3dec_ex_open_buf(dec, dec->file.buffer, dec->file.size, flags);
+    dec->is_file = 1; /* set before the failure check so mp3dec_ex_close() also releases the mapping */
+    if (0 != status)
+        mp3dec_ex_close(dec);
+    return status;
+}
+
+int mp3dec_detect(const char *file_name)
+{   /* Opens file_name and hands it to mp3dec_detect_mapinfo(), which also releases it; an open failure is returned as is. */
+    mp3dec_map_info_t mapped;
+    const int open_err = mp3dec_open_file(file_name, &mapped);
+    if (open_err)
+        return open_err;
+    return mp3dec_detect_mapinfo(&mapped);
+}
+
+int mp3dec_load(mp3dec_t *dec, const char *file_name, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{   /* Opens file_name and hands it to mp3dec_load_mapinfo(), which also releases it; an open failure is returned as is. */
+    mp3dec_map_info_t mapped;
+    const int open_err = mp3dec_open_file(file_name, &mapped);
+    if (open_err)
+        return open_err;
+    return mp3dec_load_mapinfo(dec, &mapped, info, progress_cb, user_data);
+}
+
+int mp3dec_iterate(const char *file_name, MP3D_ITERATE_CB callback, void *user_data)
+{   /* Opens file_name and hands it to mp3dec_iterate_mapinfo(), which also releases it; an open failure is returned as is. */
+    mp3dec_map_info_t mapped;
+    const int open_err = mp3dec_open_file(file_name, &mapped);
+    if (open_err)
+        return open_err;
+    return mp3dec_iterate_mapinfo(&mapped, callback, user_data);
+}
+
+int mp3dec_ex_open(mp3dec_ex_t *dec, const char *file_name, int flags)
+{   /* Opens file_name into dec->file and then opens the decoder on it; returns 0 or an MP3D_E_* code. */
+    if (NULL == dec)
+        return MP3D_E_PARAM;
+    const int open_err = mp3dec_open_file(file_name, &dec->file);
+    if (open_err)
+        return open_err;
+    return mp3dec_ex_open_mapinfo(dec, flags);
+}
+
+void mp3dec_ex_close(mp3dec_ex_t *dec)
+{   /* Releases the I/O staging buffer or the opened file, plus the frame index, then zeroes the whole structure. */
+#ifdef MINIMP3_HAVE_RING
+    if (dec->io)
+        mp3dec_close_ring(&dec->file);
+#else
+    if (dec->io)
+        free((void*)dec->file.buffer); /* free(NULL) is a no-op */
+#endif
+    if (dec->is_file)
+        mp3dec_close_file(&dec->file);
+    free(dec->index.frames);
+
+    memset(dec, 0, sizeof(*dec));
+}
+
+#ifdef _WIN32
+int mp3dec_detect_w(const wchar_t *file_name)
+{   /* Wide-path twin of mp3dec_detect(): opens the file, then lets mp3dec_detect_mapinfo() use and release it. */
+    mp3dec_map_info_t mapped;
+    const int open_err = mp3dec_open_file_w(file_name, &mapped);
+    if (open_err)
+        return open_err;
+    return mp3dec_detect_mapinfo(&mapped);
+}
+
+int mp3dec_load_w(mp3dec_t *dec, const wchar_t *file_name, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{   /* Wide-path twin of mp3dec_load(): opens the file, then lets mp3dec_load_mapinfo() use and release it. */
+    mp3dec_map_info_t mapped;
+    const int open_err = mp3dec_open_file_w(file_name, &mapped);
+    if (open_err)
+        return open_err;
+    return mp3dec_load_mapinfo(dec, &mapped, info, progress_cb, user_data);
+}
+
+int mp3dec_iterate_w(const wchar_t *file_name, MP3D_ITERATE_CB callback, void *user_data)
+{   /* Wide-path twin of mp3dec_iterate(): opens the file, then lets mp3dec_iterate_mapinfo() use and release it. */
+    mp3dec_map_info_t mapped;
+    const int open_err = mp3dec_open_file_w(file_name, &mapped);
+    if (open_err)
+        return open_err;
+    return mp3dec_iterate_mapinfo(&mapped, callback, user_data);
+}
+
+int mp3dec_ex_open_w(mp3dec_ex_t *dec, const wchar_t *file_name, int flags)
+{   /* Wide-path twin of mp3dec_ex_open(): returns MP3D_E_PARAM for a NULL dec, an open error, or the result of mp3dec_ex_open_mapinfo(). */
+    if (!dec) /* same guard as mp3dec_ex_open(); without it &dec->file is formed from NULL */
+        return MP3D_E_PARAM;
+    const int ret = mp3dec_open_file_w(file_name, &dec->file);
+    return ret ? ret : mp3dec_ex_open_mapinfo(dec, flags);
+}
+#endif
+#else /* MINIMP3_NO_STDIO */
+void mp3dec_ex_close(mp3dec_ex_t *dec)
+{   /* MINIMP3_NO_STDIO build: releases the I/O staging buffer and the frame index, then zeroes the whole structure. */
+#ifdef MINIMP3_HAVE_RING
+    if (dec->io)
+        mp3dec_close_ring(&dec->file);
+#else
+    if (dec->io)
+        free((void*)dec->file.buffer); /* free(NULL) is a no-op */
+#endif
+    free(dec->index.frames);
+
+    memset(dec, 0, sizeof(*dec));
+}
+#endif
+
+#endif /*MINIMP3_IMPLEMENTATION*/
diff --git a/thirdparty/misc/easing_equations.cpp b/thirdparty/misc/easing_equations.cpp
index bc84564b19..af48aaf079 100644
--- a/thirdparty/misc/easing_equations.cpp
+++ b/thirdparty/misc/easing_equations.cpp
@@ -188,7 +188,8 @@ static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
 ///////////////////////////////////////////////////////////////////////////
 namespace cubic {
 static real_t in(real_t t, real_t b, real_t c, real_t d) {
-	return c * (t /= d) * t * t + b;
+	t /= d;
+	return c * t * t * t + b;
 }
 
 static real_t out(real_t t, real_t b, real_t c, real_t d) {
@@ -197,8 +198,10 @@ static real_t out(real_t t, real_t b, real_t c, real_t d) {
 }
 
 static real_t in_out(real_t t, real_t b, real_t c, real_t d) {
-	if ((t /= d / 2) < 1) return c / 2 * t * t * t + b;
-	return c / 2 * ((t -= 2) * t * t + 2) + b;
+	t /= d / 2;
+	if (t < 1) return c / 2 * t * t * t + b;
+	t -= 2;
+	return c / 2 * (t * t * t + 2) + b;
 }
 
 static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
@@ -210,16 +213,22 @@ static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
 ///////////////////////////////////////////////////////////////////////////
 namespace circ {
 static real_t in(real_t t, real_t b, real_t c, real_t d) {
-	return -c * (sqrt(1 - (t /= d) * t) - 1) + b; // TODO: ehrich: operation with t is undefined
+	t /= d;
+	return -c * (sqrt(1 - t * t) - 1) + b;
 }
 
 static real_t out(real_t t, real_t b, real_t c, real_t d) {
-	return c * sqrt(1 - (t = t / d - 1) * t) + b; // TODO: ehrich: operation with t is undefined
+	t = t / d - 1;
+	return c * sqrt(1 - t * t) + b;
 }
 
 static real_t in_out(real_t t, real_t b, real_t c, real_t d) {
-	if ((t /= d / 2) < 1) return -c / 2 * (sqrt(1 - t * t) - 1) + b;
-	return c / 2 * (sqrt(1 - t * (t -= 2)) + 1) + b; // TODO: ehrich: operation with t is undefined
+	t /= d / 2;
+	if (t < 1) {
+		return -c / 2 * (sqrt(1 - t * t) - 1) + b;
+	}
+	t -= 2;
+	return c / 2 * (sqrt(1 - t * t) + 1) + b;
 }
 
 static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
@@ -271,14 +280,16 @@ static real_t in(real_t t, real_t b, real_t c, real_t d) {
 
 static real_t out(real_t t, real_t b, real_t c, real_t d) {
 	float s = 1.70158f;
-	return c * ((t = t / d - 1) * t * ((s + 1) * t + s) + 1) + b; // TODO: ehrich: operation with t is undefined
+	t = t / d - 1;
+	return c * (t * t * ((s + 1) * t + s) + 1) + b;
 }
 
 static real_t in_out(real_t t, real_t b, real_t c, real_t d) {
-	float s = 1.70158f;
-	if ((t /= d / 2) < 1) return c / 2 * (t * t * (((s *= (1.525f)) + 1) * t - s)) + b; // TODO: ehrich: operation with s is undefined
-	float postFix = t -= 2;
-	return c / 2 * ((postFix)*t * (((s *= (1.525f)) + 1) * t + s) + 2) + b; // TODO: ehrich: operation with s is undefined
+	float s = 1.70158f * 1.525f;
+	t /= d / 2;
+	if (t < 1) return c / 2 * (t * t * ((s + 1) * t - s)) + b;
+	t -= 2;
+	return c / 2 * (t * t * ((s + 1) * t + s) + 2) + b;
 }
 
 static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
diff --git a/thirdparty/misc/open-simplex-noise.c b/thirdparty/misc/open-simplex-noise.c
index 88fbd3e51d..44a072cad1 100644
--- a/thirdparty/misc/open-simplex-noise.c
+++ b/thirdparty/misc/open-simplex-noise.c
@@ -100,27 +100,27 @@ static const signed char gradients4D[] = {
 	-3, -1, -1, -1,     -1, -3, -1, -1,     -1, -1, -3, -1,     -1, -1, -1, -3,
 };
 
-static double extrapolate2(struct osn_context *ctx, int xsb, int ysb, double dx, double dy)
+static double extrapolate2(const struct osn_context *ctx, int xsb, int ysb, double dx, double dy)
 {
-	int16_t *perm = ctx->perm;	
+	const int16_t *perm = ctx->perm;
 	int index = perm[(perm[xsb & 0xFF] + ysb) & 0xFF] & 0x0E;
 	return gradients2D[index] * dx
 		+ gradients2D[index + 1] * dy;
 }
 	
-static double extrapolate3(struct osn_context *ctx, int xsb, int ysb, int zsb, double dx, double dy, double dz)
+static double extrapolate3(const struct osn_context *ctx, int xsb, int ysb, int zsb, double dx, double dy, double dz)
 {
-	int16_t *perm = ctx->perm;	
-	int16_t *permGradIndex3D = ctx->permGradIndex3D;
+	const int16_t *perm = ctx->perm;
+	const int16_t *permGradIndex3D = ctx->permGradIndex3D;
 	int index = permGradIndex3D[(perm[(perm[xsb & 0xFF] + ysb) & 0xFF] + zsb) & 0xFF];
 	return gradients3D[index] * dx
 		+ gradients3D[index + 1] * dy
 		+ gradients3D[index + 2] * dz;
 }
 	
-static double extrapolate4(struct osn_context *ctx, int xsb, int ysb, int zsb, int wsb, double dx, double dy, double dz, double dw)
+static double extrapolate4(const struct osn_context *ctx, int xsb, int ysb, int zsb, int wsb, double dx, double dy, double dz, double dw)
 {
-	int16_t *perm = ctx->perm;
+	const int16_t *perm = ctx->perm;
 	int index = perm[(perm[(perm[(perm[xsb & 0xFF] + ysb) & 0xFF] + zsb) & 0xFF] + wsb) & 0xFF] & 0xFC;
 	return gradients4D[index] * dx
 		+ gradients4D[index + 1] * dy
@@ -227,7 +227,7 @@ void open_simplex_noise_free(struct osn_context *ctx)
 // -- GODOT end --
 	
 /* 2D OpenSimplex (Simplectic) Noise. */
-double open_simplex_noise2(struct osn_context *ctx, double x, double y) 
+double open_simplex_noise2(const struct osn_context *ctx, double x, double y)
 {
 	
 	/* Place input coordinates onto grid. */
@@ -355,7 +355,7 @@ double open_simplex_noise2(struct osn_context *ctx, double x, double y)
 /*
  * 3D OpenSimplex (Simplectic) Noise
  */
-double open_simplex_noise3(struct osn_context *ctx, double x, double y, double z)
+double open_simplex_noise3(const struct osn_context *ctx, double x, double y, double z)
 {
 
 	/* Place input coordinates on simplectic honeycomb. */
@@ -928,7 +928,7 @@ double open_simplex_noise3(struct osn_context *ctx, double x, double y, double z
 /* 
  * 4D OpenSimplex (Simplectic) Noise.
  */
-double open_simplex_noise4(struct osn_context *ctx, double x, double y, double z, double w)
+double open_simplex_noise4(const struct osn_context *ctx, double x, double y, double z, double w)
 {
 	double uins;
 	double dx1, dy1, dz1, dw1;
diff --git a/thirdparty/misc/open-simplex-noise.h b/thirdparty/misc/open-simplex-noise.h
index 89e0df8218..fd9248c3a1 100644
--- a/thirdparty/misc/open-simplex-noise.h
+++ b/thirdparty/misc/open-simplex-noise.h
@@ -47,9 +47,9 @@ int open_simplex_noise(int64_t seed, struct osn_context *ctx);
 //int open_simplex_noise_init_perm(struct osn_context *ctx, int16_t p[], int nelements);
 // -- GODOT end --
 void open_simplex_noise_free(struct osn_context *ctx);
-double open_simplex_noise2(struct osn_context *ctx, double x, double y);
-double open_simplex_noise3(struct osn_context *ctx, double x, double y, double z);
-double open_simplex_noise4(struct osn_context *ctx, double x, double y, double z, double w);
+double open_simplex_noise2(const struct osn_context *ctx, double x, double y);
+double open_simplex_noise3(const struct osn_context *ctx, double x, double y, double z);
+double open_simplex_noise4(const struct osn_context *ctx, double x, double y, double z, double w);
 
 #ifdef __cplusplus
 	}
diff --git a/thirdparty/xatlas/xatlas.cpp b/thirdparty/xatlas/xatlas.cpp
index 43aec33a9f..9f66ae0067 100644
--- a/thirdparty/xatlas/xatlas.cpp
+++ b/thirdparty/xatlas/xatlas.cpp
@@ -33,19 +33,25 @@ https://github.com/brandonpelfrey/Fast-BVH
 MIT License
 Copyright (c) 2012 Brandon Pelfrey
 */
-#include <atomic>
-#include <condition_variable>
-#include <mutex>
-#include <thread>
+#include "xatlas.h"
+#ifndef XATLAS_C_API
+#define XATLAS_C_API 0
+#endif
+#if XATLAS_C_API
+#include "xatlas_c.h"
+#endif
 #include <assert.h>
 #include <float.h> // FLT_MAX
 #include <limits.h>
 #include <math.h>
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <thread>
 #define __STDC_LIMIT_MACROS
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
-#include "xatlas.h"
 
 #ifndef XA_DEBUG
 #ifdef NDEBUG
@@ -59,7 +65,7 @@ Copyright (c) 2012 Brandon Pelfrey
 #define XA_PROFILE 0
 #endif
 #if XA_PROFILE
-#include <time.h>
+#include <chrono>
 #endif
 
 #ifndef XA_MULTITHREADED
@@ -70,7 +76,10 @@ Copyright (c) 2012 Brandon Pelfrey
 #define XA_XSTR(x) XA_STR(x)
 
 #ifndef XA_ASSERT
-#define XA_ASSERT(exp) if (!(exp)) { XA_PRINT_WARNING("\rASSERT: %s %s %d\n", XA_XSTR(exp), __FILE__, __LINE__); }
+#define XA_ASSERT(exp)                                                              \
+	if (!(exp)) {                                                                   \
+		XA_PRINT_WARNING("\rASSERT: %s %s %d\n", XA_XSTR(exp), __FILE__, __LINE__); \
+	}
 #endif
 
 #ifndef XA_DEBUG_ASSERT
@@ -78,13 +87,13 @@ Copyright (c) 2012 Brandon Pelfrey
 #endif
 
 #ifndef XA_PRINT
-#define XA_PRINT(...) \
+#define XA_PRINT(...)                                                  \
 	if (xatlas::internal::s_print && xatlas::internal::s_printVerbose) \
 		xatlas::internal::s_print(__VA_ARGS__);
 #endif
 
 #ifndef XA_PRINT_WARNING
-#define XA_PRINT_WARNING(...) \
+#define XA_PRINT_WARNING(...)      \
 	if (xatlas::internal::s_print) \
 		xatlas::internal::s_print(__VA_ARGS__);
 #endif
@@ -116,9 +125,9 @@ Copyright (c) 2012 Brandon Pelfrey
 #define XA_MERGE_CHARTS 1
 #define XA_MERGE_CHARTS_MIN_NORMAL_DEVIATION 0.5f
 #define XA_RECOMPUTE_CHARTS 1
-#define XA_CLOSE_HOLES_CHECK_EDGE_INTERSECTION 0
-#define XA_FIX_INTERNAL_BOUNDARY_LOOPS 1
-#define XA_PRINT_CHART_WARNINGS 0
+#define XA_CHECK_PARAM_WINDING 0
+#define XA_CHECK_PIECEWISE_CHART_QUALITY 0
+#define XA_CHECK_T_JUNCTIONS 0
 
 #define XA_DEBUG_HEAP 0
 #define XA_DEBUG_SINGLE_CHART 0
@@ -131,25 +140,19 @@ Copyright (c) 2012 Brandon Pelfrey
 #define XA_DEBUG_EXPORT_OBJ_CHART_GROUPS 0
 #define XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS 0
 #define XA_DEBUG_EXPORT_OBJ_CHARTS 0
-#define XA_DEBUG_EXPORT_OBJ_BEFORE_FIX_TJUNCTION 0
-#define XA_DEBUG_EXPORT_OBJ_CLOSE_HOLES_ERROR 0
+#define XA_DEBUG_EXPORT_OBJ_TJUNCTION 0 // XA_CHECK_T_JUNCTIONS must also be set
 #define XA_DEBUG_EXPORT_OBJ_CHARTS_AFTER_PARAMETERIZATION 0
 #define XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION 0
 #define XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS 0
 
-#define XA_DEBUG_EXPORT_OBJ (0 \
-	|| XA_DEBUG_EXPORT_OBJ_FACE_GROUPS \
-	|| XA_DEBUG_EXPORT_OBJ_CHART_GROUPS \
-	|| XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS \
-	|| XA_DEBUG_EXPORT_OBJ_CHARTS \
-	|| XA_DEBUG_EXPORT_OBJ_BEFORE_FIX_TJUNCTION \
-	|| XA_DEBUG_EXPORT_OBJ_CLOSE_HOLES_ERROR \
-	|| XA_DEBUG_EXPORT_OBJ_CHARTS_AFTER_PARAMETERIZATION \
-	|| XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION \
-	|| XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS)
+#define XA_DEBUG_EXPORT_OBJ (0 || XA_DEBUG_EXPORT_OBJ_FACE_GROUPS || XA_DEBUG_EXPORT_OBJ_CHART_GROUPS || XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS || XA_DEBUG_EXPORT_OBJ_CHARTS || XA_DEBUG_EXPORT_OBJ_TJUNCTION || XA_DEBUG_EXPORT_OBJ_CHARTS_AFTER_PARAMETERIZATION || XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION || XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS)
 
 #ifdef _MSC_VER
-#define XA_FOPEN(_file, _filename, _mode) { if (fopen_s(&_file, _filename, _mode) != 0) _file = NULL; }
+#define XA_FOPEN(_file, _filename, _mode)           \
+	{                                               \
+		if (fopen_s(&_file, _filename, _mode) != 0) \
+			_file = NULL;                           \
+	}
 #define XA_SPRINTF(_buffer, _size, _format, ...) sprintf_s(_buffer, _size, _format, __VA_ARGS__)
 #else
 #define XA_FOPEN(_file, _filename, _mode) _file = fopen(_filename, _mode)
@@ -165,74 +168,76 @@ static PrintFunc s_print = printf;
 static bool s_printVerbose = false;
 
 #if XA_PROFILE
-#define XA_PROFILE_START(var) const clock_t var##Start = clock();
-#define XA_PROFILE_END(var) internal::s_profile.var += clock() - var##Start;
-#define XA_PROFILE_PRINT_AND_RESET(label, var) XA_PRINT("%s%.2f seconds (%g ms)\n", label, internal::clockToSeconds(internal::s_profile.var), internal::clockToMs(internal::s_profile.var)); internal::s_profile.var = 0;
+typedef uint64_t Duration;
+
+#define XA_PROFILE_START(var) const std::chrono::time_point<std::chrono::high_resolution_clock> var##Start = std::chrono::high_resolution_clock::now();
+#define XA_PROFILE_END(var) internal::s_profile.var += uint64_t(std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - var##Start).count());
+#define XA_PROFILE_PRINT_AND_RESET(label, var)                                                                                                          \
+	XA_PRINT("%s%.2f seconds (%g ms)\n", label, internal::durationToSeconds(internal::s_profile.var), internal::durationToMs(internal::s_profile.var)); \
+	internal::s_profile.var = 0u;
 #define XA_PROFILE_ALLOC 0
 
-struct ProfileData
-{
+struct ProfileData {
 #if XA_PROFILE_ALLOC
-	std::atomic<clock_t> alloc;
+	std::atomic<Duration> alloc;
 #endif
-	clock_t addMeshReal;
-	clock_t addMeshCopyData;
-	std::atomic<clock_t> addMeshThread;
-	std::atomic<clock_t> addMeshCreateColocals;
-	clock_t computeChartsReal;
-	std::atomic<clock_t> computeChartsThread;
-	std::atomic<clock_t> createFaceGroups;
-	std::atomic<clock_t> extractInvalidMeshGeometry;
-	std::atomic<clock_t> chartGroupComputeChartsReal;
-	std::atomic<clock_t> chartGroupComputeChartsThread;
-	std::atomic<clock_t> createChartGroupMesh;
-	std::atomic<clock_t> createChartGroupMeshColocals;
-	std::atomic<clock_t> createChartGroupMeshBoundaries;
-	std::atomic<clock_t> buildAtlas;
-	std::atomic<clock_t> buildAtlasInit;
-	std::atomic<clock_t> planarCharts;
-	std::atomic<clock_t> clusteredCharts;
-	std::atomic<clock_t> clusteredChartsPlaceSeeds;
-	std::atomic<clock_t> clusteredChartsPlaceSeedsBoundaryIntersection;
-	std::atomic<clock_t> clusteredChartsRelocateSeeds;
-	std::atomic<clock_t> clusteredChartsReset;
-	std::atomic<clock_t> clusteredChartsGrow;
-	std::atomic<clock_t> clusteredChartsGrowBoundaryIntersection;
-	std::atomic<clock_t> clusteredChartsMerge;
-	std::atomic<clock_t> clusteredChartsFillHoles;
-	std::atomic<clock_t> copyChartFaces;
-	clock_t parameterizeChartsReal;
-	std::atomic<clock_t> parameterizeChartsThread;
-	std::atomic<clock_t> createChartMesh;
-	std::atomic<clock_t> fixChartMeshTJunctions;
-	std::atomic<clock_t> closeChartMeshHoles;
-	std::atomic<clock_t> parameterizeChartsOrthogonal;
-	std::atomic<clock_t> parameterizeChartsLSCM;
-	std::atomic<clock_t> parameterizeChartsRecompute;
-	std::atomic<clock_t> parameterizeChartsPiecewise;
-	std::atomic<clock_t> parameterizeChartsPiecewiseBoundaryIntersection;
-	std::atomic<clock_t> parameterizeChartsEvaluateQuality;
-	clock_t packCharts;
-	clock_t packChartsAddCharts;
-	std::atomic<clock_t> packChartsAddChartsThread;
-	std::atomic<clock_t> packChartsAddChartsRestoreTexcoords;
-	clock_t packChartsRasterize;
-	clock_t packChartsDilate;
-	clock_t packChartsFindLocation;
-	clock_t packChartsBlit;
-	clock_t buildOutputMeshes;
+	std::chrono::time_point<std::chrono::high_resolution_clock> addMeshRealStart;
+	Duration addMeshReal;
+	Duration addMeshCopyData;
+	std::atomic<Duration> addMeshThread;
+	std::atomic<Duration> addMeshCreateColocals;
+	Duration computeChartsReal;
+	std::atomic<Duration> computeChartsThread;
+	std::atomic<Duration> createFaceGroups;
+	std::atomic<Duration> extractInvalidMeshGeometry;
+	std::atomic<Duration> chartGroupComputeChartsReal;
+	std::atomic<Duration> chartGroupComputeChartsThread;
+	std::atomic<Duration> createChartGroupMesh;
+	std::atomic<Duration> createChartGroupMeshColocals;
+	std::atomic<Duration> createChartGroupMeshBoundaries;
+	std::atomic<Duration> buildAtlas;
+	std::atomic<Duration> buildAtlasInit;
+	std::atomic<Duration> planarCharts;
+	std::atomic<Duration> originalUvCharts;
+	std::atomic<Duration> clusteredCharts;
+	std::atomic<Duration> clusteredChartsPlaceSeeds;
+	std::atomic<Duration> clusteredChartsPlaceSeedsBoundaryIntersection;
+	std::atomic<Duration> clusteredChartsRelocateSeeds;
+	std::atomic<Duration> clusteredChartsReset;
+	std::atomic<Duration> clusteredChartsGrow;
+	std::atomic<Duration> clusteredChartsGrowBoundaryIntersection;
+	std::atomic<Duration> clusteredChartsMerge;
+	std::atomic<Duration> clusteredChartsFillHoles;
+	std::atomic<Duration> copyChartFaces;
+	std::atomic<Duration> createChartMeshAndParameterizeReal;
+	std::atomic<Duration> createChartMeshAndParameterizeThread;
+	std::atomic<Duration> createChartMesh;
+	std::atomic<Duration> parameterizeCharts;
+	std::atomic<Duration> parameterizeChartsOrthogonal;
+	std::atomic<Duration> parameterizeChartsLSCM;
+	std::atomic<Duration> parameterizeChartsRecompute;
+	std::atomic<Duration> parameterizeChartsPiecewise;
+	std::atomic<Duration> parameterizeChartsPiecewiseBoundaryIntersection;
+	std::atomic<Duration> parameterizeChartsEvaluateQuality;
+	Duration packCharts;
+	Duration packChartsAddCharts;
+	std::atomic<Duration> packChartsAddChartsThread;
+	std::atomic<Duration> packChartsAddChartsRestoreTexcoords;
+	Duration packChartsRasterize;
+	Duration packChartsDilate;
+	Duration packChartsFindLocation;
+	Duration packChartsBlit;
+	Duration buildOutputMeshes;
 };
 
 static ProfileData s_profile;
 
-static double clockToMs(clock_t c)
-{
-	return c * 1000.0 / CLOCKS_PER_SEC;
+static double durationToMs(Duration c) {
+	return (double)c * 0.001;
 }
 
-static double clockToSeconds(clock_t c)
-{
-	return c / (double)CLOCKS_PER_SEC;
+static double durationToSeconds(Duration c) {
+	return (double)c * 0.000001;
 }
 #else
 #define XA_PROFILE_START(var)
@@ -241,10 +246,8 @@ static double clockToSeconds(clock_t c)
 #define XA_PROFILE_ALLOC 0
 #endif
 
-struct MemTag
-{
-	enum
-	{
+struct MemTag {
+	enum {
 		Default,
 		BitImage,
 		BVH,
@@ -267,8 +270,7 @@ struct MemTag
 };
 
 #if XA_DEBUG_HEAP
-struct AllocHeader
-{
+struct AllocHeader {
 	size_t size;
 	const char *file;
 	int line;
@@ -281,11 +283,10 @@ struct AllocHeader
 static std::mutex s_allocMutex;
 static AllocHeader *s_allocRoot = nullptr;
 static size_t s_allocTotalCount = 0, s_allocTotalSize = 0, s_allocPeakSize = 0, s_allocCount[MemTag::Count] = { 0 }, s_allocTotalTagSize[MemTag::Count] = { 0 }, s_allocPeakTagSize[MemTag::Count] = { 0 };
-static uint32_t s_allocId =0 ;
+static uint32_t s_allocId = 0;
 static constexpr uint32_t kAllocRedzone = 0x12345678;
 
-static void *Realloc(void *ptr, size_t size, int tag, const char *file, int line)
-{
+static void *Realloc(void *ptr, size_t size, int tag, const char *file, int line) {
 	std::unique_lock<std::mutex> lock(s_allocMutex);
 	if (!size && !ptr)
 		return nullptr;
@@ -346,8 +347,7 @@ static void *Realloc(void *ptr, size_t size, int tag, const char *file, int line
 	return newPtr + sizeof(AllocHeader);
 }
 
-static void ReportLeaks()
-{
+static void ReportLeaks() {
 	printf("Checking for memory leaks...\n");
 	bool anyLeaks = false;
 	AllocHeader *header = s_allocRoot;
@@ -375,8 +375,7 @@ static void ReportLeaks()
 		s_allocTotalTagSize[i] = s_allocPeakTagSize[i] = 0;
 }
 
-static void PrintMemoryUsage()
-{
+static void PrintMemoryUsage() {
 	XA_PRINT("Total allocations: %zu\n", s_allocTotalCount);
 	XA_PRINT("Memory usage: %0.2fMB current, %0.2fMB peak\n", internal::s_allocTotalSize / 1024.0f / 1024.0f, internal::s_allocPeakSize / 1024.0f / 1024.0f);
 	static const char *labels[] = { // Sync with MemTag
@@ -405,8 +404,7 @@ static void PrintMemoryUsage()
 
 #define XA_PRINT_MEM_USAGE internal::PrintMemoryUsage();
 #else
-static void *Realloc(void *ptr, size_t size, int /*tag*/, const char * /*file*/, int /*line*/)
-{
+static void *Realloc(void *ptr, size_t size, int /*tag*/, const char * /*file*/, int /*line*/) {
 	if (size == 0 && !ptr)
 		return nullptr;
 	if (size == 0 && s_free) {
@@ -432,89 +430,75 @@ static constexpr float kEpsilon = 0.0001f;
 static constexpr float kAreaEpsilon = FLT_EPSILON;
 static constexpr float kNormalEpsilon = 0.001f;
 
-static int align(int x, int a)
-{
+static int align(int x, int a) {
 	return (x + a - 1) & ~(a - 1);
 }
 
 template <typename T>
-static T max(const T &a, const T &b)
-{
+static T max(const T &a, const T &b) {
 	return a > b ? a : b;
 }
 
 template <typename T>
-static T min(const T &a, const T &b)
-{
+static T min(const T &a, const T &b) {
 	return a < b ? a : b;
 }
 
 template <typename T>
-static T max3(const T &a, const T &b, const T &c)
-{
+static T max3(const T &a, const T &b, const T &c) {
 	return max(a, max(b, c));
 }
 
 /// Return the maximum of the three arguments.
 template <typename T>
-static T min3(const T &a, const T &b, const T &c)
-{
+static T min3(const T &a, const T &b, const T &c) {
 	return min(a, min(b, c));
 }
 
 /// Clamp between two values.
 template <typename T>
-static T clamp(const T &x, const T &a, const T &b)
-{
+static T clamp(const T &x, const T &a, const T &b) {
 	return min(max(x, a), b);
 }
 
 template <typename T>
-static void swap(T &a, T &b)
-{
+static void swap(T &a, T &b) {
 	T temp = a;
 	a = b;
 	b = temp;
 }
 
-union FloatUint32
-{
+union FloatUint32 {
 	float f;
 	uint32_t u;
 };
 
-static bool isFinite(float f)
-{
+static bool isFinite(float f) {
 	FloatUint32 fu;
 	fu.f = f;
 	return fu.u != 0x7F800000u && fu.u != 0x7F800001u;
 }
 
-static bool isNan(float f)
-{
+static bool isNan(float f) {
 	return f != f;
 }
 
 // Robust floating point comparisons:
 // http://realtimecollisiondetection.net/blog/?p=89
-static bool equal(const float f0, const float f1, const float epsilon)
-{
+static bool equal(const float f0, const float f1, const float epsilon) {
 	//return fabs(f0-f1) <= epsilon;
 	return fabs(f0 - f1) <= epsilon * max3(1.0f, fabsf(f0), fabsf(f1));
 }
 
-static int ftoi_ceil(float val)
-{
+static int ftoi_ceil(float val) {
 	return (int)ceilf(val);
 }
 
-static bool isZero(const float f, const float epsilon)
-{
+static bool isZero(const float f, const float epsilon) {
 	return fabs(f) <= epsilon;
 }
 
-static float square(float f)
-{
+static float square(float f) {
 	return f * f;
 }
 
@@ -524,9 +508,8 @@ static float square(float f)
 * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
 * @note nextPowerOfTwo(x) = 2 << log2(x-1)
 */
-static uint32_t nextPowerOfTwo(uint32_t x)
-{
-	XA_DEBUG_ASSERT( x != 0 );
+static uint32_t nextPowerOfTwo(uint32_t x) {
+	XA_DEBUG_ASSERT(x != 0);
 	// On modern CPUs this is supposed to be as fast as using the bsr instruction.
 	x--;
 	x |= x >> 1;
@@ -537,38 +520,34 @@ static uint32_t nextPowerOfTwo(uint32_t x)
 	return x + 1;
 }
 
-class Vector2
-{
+class Vector2 {
 public:
 	Vector2() {}
-	explicit Vector2(float f) : x(f), y(f) {}
-	Vector2(float x, float y): x(x), y(y) {}
+	explicit Vector2(float f) :
+			x(f), y(f) {}
+	Vector2(float _x, float _y) :
+			x(_x), y(_y) {}
 
-	Vector2 operator-() const
-	{
+	Vector2 operator-() const {
 		return Vector2(-x, -y);
 	}
 
-	void operator+=(const Vector2 &v)
-	{
+	void operator+=(const Vector2 &v) {
 		x += v.x;
 		y += v.y;
 	}
 
-	void operator-=(const Vector2 &v)
-	{
+	void operator-=(const Vector2 &v) {
 		x -= v.x;
 		y -= v.y;
 	}
 
-	void operator*=(float s)
-	{
+	void operator*=(float s) {
 		x *= s;
 		y *= s;
 	}
 
-	void operator*=(const Vector2 &v)
-	{
+	void operator*=(const Vector2 &v) {
 		x *= v.x;
 		y *= v.y;
 	}
@@ -576,13 +555,11 @@ public:
 	float x, y;
 };
 
-static bool operator==(const Vector2 &a, const Vector2 &b)
-{
+static bool operator==(const Vector2 &a, const Vector2 &b) {
 	return a.x == b.x && a.y == b.y;
 }
 
-static bool operator!=(const Vector2 &a, const Vector2 &b)
-{
+static bool operator!=(const Vector2 &a, const Vector2 &b) {
 	return a.x != b.x || a.y != b.y;
 }
 
@@ -591,78 +568,64 @@ static bool operator!=(const Vector2 &a, const Vector2 &b)
 	return Vector2(a.x + b.x, a.y + b.y);
 }*/
 
-static Vector2 operator-(const Vector2 &a, const Vector2 &b)
-{
+static Vector2 operator-(const Vector2 &a, const Vector2 &b) {
 	return Vector2(a.x - b.x, a.y - b.y);
 }
 
-static Vector2 operator*(const Vector2 &v, float s)
-{
+static Vector2 operator*(const Vector2 &v, float s) {
 	return Vector2(v.x * s, v.y * s);
 }
 
-static float dot(const Vector2 &a, const Vector2 &b)
-{
+static float dot(const Vector2 &a, const Vector2 &b) {
 	return a.x * b.x + a.y * b.y;
 }
 
-static float lengthSquared(const Vector2 &v)
-{
+static float lengthSquared(const Vector2 &v) {
 	return v.x * v.x + v.y * v.y;
 }
 
-static float length(const Vector2 &v)
-{
+static float length(const Vector2 &v) {
 	return sqrtf(lengthSquared(v));
 }
 
 #if XA_DEBUG
-static bool isNormalized(const Vector2 &v, float epsilon = kNormalEpsilon)
-{
+static bool isNormalized(const Vector2 &v, float epsilon = kNormalEpsilon) {
 	return equal(length(v), 1, epsilon);
 }
 #endif
 
-static Vector2 normalize(const Vector2 &v, float epsilon)
-{
-	float l = length(v);
-	XA_DEBUG_ASSERT(!isZero(l, epsilon));
-	XA_UNUSED(epsilon);
-	Vector2 n = v * (1.0f / l);
+static Vector2 normalize(const Vector2 &v) {
+	const float l = length(v);
+	XA_DEBUG_ASSERT(l > 0.0f); // Never negative.
+	const Vector2 n = v * (1.0f / l);
 	XA_DEBUG_ASSERT(isNormalized(n));
 	return n;
 }
 
-static Vector2 normalizeSafe(const Vector2 &v, const Vector2 &fallback, float epsilon)
-{
-	float l = length(v);
-	if (isZero(l, epsilon))
-		return fallback;
-	return v * (1.0f / l);
+static Vector2 normalizeSafe(const Vector2 &v, const Vector2 &fallback) {
+	const float l = length(v);
+	if (l > 0.0f) // Never negative.
+		return v * (1.0f / l);
+	return fallback;
 }
 
-static bool equal(const Vector2 &v1, const Vector2 &v2, float epsilon)
-{
+static bool equal(const Vector2 &v1, const Vector2 &v2, float epsilon) {
 	return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
 }
 
-static Vector2 min(const Vector2 &a, const Vector2 &b)
-{
+static Vector2 min(const Vector2 &a, const Vector2 &b) {
 	return Vector2(min(a.x, b.x), min(a.y, b.y));
 }
 
-static Vector2 max(const Vector2 &a, const Vector2 &b)
-{
+static Vector2 max(const Vector2 &a, const Vector2 &b) {
 	return Vector2(max(a.x, b.x), max(a.y, b.y));
 }
 
-static bool isFinite(const Vector2 &v)
-{
+static bool isFinite(const Vector2 &v) {
 	return isFinite(v.x) && isFinite(v.y);
 }
 
-static float triangleArea(const Vector2 &a, const Vector2 &b, const Vector2 &c)
-{
+static float triangleArea(const Vector2 &a, const Vector2 &b, const Vector2 &c) {
 	// IC: While it may be appealing to use the following expression:
 	//return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y) * 0.5f;
 	// That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point
@@ -676,8 +639,7 @@ static float triangleArea(const Vector2 &a, const Vector2 &b, const Vector2 &c)
 	return (v0.x * v1.y - v0.y * v1.x) * 0.5f;
 }
 
-static bool linesIntersect(const Vector2 &a1, const Vector2 &a2, const Vector2 &b1, const Vector2 &b2, float epsilon)
-{
+static bool linesIntersect(const Vector2 &a1, const Vector2 &a2, const Vector2 &b1, const Vector2 &b2, float epsilon) {
 	const Vector2 v0 = a2 - a1;
 	const Vector2 v1 = b2 - b1;
 	const float denom = -v1.x * v0.y + v0.x * v1.y;
@@ -685,76 +647,70 @@ static bool linesIntersect(const Vector2 &a1, const Vector2 &a2, const Vector2 &
 		return false;
 	const float s = (-v0.y * (a1.x - b1.x) + v0.x * (a1.y - b1.y)) / denom;
 	if (s > epsilon && s < 1.0f - epsilon) {
-		const float t = ( v1.x * (a1.y - b1.y) - v1.y * (a1.x - b1.x)) / denom;
+		const float t = (v1.x * (a1.y - b1.y) - v1.y * (a1.x - b1.x)) / denom;
 		return t > epsilon && t < 1.0f - epsilon;
 	}
 	return false;
 }
 
-struct Vector2i
-{
+struct Vector2i {
 	Vector2i() {}
-	Vector2i(int32_t x, int32_t y) : x(x), y(y) {}
+	Vector2i(int32_t _x, int32_t _y) :
+			x(_x), y(_y) {}
 
 	int32_t x, y;
 };
 
-class Vector3
-{
+class Vector3 {
 public:
 	Vector3() {}
-	explicit Vector3(float f) : x(f), y(f), z(f) {}
-	Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
-	Vector3(const Vector2 &v, float z) : x(v.x), y(v.y), z(z) {}
-
-	Vector2 xy() const
-	{
+	explicit Vector3(float f) :
+			x(f), y(f), z(f) {}
+	Vector3(float _x, float _y, float _z) :
+			x(_x), y(_y), z(_z) {}
+	Vector3(const Vector2 &v, float _z) :
+			x(v.x), y(v.y), z(_z) {}
+
+	Vector2 xy() const {
 		return Vector2(x, y);
 	}
 
-	Vector3 operator-() const
-	{
+	Vector3 operator-() const {
 		return Vector3(-x, -y, -z);
 	}
 
-	void operator+=(const Vector3 &v)
-	{
+	void operator+=(const Vector3 &v) {
 		x += v.x;
 		y += v.y;
 		z += v.z;
 	}
 
-	void operator-=(const Vector3 &v)
-	{
+	void operator-=(const Vector3 &v) {
 		x -= v.x;
 		y -= v.y;
 		z -= v.z;
 	}
 
-	void operator*=(float s)
-	{
+	void operator*=(float s) {
 		x *= s;
 		y *= s;
 		z *= s;
 	}
 
-	void operator/=(float s)
-	{
+	void operator/=(float s) {
 		float is = 1.0f / s;
 		x *= is;
 		y *= is;
 		z *= is;
 	}
 
-	void operator*=(const Vector3 &v)
-	{
+	void operator*=(const Vector3 &v) {
 		x *= v.x;
 		y *= v.y;
 		z *= v.z;
 	}
 
-	void operator/=(const Vector3 &v)
-	{
+	void operator/=(const Vector3 &v) {
 		x /= v.x;
 		y /= v.y;
 		z /= v.z;
@@ -763,260 +719,151 @@ public:
 	float x, y, z;
 };
 
-static Vector3 operator+(const Vector3 &a, const Vector3 &b)
-{
+static Vector3 operator+(const Vector3 &a, const Vector3 &b) {
 	return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
 }
 
-static Vector3 operator-(const Vector3 &a, const Vector3 &b)
-{
+static Vector3 operator-(const Vector3 &a, const Vector3 &b) {
 	return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
 }
 
-static Vector3 cross(const Vector3 &a, const Vector3 &b)
-{
+static bool operator==(const Vector3 &a, const Vector3 &b) {
+	return a.x == b.x && a.y == b.y && a.z == b.z;
+}
+
+static Vector3 cross(const Vector3 &a, const Vector3 &b) {
 	return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
 }
 
-static Vector3 operator*(const Vector3 &v, float s)
-{
+static Vector3 operator*(const Vector3 &v, float s) {
 	return Vector3(v.x * s, v.y * s, v.z * s);
 }
 
-static Vector3 operator/(const Vector3 &v, float s)
-{
+static Vector3 operator/(const Vector3 &v, float s) {
 	return v * (1.0f / s);
 }
 
-static float dot(const Vector3 &a, const Vector3 &b)
-{
+static float dot(const Vector3 &a, const Vector3 &b) {
 	return a.x * b.x + a.y * b.y + a.z * b.z;
 }
 
-static float lengthSquared(const Vector3 &v)
-{
+static float lengthSquared(const Vector3 &v) {
 	return v.x * v.x + v.y * v.y + v.z * v.z;
 }
 
-static float length(const Vector3 &v)
-{
+static float length(const Vector3 &v) {
 	return sqrtf(lengthSquared(v));
 }
 
-static bool isNormalized(const Vector3 &v, float epsilon = kNormalEpsilon)
-{
-	return equal(length(v), 1, epsilon);
+static bool isNormalized(const Vector3 &v, float epsilon = kNormalEpsilon) {
+	return equal(length(v), 1.0f, epsilon);
 }
 
-static Vector3 normalize(const Vector3 &v, float epsilon)
-{
-	float l = length(v);
-	XA_DEBUG_ASSERT(!isZero(l, epsilon));
-	XA_UNUSED(epsilon);
-	Vector3 n = v * (1.0f / l);
+static Vector3 normalize(const Vector3 &v) {
+	const float l = length(v);
+	XA_DEBUG_ASSERT(l > 0.0f); // Never negative.
+	const Vector3 n = v * (1.0f / l);
 	XA_DEBUG_ASSERT(isNormalized(n));
 	return n;
 }
 
-static Vector3 normalizeSafe(const Vector3 &v, const Vector3 &fallback, float epsilon)
-{
-	float l = length(v);
-	if (isZero(l, epsilon)) {
-		return fallback;
-	}
-	return v * (1.0f / l);
+static Vector3 normalizeSafe(const Vector3 &v, const Vector3 &fallback) {
+	const float l = length(v);
+	if (l > 0.0f) // Never negative.
+		return v * (1.0f / l);
+	return fallback;
 }
 
-static bool equal(const Vector3 &v0, const Vector3 &v1, float epsilon)
-{
+static bool equal(const Vector3 &v0, const Vector3 &v1, float epsilon) {
 	return fabs(v0.x - v1.x) <= epsilon && fabs(v0.y - v1.y) <= epsilon && fabs(v0.z - v1.z) <= epsilon;
 }
 
-static Vector3 min(const Vector3 &a, const Vector3 &b)
-{
+static Vector3 min(const Vector3 &a, const Vector3 &b) {
 	return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
 }
 
-static Vector3 max(const Vector3 &a, const Vector3 &b)
-{
+static Vector3 max(const Vector3 &a, const Vector3 &b) {
 	return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
 }
 
 #if XA_DEBUG
-bool isFinite(const Vector3 &v)
-{
+bool isFinite(const Vector3 &v) {
 	return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
 }
 #endif
 
-struct Extents2
-{
+struct Extents2 {
 	Vector2 min, max;
 
 	Extents2() {}
-	
-	Extents2(Vector2 p1, Vector2 p2)
-	{
+
+	Extents2(Vector2 p1, Vector2 p2) {
 		min = xatlas::internal::min(p1, p2);
 		max = xatlas::internal::max(p1, p2);
 	}
 
-	void reset()
-	{
+	void reset() {
 		min.x = min.y = FLT_MAX;
 		max.x = max.y = -FLT_MAX;
 	}
 
-	void add(Vector2 p)
-	{
+	void add(Vector2 p) {
 		min = xatlas::internal::min(min, p);
 		max = xatlas::internal::max(max, p);
 	}
 
-	Vector2 midpoint() const
-	{
+	Vector2 midpoint() const {
 		return Vector2(min.x + (max.x - min.x) * 0.5f, min.y + (max.y - min.y) * 0.5f);
 	}
 
-	static bool intersect(const Extents2 &e1, const Extents2 &e2)
-	{
+	static bool intersect(const Extents2 &e1, const Extents2 &e2) {
 		return e1.min.x <= e2.max.x && e1.max.x >= e2.min.x && e1.min.y <= e2.max.y && e1.max.y >= e2.min.y;
 	}
 };
 
-struct Plane
-{
-	Plane() = default;
-	
-	Plane(const Vector3 &p1, const Vector3 &p2, const Vector3 &p3)
-	{
-		normal = cross(p2 - p1, p3 - p1);
-		dist = dot(normal, p1);
-	}
-
-	float distance(const Vector3 &p) const
-	{
-		return dot(normal, p) - dist;
-	}
-
-	void normalize()
-	{
-		const float len = length(normal);
-		if (len > 0.0f) {
-			const float il = 1.0f / len;
-			normal *= il;
-			dist *= il;
-		}
-	}
-
-	Vector3 normal;
-	float dist;
-};
-
-static bool lineIntersectsPoint(const Vector3 &point, const Vector3 &lineStart, const Vector3 &lineEnd, float *t, float epsilon)
-{
-	float tt;
-	if (!t)
-		t = &tt;
-	*t = 0.0f;
-	if (equal(lineStart, point, epsilon) || equal(lineEnd, point, epsilon))
-		return false; // Vertex lies on either line vertices.
-	const Vector3 v01 = point - lineStart;
-	const Vector3 v21 = lineEnd - lineStart;
-	const float l = length(v21);
-	const float d = length(cross(v01, v21)) / l;
-	if (!isZero(d, epsilon))
-		return false;
-	*t = dot(v01, v21) / (l * l);
-	return *t > kEpsilon && *t < 1.0f - kEpsilon;
-}
-
-static bool sameSide(const Vector3 &p1, const Vector3 &p2, const Vector3 &a, const Vector3 &b)
-{
-	const Vector3 &ab = b - a;
-	return dot(cross(ab, p1 - a), cross(ab, p2 - a)) >= 0.0f;
-}
-
-// http://blackpawn.com/texts/pointinpoly/default.html
-static bool pointInTriangle(const Vector3 &p, const Vector3 &a, const Vector3 &b, const Vector3 &c)
-{
-	return sameSide(p, a, b, c) && sameSide(p, b, a, c) && sameSide(p, c, a, b);
-}
-
-#if XA_CLOSE_HOLES_CHECK_EDGE_INTERSECTION
-// https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm
-static bool rayIntersectsTriangle(const Vector3 &rayOrigin, const Vector3 &rayDir, const Vector3 *tri, float *t)
-{
-	*t = 0.0f;
-	const Vector3 &edge1 = tri[1] - tri[0];
-	const Vector3 &edge2 = tri[2] - tri[0];
-	const Vector3 h = cross(rayDir, edge2);
-	const float a = dot(edge1, h);
-	if (a > -kEpsilon && a < kEpsilon)
-		return false; // This ray is parallel to this triangle.
-	const float f = 1.0f / a;
-	const Vector3 s = rayOrigin - tri[0];
-	const float u = f * dot(s, h);
-	if (u < 0.0f || u > 1.0f)
-		return false;
-	const Vector3 q = cross(s, edge1);
-	const float v = f * dot(rayDir, q);
-	if (v < 0.0f || u + v > 1.0f)
-		return false;
-	// At this stage we can compute t to find out where the intersection point is on the line.
-	*t = f * dot(edge2, q);
-	if (*t > kEpsilon && *t < 1.0f - kEpsilon)
-		return true;
-	// This means that there is a line intersection but not a ray intersection.
-	return false;
-}
-#endif
-
 // From Fast-BVH
-struct AABB
-{
-	AABB() : min(FLT_MAX, FLT_MAX, FLT_MAX), max(-FLT_MAX, -FLT_MAX, -FLT_MAX) {}
-	AABB(const Vector3 &min, const Vector3 &max) : min(min), max(max) { }
-	AABB(const Vector3 &p, float radius = 0.0f) : min(p), max(p) { if (radius > 0.0f) expand(radius); }
-
-	bool intersect(const AABB &other) const
-	{
+struct AABB {
+	AABB() :
+			min(FLT_MAX, FLT_MAX, FLT_MAX), max(-FLT_MAX, -FLT_MAX, -FLT_MAX) {}
+	AABB(const Vector3 &_min, const Vector3 &_max) :
+			min(_min), max(_max) {}
+	AABB(const Vector3 &p, float radius = 0.0f) :
+			min(p), max(p) {
+		if (radius > 0.0f)
+			expand(radius);
+	}
+
+	bool intersect(const AABB &other) const {
 		return min.x <= other.max.x && max.x >= other.min.x && min.y <= other.max.y && max.y >= other.min.y && min.z <= other.max.z && max.z >= other.min.z;
 	}
 
-	void expandToInclude(const Vector3 &p)
-	{
+	void expandToInclude(const Vector3 &p) {
 		min = internal::min(min, p);
 		max = internal::max(max, p);
 	}
 
-	void expandToInclude(const AABB &aabb)
-	{
+	void expandToInclude(const AABB &aabb) {
 		min = internal::min(min, aabb.min);
 		max = internal::max(max, aabb.max);
 	}
 
-	void expand(float amount)
-	{
+	void expand(float amount) {
 		min -= Vector3(amount);
 		max += Vector3(amount);
 	}
 
-	Vector3 centroid() const
-	{
+	Vector3 centroid() const {
 		return min + (max - min) * 0.5f;
 	}
 
-	uint32_t maxDimension() const
-	{
+	uint32_t maxDimension() const {
 		const Vector3 extent = max - min;
 		uint32_t result = 0;
 		if (extent.y > extent.x) {
 			result = 1;
 			if (extent.z > extent.y)
 				result = 2;
-		}
-		else if(extent.z > extent.x)
+		} else if (extent.z > extent.x)
 			result = 2;
 		return result;
 	}
@@ -1024,10 +871,9 @@ struct AABB
 	Vector3 min, max;
 };
 
-struct ArrayBase
-{
-	ArrayBase(uint32_t elementSize, int memTag = MemTag::Default) : buffer(nullptr), elementSize(elementSize), size(0), capacity(0)
-	{
+struct ArrayBase {
+	ArrayBase(uint32_t _elementSize, int memTag = MemTag::Default) :
+			buffer(nullptr), elementSize(_elementSize), size(0), capacity(0) {
 #if XA_DEBUG_HEAP
 		this->memTag = memTag;
 #else
@@ -1035,31 +881,31 @@ struct ArrayBase
 #endif
 	}
 
-	~ArrayBase()
-	{
+	~ArrayBase() {
 		XA_FREE(buffer);
 	}
 
-	XA_INLINE void clear()
-	{
+	XA_INLINE void clear() {
 		size = 0;
 	}
 
-	void copyFrom(const uint8_t *data, uint32_t length)
-	{
+	void copyFrom(const uint8_t *data, uint32_t length) {
+		XA_DEBUG_ASSERT(data);
+		XA_DEBUG_ASSERT(length > 0);
 		resize(length, true);
-		memcpy(buffer, data, length * elementSize);
+		if (buffer && data && length > 0)
+			memcpy(buffer, data, length * elementSize);
 	}
 
-	void copyTo(ArrayBase &other) const
-	{
+	void copyTo(ArrayBase &other) const {
 		XA_DEBUG_ASSERT(elementSize == other.elementSize);
+		XA_DEBUG_ASSERT(size > 0);
 		other.resize(size, true);
-		memcpy(other.buffer, buffer, size * elementSize);
+		if (other.buffer && buffer && size > 0)
+			memcpy(other.buffer, buffer, size * elementSize);
 	}
 
-	void destroy()
-	{
+	void destroy() {
 		size = 0;
 		XA_FREE(buffer);
 		buffer = nullptr;
@@ -1068,17 +914,18 @@ struct ArrayBase
 	}
 
 	// Insert the given element at the given index shifting all the elements up.
-	void insertAt(uint32_t index, const uint8_t *value)
-	{
+	void insertAt(uint32_t index, const uint8_t *value) {
 		XA_DEBUG_ASSERT(index >= 0 && index <= size);
+		XA_DEBUG_ASSERT(value);
 		resize(size + 1, false);
-		if (index < size - 1)
+		XA_DEBUG_ASSERT(buffer);
+		if (buffer && index < size - 1)
 			memmove(buffer + elementSize * (index + 1), buffer + elementSize * index, elementSize * (size - 1 - index));
-		memcpy(&buffer[index * elementSize], value, elementSize);
+		if (buffer && value)
+			memcpy(&buffer[index * elementSize], value, elementSize);
 	}
 
-	void moveTo(ArrayBase &other)
-	{
+	void moveTo(ArrayBase &other) {
 		XA_DEBUG_ASSERT(elementSize == other.elementSize);
 		other.destroy();
 		other.buffer = buffer;
@@ -1092,55 +939,61 @@ struct ArrayBase
 		elementSize = size = capacity = 0;
 	}
 
-	void pop_back()
-	{
+	void pop_back() {
 		XA_DEBUG_ASSERT(size > 0);
 		resize(size - 1, false);
 	}
 
-	void push_back(const uint8_t *value)
-	{
+	void push_back(const uint8_t *value) {
 		XA_DEBUG_ASSERT(value < buffer || value >= buffer + size);
+		XA_DEBUG_ASSERT(value);
 		resize(size + 1, false);
-		memcpy(&buffer[(size - 1) * elementSize], value, elementSize);
+		XA_DEBUG_ASSERT(buffer);
+		if (buffer && value)
+			memcpy(&buffer[(size - 1) * elementSize], value, elementSize);
 	}
 
-	void push_back(const ArrayBase &other)
-	{
+	void push_back(const ArrayBase &other) {
 		XA_DEBUG_ASSERT(elementSize == other.elementSize);
-		if (other.size == 0)
-			return;
-		const uint32_t oldSize = size;
-		resize(size + other.size, false);
-		memcpy(buffer + oldSize * elementSize, other.buffer, other.size * other.elementSize);
+		if (other.size > 0) {
+			const uint32_t oldSize = size;
+			resize(size + other.size, false);
+			XA_DEBUG_ASSERT(buffer);
+			if (buffer)
+				memcpy(buffer + oldSize * elementSize, other.buffer, other.size * other.elementSize);
+		}
 	}
 
 	// Remove the element at the given index. This is an expensive operation!
-	void removeAt(uint32_t index)
-	{
+	void removeAt(uint32_t index) {
 		XA_DEBUG_ASSERT(index >= 0 && index < size);
-		if (size != 1)
-			memmove(buffer + elementSize * index, buffer + elementSize * (index + 1), elementSize * (size - 1 - index));
-		size--;
+		XA_DEBUG_ASSERT(buffer);
+		if (buffer) {
+			if (size > 1)
+				memmove(buffer + elementSize * index, buffer + elementSize * (index + 1), elementSize * (size - 1 - index));
+			if (size > 0)
+				size--;
+		}
 	}
 
 	// Element at index is swapped with the last element, then the array length is decremented.
-	void removeAtFast(uint32_t index)
-	{
+	void removeAtFast(uint32_t index) {
 		XA_DEBUG_ASSERT(index >= 0 && index < size);
-		if (size != 1 && index != size - 1)
-			memcpy(buffer + elementSize * index, buffer + elementSize * (size - 1), elementSize);
-		size--;
+		XA_DEBUG_ASSERT(buffer);
+		if (buffer) {
+			if (size > 1 && index != size - 1)
+				memcpy(buffer + elementSize * index, buffer + elementSize * (size - 1), elementSize);
+			if (size > 0)
+				size--;
+		}
 	}
 
-	void reserve(uint32_t desiredSize)
-	{
+	void reserve(uint32_t desiredSize) {
 		if (desiredSize > capacity)
 			setArrayCapacity(desiredSize);
 	}
 
-	void resize(uint32_t newSize, bool exact)
-	{
+	void resize(uint32_t newSize, bool exact) {
 		size = newSize;
 		if (size > capacity) {
 			// First allocation is always exact. Otherwise, following allocations grow array to 150% of desired size.
@@ -1153,8 +1006,7 @@ struct ArrayBase
 		}
 	}
 
-	void setArrayCapacity(uint32_t newCapacity)
-	{
+	void setArrayCapacity(uint32_t newCapacity) {
 		XA_DEBUG_ASSERT(newCapacity >= size);
 		if (newCapacity == 0) {
 			// free the buffer.
@@ -1174,8 +1026,7 @@ struct ArrayBase
 	}
 
 #if XA_DEBUG_HEAP
-	void setMemTag(int _memTag)
-	{
+	void setMemTag(int _memTag) {
 		this->memTag = _memTag;
 	}
 #endif
@@ -1189,28 +1040,27 @@ struct ArrayBase
 #endif
 };
 
-template<typename T>
-class Array
-{
+template <typename T>
+class Array {
 public:
-	Array(int memTag = MemTag::Default) : m_base(sizeof(T), memTag) {}
-	Array(const Array&) = delete;
+	Array(int memTag = MemTag::Default) :
+			m_base(sizeof(T), memTag) {}
+	Array(const Array &) = delete;
 	Array &operator=(const Array &) = delete;
 
-	XA_INLINE const T &operator[](uint32_t index) const
-	{
+	XA_INLINE const T &operator[](uint32_t index) const {
 		XA_DEBUG_ASSERT(index < m_base.size);
+		XA_DEBUG_ASSERT(m_base.buffer);
 		return ((const T *)m_base.buffer)[index];
 	}
 
-	XA_INLINE T &operator[](uint32_t index)
-	{
+	XA_INLINE T &operator[](uint32_t index) {
 		XA_DEBUG_ASSERT(index < m_base.size);
+		XA_DEBUG_ASSERT(m_base.buffer);
 		return ((T *)m_base.buffer)[index];
 	}
 
-	XA_INLINE const T &back() const
-	{
+	XA_INLINE const T &back() const {
 		XA_DEBUG_ASSERT(!isEmpty());
 		return ((const T *)m_base.buffer)[m_base.size - 1];
 	}
@@ -1218,8 +1068,7 @@ public:
 	XA_INLINE T *begin() { return (T *)m_base.buffer; }
 	XA_INLINE void clear() { m_base.clear(); }
 
-	bool contains(const T &value) const
-	{
+	bool contains(const T &value) const {
 		for (uint32_t i = 0; i < m_base.size; i++) {
 			if (((const T *)m_base.buffer)[i] == value)
 				return true;
@@ -1244,28 +1093,25 @@ public:
 	void reserve(uint32_t desiredSize) { m_base.reserve(desiredSize); }
 	void resize(uint32_t newSize) { m_base.resize(newSize, true); }
 
-	void runCtors()
-	{
+	void runCtors() {
 		for (uint32_t i = 0; i < m_base.size; i++)
 			new (&((T *)m_base.buffer)[i]) T;
 	}
 
-	void runDtors()
-	{
+	void runDtors() {
 		for (uint32_t i = 0; i < m_base.size; i++)
 			((T *)m_base.buffer)[i].~T();
 	}
 
-	void fill(const T &value)
-	{
+	void fill(const T &value) {
 		auto buffer = (T *)m_base.buffer;
 		for (uint32_t i = 0; i < m_base.size; i++)
 			buffer[i] = value;
 	}
 
-	void fillBytes(uint8_t value)
-	{
-		memset(m_base.buffer, (int)value, m_base.size * m_base.elementSize);
+	void fillBytes(uint8_t value) {
+		if (m_base.buffer && m_base.size > 0)
+			memset(m_base.buffer, (int)value, m_base.size * m_base.elementSize);
 	}
 
 #if XA_DEBUG_HEAP
@@ -1273,41 +1119,67 @@ public:
 #endif
 
 	XA_INLINE uint32_t size() const { return m_base.size; }
-	XA_INLINE void zeroOutMemory() { memset(m_base.buffer, 0, m_base.elementSize * m_base.size); }
+
+	XA_INLINE void zeroOutMemory() {
+		if (m_base.buffer && m_base.size > 0)
+			memset(m_base.buffer, 0, m_base.elementSize * m_base.size);
+	}
 
 private:
 	ArrayBase m_base;
 };
 
-template<typename T>
-struct ArrayView
-{
-	ArrayView() : data(nullptr), length(0) {}
-	ArrayView(Array<T> &a) : data(a.data()), length(a.size()) {}
-	ArrayView(T *data, uint32_t length) : data(data), length(length) {}
-	ArrayView &operator=(Array<T> &a) { data = a.data(); length = a.size(); return *this; }
-	XA_INLINE const T &operator[](uint32_t index) const { XA_DEBUG_ASSERT(index < length); return data[index]; }
+template <typename T>
+struct ArrayView {
+	ArrayView() :
+			data(nullptr), length(0) {}
+	ArrayView(Array<T> &a) :
+			data(a.data()), length(a.size()) {}
+	ArrayView(T *_data, uint32_t _length) :
+			data(_data), length(_length) {}
+	ArrayView &operator=(Array<T> &a) {
+		data = a.data();
+		length = a.size();
+		return *this;
+	}
+	XA_INLINE const T &operator[](uint32_t index) const {
+		XA_DEBUG_ASSERT(index < length);
+		return data[index];
+	}
+	XA_INLINE T &operator[](uint32_t index) {
+		XA_DEBUG_ASSERT(index < length);
+		return data[index];
+	}
 	T *data;
 	uint32_t length;
 };
 
-template<typename T>
-struct ConstArrayView
-{
-	ConstArrayView() : data(nullptr), length(0) {}
-	ConstArrayView(const Array<T> &a) : data(a.data()), length(a.size()) {}
-	ConstArrayView(const T *data, uint32_t length) : data(data), length(length) {}
-	ConstArrayView &operator=(const Array<T> &a) { data = a.data(); length = a.size(); return *this; }
-	XA_INLINE const T &operator[](uint32_t index) const { XA_DEBUG_ASSERT(index < length); return data[index]; }
+template <typename T>
+struct ConstArrayView {
+	ConstArrayView() :
+			data(nullptr), length(0) {}
+	ConstArrayView(const Array<T> &a) :
+			data(a.data()), length(a.size()) {}
+	ConstArrayView(ArrayView<T> av) :
+			data(av.data), length(av.length) {}
+	ConstArrayView(const T *_data, uint32_t _length) :
+			data(_data), length(_length) {}
+	ConstArrayView &operator=(const Array<T> &a) {
+		data = a.data();
+		length = a.size();
+		return *this;
+	}
+	XA_INLINE const T &operator[](uint32_t index) const {
+		XA_DEBUG_ASSERT(index < length);
+		return data[index];
+	}
 	const T *data;
 	uint32_t length;
 };
 
 /// Basis class to compute tangent space basis, ortogonalizations and to transform vectors from one space to another.
-struct Basis
-{
-	XA_NODISCARD static Vector3 computeTangent(const Vector3 &normal)
-	{
+struct Basis {
+	XA_NODISCARD static Vector3 computeTangent(const Vector3 &normal) {
 		XA_ASSERT(isNormalized(normal));
 		// Choose minimum axis.
 		Vector3 tangent;
@@ -1319,12 +1191,11 @@ struct Basis
 			tangent = Vector3(0, 0, 1);
 		// Ortogonalize
 		tangent -= normal * dot(normal, tangent);
-		tangent = normalize(tangent, kEpsilon);
+		tangent = normalize(tangent);
 		return tangent;
 	}
 
-	XA_NODISCARD static Vector3 computeBitangent(const Vector3 &normal, const Vector3 &tangent)
-	{
+	XA_NODISCARD static Vector3 computeBitangent(const Vector3 &normal, const Vector3 &tangent) {
 		return cross(normal, tangent);
 	}
 
@@ -1334,42 +1205,36 @@ struct Basis
 };
 
 // Simple bit array.
-class BitArray
-{
+class BitArray {
 public:
-	BitArray() : m_size(0) {}
+	BitArray() :
+			m_size(0) {}
 
-	BitArray(uint32_t sz)
-	{
+	BitArray(uint32_t sz) {
 		resize(sz);
 	}
 
-	void resize(uint32_t new_size)
-	{
+	void resize(uint32_t new_size) {
 		m_size = new_size;
 		m_wordArray.resize((m_size + 31) >> 5);
 	}
 
-	bool get(uint32_t index) const
-	{
+	bool get(uint32_t index) const {
 		XA_DEBUG_ASSERT(index < m_size);
 		return (m_wordArray[index >> 5] & (1 << (index & 31))) != 0;
 	}
 
-	void set(uint32_t index)
-	{
+	void set(uint32_t index) {
 		XA_DEBUG_ASSERT(index < m_size);
 		m_wordArray[index >> 5] |= (1 << (index & 31));
 	}
 
-	void unset(uint32_t index)
-	{
+	void unset(uint32_t index) {
 		XA_DEBUG_ASSERT(index < m_size);
 		m_wordArray[index >> 5] &= ~(1 << (index & 31));
 	}
 
-	void zeroOutMemory()
-	{
+	void zeroOutMemory() {
 		m_wordArray.zeroOutMemory();
 	}
 
@@ -1378,13 +1243,13 @@ private:
 	Array<uint32_t> m_wordArray;
 };
 
-class BitImage
-{
+class BitImage {
 public:
-	BitImage() : m_width(0), m_height(0), m_rowStride(0), m_data(MemTag::BitImage) {}
+	BitImage() :
+			m_width(0), m_height(0), m_rowStride(0), m_data(MemTag::BitImage) {}
 
-	BitImage(uint32_t w, uint32_t h) : m_width(w), m_height(h), m_data(MemTag::BitImage)
-	{
+	BitImage(uint32_t w, uint32_t h) :
+			m_width(w), m_height(h), m_data(MemTag::BitImage) {
 		m_rowStride = (m_width + 63) >> 6;
 		m_data.resize(m_rowStride * m_height);
 		m_data.zeroOutMemory();
@@ -1395,16 +1260,14 @@ public:
 	uint32_t width() const { return m_width; }
 	uint32_t height() const { return m_height; }
 
-	void copyTo(BitImage &other)
-	{
+	void copyTo(BitImage &other) {
 		other.m_width = m_width;
 		other.m_height = m_height;
 		other.m_rowStride = m_rowStride;
 		m_data.copyTo(other.m_data);
 	}
 
-	void resize(uint32_t w, uint32_t h, bool discard)
-	{
+	void resize(uint32_t w, uint32_t h, bool discard) {
 		const uint32_t rowStride = (w + 63) >> 6;
 		if (discard) {
 			m_data.resize(rowStride * h);
@@ -1428,28 +1291,24 @@ public:
 		m_rowStride = rowStride;
 	}
 
-	bool get(uint32_t x, uint32_t y) const
-	{
+	bool get(uint32_t x, uint32_t y) const {
 		XA_DEBUG_ASSERT(x < m_width && y < m_height);
 		const uint32_t index = (x >> 6) + y * m_rowStride;
 		return (m_data[index] & (UINT64_C(1) << (uint64_t(x) & UINT64_C(63)))) != 0;
 	}
 
-	void set(uint32_t x, uint32_t y)
-	{
+	void set(uint32_t x, uint32_t y) {
 		XA_DEBUG_ASSERT(x < m_width && y < m_height);
 		const uint32_t index = (x >> 6) + y * m_rowStride;
 		m_data[index] |= UINT64_C(1) << (uint64_t(x) & UINT64_C(63));
 		XA_DEBUG_ASSERT(get(x, y));
 	}
 
-	void zeroOutMemory()
-	{
+	void zeroOutMemory() {
 		m_data.zeroOutMemory();
 	}
 
-	bool canBlit(const BitImage &image, uint32_t offsetX, uint32_t offsetY) const
-	{
+	bool canBlit(const BitImage &image, uint32_t offsetX, uint32_t offsetY) const {
 		for (uint32_t y = 0; y < image.m_height; y++) {
 			const uint32_t thisY = y + offsetY;
 			if (thisY >= m_height)
@@ -1473,8 +1332,7 @@ public:
 		return true;
 	}
 
-	void dilate(uint32_t padding)
-	{
+	void dilate(uint32_t padding) {
 		BitImage tmp(m_width, m_height);
 		for (uint32_t p = 0; p < padding; p++) {
 			tmp.zeroOutMemory();
@@ -1484,15 +1342,21 @@ public:
 					if (!b) {
 						if (x > 0) {
 							b |= get(x - 1, y);
-							if (y > 0) b |= get(x - 1, y - 1);
-							if (y < m_height - 1) b |= get(x - 1, y + 1);
+							if (y > 0)
+								b |= get(x - 1, y - 1);
+							if (y < m_height - 1)
+								b |= get(x - 1, y + 1);
 						}
-						if (y > 0) b |= get(x, y - 1);
-						if (y < m_height - 1) b |= get(x, y + 1);
+						if (y > 0)
+							b |= get(x, y - 1);
+						if (y < m_height - 1)
+							b |= get(x, y + 1);
 						if (x < m_width - 1) {
 							b |= get(x + 1, y);
-							if (y > 0) b |= get(x + 1, y - 1);
-							if (y < m_height - 1) b |= get(x + 1, y + 1);
+							if (y > 0)
+								b |= get(x + 1, y - 1);
+							if (y < m_height - 1)
+								b |= get(x + 1, y + 1);
 						}
 					}
 					if (b)
@@ -1511,11 +1375,10 @@ private:
 };
 
 // From Fast-BVH
-class BVH
-{
+class BVH {
 public:
-	BVH(const Array<AABB> &objectAabbs, uint32_t leafSize = 4) : m_objectIds(MemTag::BVH), m_nodes(MemTag::BVH)
-	{
+	BVH(const Array<AABB> &objectAabbs, uint32_t leafSize = 4) :
+			m_objectIds(MemTag::BVH), m_nodes(MemTag::BVH) {
 		m_objectAabbs = &objectAabbs;
 		if (m_objectAabbs->isEmpty())
 			return;
@@ -1535,7 +1398,7 @@ public:
 		Node node;
 		m_nodes.reserve(objectAabbs.size() * 2);
 		uint32_t nNodes = 0;
-		while(stackptr > 0) {
+		while (stackptr > 0) {
 			// Pop the next item off of the stack
 			const BuildEntry &bnode = todo[--stackptr];
 			const uint32_t start = bnode.start;
@@ -1548,7 +1411,7 @@ public:
 			// Calculate the bounding box for this node
 			AABB bb(objectAabbs[m_objectIds[start]]);
 			AABB bc(objectAabbs[m_objectIds[start]].centroid());
-			for(uint32_t p = start + 1; p < end; ++p) {
+			for (uint32_t p = start + 1; p < end; ++p) {
 				bb.expandToInclude(objectAabbs[m_objectIds[p]]);
 				bc.expandToInclude(objectAabbs[m_objectIds[p]].centroid());
 			}
@@ -1564,7 +1427,7 @@ public:
 				m_nodes[bnode.parent].rightOffset--;
 				// When this is the second touch, this is the right child.
 				// The right child sets up the offset for the flat tree.
-				if (m_nodes[bnode.parent].rightOffset == kTouchedTwice )
+				if (m_nodes[bnode.parent].rightOffset == kTouchedTwice)
 					m_nodes[bnode.parent].rightOffset = nNodes - 1 - bnode.parent;
 			}
 			// If this is a leaf, no need to subdivide.
@@ -1599,21 +1462,20 @@ public:
 		}
 	}
 
-	void query(const AABB &queryAabb, Array<uint32_t> &result) const
-	{
+	void query(const AABB &queryAabb, Array<uint32_t> &result) const {
 		result.clear();
 		// Working set
 		uint32_t todo[64];
 		int32_t stackptr = 0;
 		// "Push" on the root node to the working set
 		todo[stackptr] = 0;
-		while(stackptr >= 0) {
+		while (stackptr >= 0) {
 			// Pop off the next node to work on.
 			const int ni = todo[stackptr--];
 			const Node &node = m_nodes[ni];
 			// Is leaf -> Intersect
 			if (node.rightOffset == 0) {
-				for(uint32_t o = 0; o < node.nPrims; ++o) {
+				for (uint32_t o = 0; o < node.nPrims; ++o) {
 					const uint32_t obj = node.start + o;
 					if (queryAabb.intersect((*m_objectAabbs)[m_objectIds[obj]]))
 						result.push_back(m_objectIds[obj]);
@@ -1630,14 +1492,12 @@ public:
 	}
 
 private:
-	struct BuildEntry
-	{
+	struct BuildEntry {
 		uint32_t parent; // If non-zero then this is the index of the parent. (used in offsets)
 		uint32_t start, end; // The range of objects in the object list covered by this node.
 	};
 
-	struct Node
-	{
+	struct Node {
 		AABB aabb;
 		uint32_t start, nPrims, rightOffset;
 	};
@@ -1647,16 +1507,14 @@ private:
 	Array<Node> m_nodes;
 };
 
-struct Fit
-{
-	static bool computeBasis(const Vector3 *points, uint32_t pointsCount, Basis *basis)
-	{
-		if (computeLeastSquaresNormal(points, pointsCount, &basis->normal)) {
+struct Fit {
+	static bool computeBasis(ConstArrayView<Vector3> points, Basis *basis) {
+		if (computeLeastSquaresNormal(points, &basis->normal)) {
 			basis->tangent = Basis::computeTangent(basis->normal);
 			basis->bitangent = Basis::computeBitangent(basis->normal, basis->tangent);
 			return true;
 		}
-		return computeEigen(points, pointsCount, basis);
+		return computeEigen(points, basis);
 	}
 
 private:
@@ -1664,21 +1522,20 @@ private:
 	// Fast, and accurate to within a few degrees.
 	// Returns None if the points do not span a plane.
 	// https://www.ilikebigbits.com/2015_03_04_plane_from_points.html
-	static bool computeLeastSquaresNormal(const Vector3 *points, uint32_t pointsCount, Vector3 *normal)
-	{
-		XA_DEBUG_ASSERT(pointsCount >= 3);
-		if (pointsCount == 3) {
-			*normal = normalize(cross(points[2] - points[0], points[1] - points[0]), kEpsilon);
+	static bool computeLeastSquaresNormal(ConstArrayView<Vector3> points, Vector3 *normal) {
+		XA_DEBUG_ASSERT(points.length >= 3);
+		if (points.length == 3) {
+			*normal = normalize(cross(points[2] - points[0], points[1] - points[0]));
 			return true;
 		}
-		const float invN = 1.0f / float(pointsCount);
+		const float invN = 1.0f / float(points.length);
 		Vector3 centroid(0.0f);
-		for (uint32_t i = 0; i < pointsCount; i++)
+		for (uint32_t i = 0; i < points.length; i++)
 			centroid += points[i];
 		centroid *= invN;
 		// Calculate full 3x3 covariance matrix, excluding symmetries:
 		float xx = 0.0f, xy = 0.0f, xz = 0.0f, yy = 0.0f, yz = 0.0f, zz = 0.0f;
-		for (uint32_t i = 0; i < pointsCount; i++) {
+		for (uint32_t i = 0; i < points.length; i++) {
 			Vector3 r = points[i] - centroid;
 			xx += r.x * r.x;
 			xy += r.x * r.y;
@@ -1730,7 +1587,7 @@ private:
 		// Pick path with best conditioning:
 		Vector3 dir(0.0f);
 		if (det_max == det_x)
-			dir = Vector3(det_x,xz * yz - xy * zz,xy * yz - xz * yy);
+			dir = Vector3(det_x, xz * yz - xy * zz, xy * yz - xz * yy);
 		else if (det_max == det_y)
 			dir = Vector3(xz * yz - xy * zz, det_y, xy * xz - yz * xx);
 		else if (det_max == det_z)
@@ -1743,41 +1600,37 @@ private:
 		return isNormalized(*normal);
 	}
 
-	static bool computeEigen(const Vector3 *points, uint32_t pointsCount, Basis *basis)
-	{
+	static bool computeEigen(ConstArrayView<Vector3> points, Basis *basis) {
 		float matrix[6];
-		computeCovariance(pointsCount, points, matrix);
+		computeCovariance(points, matrix);
 		if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
 			return false;
 		float eigenValues[3];
 		Vector3 eigenVectors[3];
 		if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors))
 			return false;
-		basis->normal = normalize(eigenVectors[2], kEpsilon);
-		basis->tangent = normalize(eigenVectors[0], kEpsilon);
-		basis->bitangent = normalize(eigenVectors[1], kEpsilon);
+		basis->normal = normalize(eigenVectors[2]);
+		basis->tangent = normalize(eigenVectors[0]);
+		basis->bitangent = normalize(eigenVectors[1]);
 		return true;
 	}
 
-	static Vector3 computeCentroid(int n, const Vector3 * points)
-	{
+	static Vector3 computeCentroid(ConstArrayView<Vector3> points) {
 		Vector3 centroid(0.0f);
-		for (int i = 0; i < n; i++) {
+		for (uint32_t i = 0; i < points.length; i++)
 			centroid += points[i];
-		}
-		centroid /= float(n);
+		centroid /= float(points.length);
 		return centroid;
 	}
 
-	static Vector3 computeCovariance(int n, const Vector3 * points, float * covariance)
-	{
+	static Vector3 computeCovariance(ConstArrayView<Vector3> points, float *covariance) {
 		// compute the centroid
-		Vector3 centroid = computeCentroid(n, points);
+		Vector3 centroid = computeCentroid(points);
 		// compute covariance matrix
 		for (int i = 0; i < 6; i++) {
 			covariance[i] = 0.0f;
 		}
-		for (int i = 0; i < n; i++) {
+		for (uint32_t i = 0; i < points.length; i++) {
 			Vector3 v = points[i] - centroid;
 			covariance[0] += v.x * v.x;
 			covariance[1] += v.x * v.y;
@@ -1792,8 +1645,7 @@ private:
 	// Tridiagonal solver from Charles Bloom.
 	// Householder transforms followed by QL decomposition.
 	// Seems to be based on the code from Numerical Recipes in C.
-	static bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
-	{
+	static bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]) {
 		XA_DEBUG_ASSERT(matrix != nullptr && eigenValues != nullptr && eigenVectors != nullptr);
 		float subd[3];
 		float diag[3];
@@ -1818,7 +1670,7 @@ private:
 		// eigenvectors are the columns; make them the rows :
 		for (int i = 0; i < 3; i++) {
 			for (int j = 0; j < 3; j++) {
-				(&eigenVectors[j].x)[i] = (float) work[i][j];
+				(&eigenVectors[j].x)[i] = (float)work[i][j];
 			}
 		}
 		// shuffle to sort by singular value :
@@ -1840,8 +1692,7 @@ private:
 	}
 
 private:
-	static void EigenSolver3_Tridiagonal(float mat[3][3], float *diag, float *subd)
-	{
+	static void EigenSolver3_Tridiagonal(float mat[3][3], float *diag, float *subd) {
 		// Householder reduction T = Q^t M Q
 		//   Input:
 		//     mat, symmetric 3x3 matrix M
@@ -1893,8 +1744,7 @@ private:
 		}
 	}
 
-	static bool EigenSolver3_QLAlgorithm(float mat[3][3], float *diag, float *subd)
-	{
+	static bool EigenSolver3_QLAlgorithm(float mat[3][3], float *diag, float *subd) {
 		// QL iteration with implicit shifting to reduce matrix from tridiagonal
 		// to diagonal
 		const int maxiter = 32;
@@ -1904,21 +1754,21 @@ private:
 				int m;
 				for (m = ell; m <= 1; m++) {
 					float dd = fabsf(diag[m]) + fabsf(diag[m + 1]);
-					if ( fabsf(subd[m]) + dd == dd )
+					if (fabsf(subd[m]) + dd == dd)
 						break;
 				}
-				if ( m == ell )
+				if (m == ell)
 					break;
 				float g = (diag[ell + 1] - diag[ell]) / (2 * subd[ell]);
 				float r = sqrtf(g * g + 1);
-				if ( g < 0 )
+				if (g < 0)
 					g = diag[m] - diag[ell] + subd[ell] / (g - r);
 				else
 					g = diag[m] - diag[ell] + subd[ell] / (g + r);
 				float s = 1, c = 1, p = 0;
 				for (int i = m - 1; i >= ell; i--) {
 					float f = s * subd[i], b = c * subd[i];
-					if ( fabsf(f) >= fabsf(g) ) {
+					if (fabsf(f) >= fabsf(g)) {
 						c = g / f;
 						r = sqrtf(c * c + 1);
 						subd[i + 1] = f * r;
@@ -1944,7 +1794,7 @@ private:
 				subd[ell] = g;
 				subd[m] = 0;
 			}
-			if ( iter == maxiter )
+			if (iter == maxiter)
 				// should not get here under normal circumstances
 				return false;
 		}
@@ -1952,56 +1802,48 @@ private:
 	}
 };
 
-static uint32_t sdbmHash(const void *data_in, uint32_t size, uint32_t h = 5381)
-{
-	const uint8_t *data = (const uint8_t *) data_in;
+static uint32_t sdbmHash(const void *data_in, uint32_t size, uint32_t h = 5381) {
+	const uint8_t *data = (const uint8_t *)data_in;
 	uint32_t i = 0;
 	while (i < size) {
-		h = (h << 16) + (h << 6) - h + (uint32_t ) data[i++];
+		h = (h << 16) + (h << 6) - h + (uint32_t)data[i++];
 	}
 	return h;
 }
 
 template <typename T>
-static uint32_t hash(const T &t, uint32_t h = 5381)
-{
+static uint32_t hash(const T &t, uint32_t h = 5381) {
 	return sdbmHash(&t, sizeof(T), h);
 }
 
 template <typename Key>
-struct Hash
-{
+struct Hash {
 	uint32_t operator()(const Key &k) const { return hash(k); }
 };
 
 template <typename Key>
-struct PassthroughHash
-{
+struct PassthroughHash {
 	uint32_t operator()(const Key &k) const { return (uint32_t)k; }
 };
 
 template <typename Key>
-struct Equal
-{
+struct Equal {
 	bool operator()(const Key &k0, const Key &k1) const { return k0 == k1; }
 };
 
-template<typename Key, typename H = Hash<Key>, typename E = Equal<Key> >
-class HashMap
-{
+template <typename Key, typename H = Hash<Key>, typename E = Equal<Key>>
+class HashMap {
 public:
-	HashMap(int memTag, uint32_t size) : m_memTag(memTag), m_size(size), m_numSlots(0), m_slots(nullptr), m_keys(memTag), m_next(memTag)
-	{
+	HashMap(int memTag, uint32_t size) :
+			m_memTag(memTag), m_size(size), m_numSlots(0), m_slots(nullptr), m_keys(memTag), m_next(memTag) {
 	}
 
-	~HashMap()
-	{
+	~HashMap() {
 		if (m_slots)
 			XA_FREE(m_slots);
 	}
 
-	void destroy()
-	{
+	void destroy() {
 		if (m_slots) {
 			XA_FREE(m_slots);
 			m_slots = nullptr;
@@ -2010,8 +1852,7 @@ public:
 		m_next.destroy();
 	}
 
-	uint32_t add(const Key &key)
-	{
+	uint32_t add(const Key &key) {
 		if (!m_slots)
 			alloc();
 		const uint32_t hash = computeHash(key);
@@ -2021,36 +1862,18 @@ public:
 		return m_keys.size() - 1;
 	}
 
-	uint32_t get(const Key &key) const
-	{
+	uint32_t get(const Key &key) const {
 		if (!m_slots)
 			return UINT32_MAX;
-		const uint32_t hash = computeHash(key);
-		uint32_t i = m_slots[hash];
-		E equal;
-		while (i != UINT32_MAX) {
-			if (equal(m_keys[i], key))
-				return i;
-			i = m_next[i];
-		}
-		return UINT32_MAX;
+		return find(key, m_slots[computeHash(key)]);
 	}
 
-	uint32_t getNext(uint32_t current) const
-	{
-		uint32_t i = m_next[current];
-		E equal;
-		while (i != UINT32_MAX) {
-			if (equal(m_keys[i], m_keys[current]))
-				return i;
-			i = m_next[i];
-		}
-		return UINT32_MAX;
+	uint32_t getNext(const Key &key, uint32_t current) const {
+		return find(key, m_next[current]);
 	}
 
 private:
-	void alloc()
-	{
+	void alloc() {
 		XA_DEBUG_ASSERT(m_size > 0);
 		m_numSlots = nextPowerOfTwo(m_size);
 		auto minNumSlots = uint32_t(m_size * 1.3);
@@ -2063,12 +1886,21 @@ private:
 		m_next.reserve(m_size);
 	}
 
-	uint32_t computeHash(const Key &key) const
-	{
+	uint32_t computeHash(const Key &key) const {
 		H hash;
 		return hash(key) & (m_numSlots - 1);
 	}
 
+	uint32_t find(const Key &key, uint32_t current) const {
+		E equal;
+		while (current != UINT32_MAX) {
+			if (equal(m_keys[current], key))
+				return current;
+			current = m_next[current];
+		}
+		return current;
+	}
+
 	int m_memTag;
 	uint32_t m_size;
 	uint32_t m_numSlots;
@@ -2077,9 +1909,8 @@ private:
 	Array<uint32_t> m_next;
 };
 
-template<typename T>
-static void insertionSort(T *data, uint32_t length)
-{
+template <typename T>
+static void insertionSort(T *data, uint32_t length) {
 	for (int32_t i = 1; i < (int32_t)length; i++) {
 		T x = data[i];
 		int32_t j = i - 1;
@@ -2091,21 +1922,18 @@ static void insertionSort(T *data, uint32_t length)
 	}
 }
 
-class KISSRng
-{
+class KISSRng {
 public:
 	KISSRng() { reset(); }
 
-	void reset()
-	{
+	void reset() {
 		x = 123456789;
 		y = 362436000;
 		z = 521288629;
 		c = 7654321;
 	}
 
-	uint32_t getRange(uint32_t range)
-	{
+	uint32_t getRange(uint32_t range) {
 		if (range == 0)
 			return 0;
 		x = 69069 * x + 12345;
@@ -2124,12 +1952,10 @@ private:
 // Based on Pierre Terdiman's and Michael Herf's source code.
 // http://www.codercorner.com/RadixSortRevisited.htm
 // http://www.stereopsis.com/radix.html
-class RadixSort
-{
+class RadixSort {
 public:
-	void sort(const float *input, uint32_t count)
-	{
-		if (input == nullptr || count == 0) {
+	void sort(ConstArrayView<float> input) {
+		if (input.length == 0) {
 			m_buffer1.clear();
 			m_buffer2.clear();
 			m_ranks = m_buffer1.data();
@@ -2137,33 +1963,27 @@ public:
 			return;
 		}
 		// Resize lists if needed
-		m_buffer1.resize(count);
-		m_buffer2.resize(count);
+		m_buffer1.resize(input.length);
+		m_buffer2.resize(input.length);
 		m_ranks = m_buffer1.data();
 		m_ranks2 = m_buffer2.data();
 		m_validRanks = false;
-		if (count < 32)
-			insertionSort(input, count);
+		if (input.length < 32)
+			insertionSort(input);
 		else {
 			// @@ Avoid touching the input multiple times.
-			for (uint32_t i = 0; i < count; i++) {
+			for (uint32_t i = 0; i < input.length; i++) {
 				floatFlip((uint32_t &)input[i]);
 			}
-			radixSort<uint32_t>((const uint32_t *)input, count);
-			for (uint32_t i = 0; i < count; i++) {
+			radixSort(ConstArrayView<uint32_t>((const uint32_t *)input.data, input.length));
+			for (uint32_t i = 0; i < input.length; i++) {
 				ifloatFlip((uint32_t &)input[i]);
 			}
 		}
 	}
 
-	void sort(const Array<float> &input)
-	{
-		sort(input.data(), input.size());
-	}
-
 	// Access to results. m_ranks is a list of indices in sorted order, i.e. in the order you may further process your data
-	const uint32_t *ranks() const
-	{
+	const uint32_t *ranks() const {
 		XA_DEBUG_ASSERT(m_validRanks);
 		return m_ranks;
 	}
@@ -2171,54 +1991,40 @@ public:
 private:
 	uint32_t *m_ranks, *m_ranks2;
 	Array<uint32_t> m_buffer1, m_buffer2;
-	bool m_validRanks;
+	bool m_validRanks = false;
 
-	void floatFlip(uint32_t &f)
-	{
+	void floatFlip(uint32_t &f) {
 		int32_t mask = (int32_t(f) >> 31) | 0x80000000; // Warren Hunt, Manchor Ko.
 		f ^= mask;
 	}
 
-	void ifloatFlip(uint32_t &f)
-	{
+	void ifloatFlip(uint32_t &f) {
 		uint32_t mask = ((f >> 31) - 1) | 0x80000000; // Michael Herf.
 		f ^= mask;
 	}
 
-	template<typename T>
-	void createHistograms(const T *buffer, uint32_t count, uint32_t *histogram)
-	{
-		const uint32_t bucketCount = sizeof(T); // (8 * sizeof(T)) / log2(radix)
+	void createHistograms(ConstArrayView<uint32_t> input, uint32_t *histogram) {
+		const uint32_t bucketCount = sizeof(uint32_t);
 		// Init bucket pointers.
 		uint32_t *h[bucketCount];
 		for (uint32_t i = 0; i < bucketCount; i++) {
 			h[i] = histogram + 256 * i;
 		}
 		// Clear histograms.
-		memset(histogram, 0, 256 * bucketCount * sizeof(uint32_t ));
+		memset(histogram, 0, 256 * bucketCount * sizeof(uint32_t));
 		// @@ Add support for signed integers.
 		// Build histograms.
-		const uint8_t *p = (const uint8_t *)buffer;  // @@ Does this break aliasing rules?
-		const uint8_t *pe = p + count * sizeof(T);
+		const uint8_t *p = (const uint8_t *)input.data; // @@ Does this break aliasing rules?
+		const uint8_t *pe = p + input.length * sizeof(uint32_t);
 		while (p != pe) {
 			h[0][*p++]++, h[1][*p++]++, h[2][*p++]++, h[3][*p++]++;
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable : 4127)
-#endif
-			if (bucketCount == 8) h[4][*p++]++, h[5][*p++]++, h[6][*p++]++, h[7][*p++]++;
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
 		}
 	}
 
-	template <typename T>
-	void insertionSort(const T *input, uint32_t count)
-	{
+	void insertionSort(ConstArrayView<float> input) {
 		if (!m_validRanks) {
 			m_ranks[0] = 0;
-			for (uint32_t i = 1; i != count; ++i) {
+			for (uint32_t i = 1; i != input.length; ++i) {
 				int rank = m_ranks[i] = i;
 				uint32_t j = i;
 				while (j != 0 && input[rank] < input[m_ranks[j - 1]]) {
@@ -2231,7 +2037,7 @@ private:
 			}
 			m_validRanks = true;
 		} else {
-			for (uint32_t i = 1; i != count; ++i) {
+			for (uint32_t i = 1; i != input.length; ++i) {
 				int rank = m_ranks[i];
 				uint32_t j = i;
 				while (j != 0 && input[rank] < input[m_ranks[j - 1]]) {
@@ -2245,35 +2051,34 @@ private:
 		}
 	}
 
-	template <typename T>
-	void radixSort(const T *input, uint32_t count)
-	{
-		const uint32_t P = sizeof(T); // pass count
+	void radixSort(ConstArrayView<uint32_t> input) {
+		const uint32_t P = sizeof(uint32_t); // pass count
 		// Allocate histograms & offsets on the stack
 		uint32_t histogram[256 * P];
 		uint32_t *link[256];
-		createHistograms(input, count, histogram);
+		createHistograms(input, histogram);
 		// Radix sort, j is the pass number (0=LSB, P=MSB)
 		for (uint32_t j = 0; j < P; j++) {
 			// Pointer to this bucket.
 			const uint32_t *h = &histogram[j * 256];
-			const uint8_t *inputBytes = (const uint8_t *)input; // @@ Is this aliasing legal?
+			auto inputBytes = (const uint8_t *)input.data; // @@ Is this aliasing legal?
 			inputBytes += j;
-			if (h[inputBytes[0]] == count) {
+			if (h[inputBytes[0]] == input.length) {
 				// Skip this pass, all values are the same.
 				continue;
 			}
 			// Create offsets
 			link[0] = m_ranks2;
-			for (uint32_t i = 1; i < 256; i++) link[i] = link[i - 1] + h[i - 1];
+			for (uint32_t i = 1; i < 256; i++)
+				link[i] = link[i - 1] + h[i - 1];
 			// Perform Radix Sort
 			if (!m_validRanks) {
-				for (uint32_t i = 0; i < count; i++) {
+				for (uint32_t i = 0; i < input.length; i++) {
 					*link[inputBytes[i * P]]++ = i;
 				}
 				m_validRanks = true;
 			} else {
-				for (uint32_t i = 0; i < count; i++) {
+				for (uint32_t i = 0; i < input.length; i++) {
 					const uint32_t idx = m_ranks[i];
 					*link[inputBytes[idx * P]]++ = idx;
 				}
@@ -2283,7 +2088,7 @@ private:
 		}
 		// All values were equal, generate linear ranks.
 		if (!m_validRanks) {
-			for (uint32_t i = 0; i < count; i++)
+			for (uint32_t i = 0; i < input.length; i++)
 				m_ranks[i] = i;
 			m_validRanks = true;
 		}
@@ -2291,30 +2096,25 @@ private:
 };
 
 // Wrapping this in a class allows temporary arrays to be re-used.
-class BoundingBox2D
-{
+class BoundingBox2D {
 public:
 	Vector2 majorAxis, minorAxis, minCorner, maxCorner;
 
-	void clear()
-	{
+	void clear() {
 		m_boundaryVertices.clear();
 	}
 
-	void appendBoundaryVertex(Vector2 v)
-	{
+	void appendBoundaryVertex(Vector2 v) {
 		m_boundaryVertices.push_back(v);
 	}
 
 	// This should compute convex hull and use rotating calipers to find the best box. Currently it uses a brute force method.
-	// If vertices is null or vertexCount is 0, the boundary vertices are used.
-	void compute(const Vector2 *vertices = nullptr, uint32_t vertexCount = 0)
-	{
-		if (!vertices || vertexCount == 0) {
-			vertices = m_boundaryVertices.data();
-			vertexCount = m_boundaryVertices.size();
-		}
-		convexHull(m_boundaryVertices.data(), m_boundaryVertices.size(), m_hull, 0.00001f);
+	// If vertices is empty, the boundary vertices are used instead. The convex hull is always built from the boundary vertices, which must not be empty.
+	void compute(ConstArrayView<Vector2> vertices = ConstArrayView<Vector2>()) {
+		XA_DEBUG_ASSERT(!m_boundaryVertices.isEmpty());
+		if (vertices.length == 0)
+			vertices = m_boundaryVertices;
+		convexHull(m_boundaryVertices, m_hull, 0.00001f);
 		// @@ Ideally I should use rotating calipers to find the best box. Using brute force for now.
 		float best_area = FLT_MAX;
 		Vector2 best_min(0);
@@ -2324,13 +2124,13 @@ public:
 		for (uint32_t i = 0, j = hullCount - 1; i < hullCount; j = i, i++) {
 			if (equal(m_hull[i], m_hull[j], kEpsilon))
 				continue;
-			Vector2 axis = normalize(m_hull[i] - m_hull[j], 0.0f);
+			Vector2 axis = normalize(m_hull[i] - m_hull[j]);
 			XA_DEBUG_ASSERT(isFinite(axis));
 			// Compute bounding box.
 			Vector2 box_min(FLT_MAX, FLT_MAX);
 			Vector2 box_max(-FLT_MAX, -FLT_MAX);
 			// Consider all points, not only boundary points, in case the input chart is malformed.
-			for (uint32_t v = 0; v < vertexCount; v++) {
+			for (uint32_t v = 0; v < vertices.length; v++) {
 				const Vector2 &point = vertices[v];
 				const float x = dot(axis, point);
 				const float y = dot(Vector2(-axis.y, axis.x), point);
@@ -2357,28 +2157,27 @@ public:
 
 private:
 	// Compute the convex hull using Graham Scan.
-	void convexHull(const Vector2 *input, uint32_t inputCount, Array<Vector2> &output, float epsilon)
-	{
-		m_coords.resize(inputCount);
-		for (uint32_t i = 0; i < inputCount; i++)
+	void convexHull(ConstArrayView<Vector2> input, Array<Vector2> &output, float epsilon) {
+		m_coords.resize(input.length);
+		for (uint32_t i = 0; i < input.length; i++)
 			m_coords[i] = input[i].x;
 		m_radix.sort(m_coords);
 		const uint32_t *ranks = m_radix.ranks();
 		m_top.clear();
 		m_bottom.clear();
-		m_top.reserve(inputCount);
-		m_bottom.reserve(inputCount);
+		m_top.reserve(input.length);
+		m_bottom.reserve(input.length);
 		Vector2 P = input[ranks[0]];
-		Vector2 Q = input[ranks[inputCount - 1]];
+		Vector2 Q = input[ranks[input.length - 1]];
 		float topy = max(P.y, Q.y);
 		float boty = min(P.y, Q.y);
-		for (uint32_t i = 0; i < inputCount; i++) {
+		for (uint32_t i = 0; i < input.length; i++) {
 			Vector2 p = input[ranks[i]];
 			if (p.y >= boty)
 				m_top.push_back(p);
 		}
-		for (uint32_t i = 0; i < inputCount; i++) {
-			Vector2 p = input[ranks[inputCount - 1 - i]];
+		for (uint32_t i = 0; i < input.length; i++) {
+			Vector2 p = input[ranks[input.length - 1 - i]];
 			if (p.y <= topy)
 				m_bottom.push_back(p);
 		}
@@ -2387,7 +2186,7 @@ private:
 		XA_DEBUG_ASSERT(m_top.size() >= 2);
 		output.push_back(m_top[0]);
 		output.push_back(m_top[1]);
-		for (uint32_t i = 2; i < m_top.size(); ) {
+		for (uint32_t i = 2; i < m_top.size();) {
 			Vector2 a = output[output.size() - 2];
 			Vector2 b = output[output.size() - 1];
 			Vector2 c = m_top[i];
@@ -2403,7 +2202,7 @@ private:
 		XA_DEBUG_ASSERT(m_bottom.size() >= 2);
 		output.push_back(m_bottom[1]);
 		// Filter bottom list.
-		for (uint32_t i = 2; i < m_bottom.size(); ) {
+		for (uint32_t i = 2; i < m_bottom.size();) {
 			Vector2 a = output[output.size() - 2];
 			Vector2 b = output[output.size() - 1];
 			Vector2 c = m_bottom[i];
@@ -2426,32 +2225,45 @@ private:
 	RadixSort m_radix;
 };
 
-static uint32_t meshEdgeFace(uint32_t edge) { return edge / 3; }
-static uint32_t meshEdgeIndex0(uint32_t edge) { return edge; }
+struct EdgeKey { // Key type of the mesh edge map: an ordered pair of vertex indices.
+	EdgeKey(const EdgeKey &k) :
+			v0(k.v0), v1(k.v1) {}
+	EdgeKey(uint32_t _v0, uint32_t _v1) :
+			v0(_v0), v1(_v1) {}
+	bool operator==(const EdgeKey &k) const { return v0 == k.v0 && v1 == k.v1; } // Order-sensitive: (a, b) does not equal (b, a).
 
-static uint32_t meshEdgeIndex1(uint32_t edge)
-{
+	uint32_t v0; // First vertex index of the edge.
+	uint32_t v1; // Second vertex index of the edge.
+};
+
+struct EdgeHash { // Hash functor paired with EdgeKey in HashMap<EdgeKey, EdgeHash>.
+	uint32_t operator()(const EdgeKey &k) const { return k.v0 * 32768u + k.v1; } // v0 * 2^15 + v1; unsigned arithmetic, so it wraps on overflow.
+};
+
+static uint32_t meshEdgeFace(uint32_t edge) { // Face that owns 'edge': edges are numbered 3 per face.
+	return edge / 3;
+}
+static uint32_t meshEdgeIndex0(uint32_t edge) { // Slot of the edge's first vertex; identical to the edge number.
+	return edge;
+}
+
+static uint32_t meshEdgeIndex1(uint32_t edge) { // Slot of the edge's second vertex: the next slot in the same face, wrapping from the third back to the first.
 	const uint32_t faceFirstEdge = edge / 3 * 3;
 	return faceFirstEdge + (edge - faceFirstEdge + 1) % 3;
 }
 
-struct MeshFlags
-{
-	enum
-	{
-		HasIgnoredFaces = 1<<0,
-		HasNormals = 1<<1
+struct MeshFlags { // Bit flags given to the Mesh constructor; each one enables an optional array.
+	enum {
+		HasIgnoredFaces = 1 << 0, // Mesh stores a per-face ignore flag (m_faceIgnore).
+		HasNormals = 1 << 1, // Mesh stores per-vertex normals (m_normals).
+		HasMaterials = 1 << 2 // Mesh stores a per-face material value (m_faceMaterials).
 	};
 };
 
-class Mesh;
-static void meshGetBoundaryLoops(const Mesh &mesh, Array<uint32_t> &boundaryLoops);
-
-class Mesh
-{
+class Mesh {
 public:
-	Mesh(float epsilon, uint32_t approxVertexCount, uint32_t approxFaceCount, uint32_t flags = 0, uint32_t id = UINT32_MAX) : m_epsilon(epsilon), m_flags(flags), m_id(id), m_faceIgnore(MemTag::Mesh), m_indices(MemTag::MeshIndices), m_positions(MemTag::MeshPositions), m_normals(MemTag::MeshNormals), m_texcoords(MemTag::MeshTexcoords), m_nextColocalVertex(MemTag::MeshColocals), m_boundaryEdges(MemTag::MeshBoundaries), m_oppositeEdges(MemTag::MeshBoundaries), m_nextBoundaryEdges(MemTag::MeshBoundaries), m_edgeMap(MemTag::MeshEdgeMap, approxFaceCount * 3)
-	{
+	Mesh(float epsilon, uint32_t approxVertexCount, uint32_t approxFaceCount, uint32_t flags = 0, uint32_t id = UINT32_MAX) :
+			m_epsilon(epsilon), m_flags(flags), m_id(id), m_faceIgnore(MemTag::Mesh), m_faceMaterials(MemTag::Mesh), m_indices(MemTag::MeshIndices), m_positions(MemTag::MeshPositions), m_normals(MemTag::MeshNormals), m_texcoords(MemTag::MeshTexcoords), m_nextColocalVertex(MemTag::MeshColocals), m_firstColocalVertex(MemTag::MeshColocals), m_boundaryEdges(MemTag::MeshBoundaries), m_oppositeEdges(MemTag::MeshBoundaries), m_edgeMap(MemTag::MeshEdgeMap, approxFaceCount * 3) {
 		m_indices.reserve(approxFaceCount * 3);
 		m_positions.reserve(approxVertexCount);
 		m_texcoords.reserve(approxVertexCount);
@@ -2459,13 +2271,14 @@ public:
 			m_faceIgnore.reserve(approxFaceCount);
 		if (m_flags & MeshFlags::HasNormals)
 			m_normals.reserve(approxVertexCount);
+		if (m_flags & MeshFlags::HasMaterials)
+			m_faceMaterials.reserve(approxFaceCount);
 	}
 
 	uint32_t flags() const { return m_flags; }
 	uint32_t id() const { return m_id; }
 
-	void addVertex(const Vector3 &pos, const Vector3 &normal = Vector3(0.0f), const Vector2 &texcoord = Vector2(0.0f))
-	{
+	void addVertex(const Vector3 &pos, const Vector3 &normal = Vector3(0.0f), const Vector2 &texcoord = Vector2(0.0f)) {
 		XA_DEBUG_ASSERT(isFinite(pos));
 		m_positions.push_back(pos);
 		if (m_flags & MeshFlags::HasNormals)
@@ -2473,45 +2286,22 @@ public:
 		m_texcoords.push_back(texcoord);
 	}
 
-	struct AddFaceResult
-	{
-		enum Enum
-		{
-			OK,
-			DuplicateEdge = 1
-		};
-	};
-
-	AddFaceResult::Enum addFace(uint32_t v0, uint32_t v1, uint32_t v2, bool ignore = false)
-	{
-		uint32_t indexArray[3];
-		indexArray[0] = v0;
-		indexArray[1] = v1;
-		indexArray[2] = v2;
-		return addFace(indexArray, ignore);
-	}
-
-	AddFaceResult::Enum addFace(const uint32_t *indices, bool ignore = false)
-	{
-		AddFaceResult::Enum result = AddFaceResult::OK;
+	void addFace(const uint32_t *indices, bool ignore = false, uint32_t material = UINT32_MAX) { // Appends one triangle; 'indices' must point to 3 vertex indices.
 		if (m_flags & MeshFlags::HasIgnoredFaces)
 			m_faceIgnore.push_back(ignore);
+		if (m_flags & MeshFlags::HasMaterials)
+			m_faceMaterials.push_back(material); // Stored only when the mesh was created with HasMaterials.
 		const uint32_t firstIndex = m_indices.size();
 		for (uint32_t i = 0; i < 3; i++)
 			m_indices.push_back(indices[i]);
 		for (uint32_t i = 0; i < 3; i++) {
 			const uint32_t vertex0 = m_indices[firstIndex + i];
 			const uint32_t vertex1 = m_indices[firstIndex + (i + 1) % 3];
-			const EdgeKey key(vertex0, vertex1);
-			if (m_edgeMap.get(key) != UINT32_MAX)
-				result = AddFaceResult::DuplicateEdge;
-			m_edgeMap.add(key);
+			m_edgeMap.add(EdgeKey(vertex0, vertex1)); // No duplicate-edge check is made here; the key is added unconditionally.
 		}
-		return result;
 	}
 
-	void createColocals()
-	{
+	void createColocalsBVH() {
 		const uint32_t vertexCount = m_positions.size();
 		Array<AABB> aabbs(MemTag::BVH);
 		aabbs.resize(vertexCount);
@@ -2522,6 +2312,8 @@ public:
 		Array<uint32_t> potential(MemTag::MeshColocals);
 		m_nextColocalVertex.resize(vertexCount);
 		m_nextColocalVertex.fillBytes(0xff);
+		m_firstColocalVertex.resize(vertexCount);
+		m_firstColocalVertex.fillBytes(0xff);
 		for (uint32_t i = 0; i < vertexCount; i++) {
 			if (m_nextColocalVertex[i] != UINT32_MAX)
 				continue; // Already linked.
@@ -2537,18 +2329,65 @@ public:
 			if (colocals.size() == 1) {
 				// No colocals for this vertex.
 				m_nextColocalVertex[i] = i;
-				continue; 
+				m_firstColocalVertex[i] = i;
+				continue;
 			}
 			// Link in ascending order.
 			insertionSort(colocals.data(), colocals.size());
-			for (uint32_t j = 0; j < colocals.size(); j++)
+			for (uint32_t j = 0; j < colocals.size(); j++) {
 				m_nextColocalVertex[colocals[j]] = colocals[(j + 1) % colocals.size()];
+				m_firstColocalVertex[colocals[j]] = colocals[0];
+			}
 			XA_DEBUG_ASSERT(m_nextColocalVertex[i] != UINT32_MAX);
 		}
 	}
 
-	void createBoundaries()
-	{
+	void createColocalsHash() { // Fills m_nextColocalVertex / m_firstColocalVertex using a position-keyed hash map.
+		const uint32_t vertexCount = m_positions.size();
+		HashMap<Vector3> positionToVertexMap(MemTag::Default, vertexCount);
+		for (uint32_t i = 0; i < vertexCount; i++)
+			positionToVertexMap.add(m_positions[i]);
+		Array<uint32_t> colocals(MemTag::MeshColocals);
+		m_nextColocalVertex.resize(vertexCount);
+		m_nextColocalVertex.fillBytes(0xff); // All-ones bytes: every entry reads as UINT32_MAX, i.e. "not linked yet".
+		m_firstColocalVertex.resize(vertexCount);
+		m_firstColocalVertex.fillBytes(0xff);
+		for (uint32_t i = 0; i < vertexCount; i++) {
+			if (m_nextColocalVertex[i] != UINT32_MAX)
+				continue; // Already linked while processing an earlier vertex.
+			// Walk the map's candidates for this position; keep those within m_epsilon that are still unlinked.
+			colocals.clear();
+			colocals.push_back(i); // Always add this vertex.
+			uint32_t otherVertex = positionToVertexMap.get(m_positions[i]);
+			while (otherVertex != UINT32_MAX) {
+				if (otherVertex != i && equal(m_positions[i], m_positions[otherVertex], m_epsilon) && m_nextColocalVertex[otherVertex] == UINT32_MAX)
+					colocals.push_back(otherVertex);
+				otherVertex = positionToVertexMap.getNext(m_positions[i], otherVertex);
+			}
+			if (colocals.size() == 1) {
+				// No colocals for this vertex: it links to itself and is its own first colocal.
+				m_nextColocalVertex[i] = i;
+				m_firstColocalVertex[i] = i;
+				continue;
+			}
+			// Link in ascending order; the last vertex wraps back to the first, forming a cycle.
+			insertionSort(colocals.data(), colocals.size());
+			for (uint32_t j = 0; j < colocals.size(); j++) {
+				m_nextColocalVertex[colocals[j]] = colocals[(j + 1) % colocals.size()];
+				m_firstColocalVertex[colocals[j]] = colocals[0]; // colocals[0] is the first entry after sorting.
+			}
+			XA_DEBUG_ASSERT(m_nextColocalVertex[i] != UINT32_MAX);
+		}
+	}
+
+	void createColocals() { // Chooses the colocal-search implementation from m_epsilon.
+		if (m_epsilon <= FLT_EPSILON)
+			createColocalsHash(); // Epsilon at or below FLT_EPSILON: hash-map variant.
+		else
+			createColocalsBVH(); // Larger epsilon: BVH variant.
+	}
+
+	void createBoundaries() {
 		const uint32_t edgeCount = m_indices.size();
 		const uint32_t vertexCount = m_positions.size();
 		m_oppositeEdges.resize(edgeCount);
@@ -2578,151 +2417,54 @@ public:
 		}
 	}
 
-	void linkBoundaries()
-	{
-		const uint32_t edgeCount = m_indices.size();
-		HashMap<uint32_t> vertexToEdgeMap(MemTag::Mesh, edgeCount); // Edge is index / 2
-		for (uint32_t i = 0; i < edgeCount; i++) {
-			vertexToEdgeMap.add(m_indices[meshEdgeIndex0(i)]);
-			vertexToEdgeMap.add(m_indices[meshEdgeIndex1(i)]);
-		}
-		m_nextBoundaryEdges.resize(edgeCount);
-		for (uint32_t i = 0; i < edgeCount; i++)
-			m_nextBoundaryEdges[i] = UINT32_MAX;
-		uint32_t numBoundaryLoops = 0, numUnclosedBoundaries = 0;
-		BitArray linkedEdges(edgeCount);
-		linkedEdges.zeroOutMemory();
-		for (;;) {
-			// Find the first boundary edge that hasn't been linked yet.
-			uint32_t firstEdge = UINT32_MAX;
-			for (uint32_t i = 0; i < edgeCount; i++) {
-				if (m_oppositeEdges[i] == UINT32_MAX && !linkedEdges.get(i)) {
-					firstEdge = i;
-					break;
-				}
-			}
-			if (firstEdge == UINT32_MAX)
-				break;
-			uint32_t currentEdge = firstEdge;
-			for (;;) {
-				// Find the next boundary edge. The first vertex will be the same as (or colocal to) the current edge second vertex.
-				const uint32_t startVertex = m_indices[meshEdgeIndex1(currentEdge)];
-				uint32_t bestNextEdge = UINT32_MAX;
-				for (ColocalVertexIterator it(this, startVertex); !it.isDone(); it.advance()) {
-					uint32_t mapIndex = vertexToEdgeMap.get(it.vertex());
-					while (mapIndex != UINT32_MAX) {
-						const uint32_t otherEdge = mapIndex / 2; // Two vertices added per edge.
-						if (m_oppositeEdges[otherEdge] != UINT32_MAX)
-							goto next; // Not a boundary edge.
-						if (linkedEdges.get(otherEdge))
-							goto next; // Already linked.
-						if (m_indices[meshEdgeIndex0(otherEdge)] != it.vertex())
-							goto next; // Edge contains the vertex, but it's the wrong one.
-						// First edge (closing the boundary loop) has the highest priority.
-						// Non-colocal vertex has the next highest.
-						if (bestNextEdge != firstEdge && (bestNextEdge == UINT32_MAX || it.vertex() == startVertex))
-							bestNextEdge = otherEdge;
-					next:
-						mapIndex = vertexToEdgeMap.getNext(mapIndex);
-					}
-				}
-				if (bestNextEdge == UINT32_MAX) {
-					numUnclosedBoundaries++;
-					if (currentEdge == firstEdge)
-						linkedEdges.set(firstEdge); // Only 1 edge in this boundary "loop".
-					break; // Can't find a next edge.
-				}
-				m_nextBoundaryEdges[currentEdge] = bestNextEdge;
-				linkedEdges.set(bestNextEdge);
-				currentEdge = bestNextEdge;
-				if (currentEdge == firstEdge) {
-					numBoundaryLoops++;
-					break; // Closed the boundary loop.
-				}
-			}
-		}
-#if XA_FIX_INTERNAL_BOUNDARY_LOOPS
-		// Find internal boundary loops and separate them.
-		// Detect by finding two edges in a boundary loop that have a colocal end vertex.
-		// Fix by swapping their next boundary edge.
-		// Need to start over after every fix since known boundary loops have changed.
-		Array<uint32_t> boundaryLoops;
-	fixInternalBoundary:
-		meshGetBoundaryLoops(*this, boundaryLoops);
-		for (uint32_t loop = 0; loop < boundaryLoops.size(); loop++) {
-			linkedEdges.zeroOutMemory();
-			for (Mesh::BoundaryLoopEdgeIterator it1(this, boundaryLoops[loop]); !it1.isDone(); it1.advance()) {
-				const uint32_t e1 = it1.edge();
-				if (linkedEdges.get(e1))
-					continue;
-				for (Mesh::BoundaryLoopEdgeIterator it2(this, boundaryLoops[loop]); !it2.isDone(); it2.advance()) {
-					const uint32_t e2 = it2.edge();
-					if (e1 == e2 || !isBoundaryEdge(e2) || linkedEdges.get(e2))
-						continue;
-					if (!areColocal(m_indices[meshEdgeIndex1(e1)], m_indices[meshEdgeIndex1(e2)]))
-						continue;
-					swap(m_nextBoundaryEdges[e1], m_nextBoundaryEdges[e2]);
-					linkedEdges.set(e1);
-					linkedEdges.set(e2);
-					goto fixInternalBoundary; // start over
-				}
-			}
-		}
-#endif
-	}
-
 	/// Find edge, test all colocals.
-	uint32_t findEdge(uint32_t vertex0, uint32_t vertex1) const
-	{
-		uint32_t result = UINT32_MAX;
-		if (m_nextColocalVertex.isEmpty()) {
+	uint32_t findEdge(uint32_t vertex0, uint32_t vertex1) const { // Returns the first matching edge on a non-ignored face, or UINT32_MAX.
+		// Try the exact (vertex0, vertex1) key first, so an exact match is returned before any colocal match.
+		{
 			EdgeKey key(vertex0, vertex1);
 			uint32_t edge = m_edgeMap.get(key);
 			while (edge != UINT32_MAX) {
 				// Don't find edges of ignored faces.
-				if (!isFaceIgnored(meshEdgeFace(edge))) {
-					//XA_DEBUG_ASSERT(m_id != UINT32_MAX || (m_id == UINT32_MAX && result == UINT32_MAX)); // duplicate edge - ignore on initial meshes
-					result = edge;
-#if !XA_DEBUG
-					return result;
-#endif
-				}
-				edge = m_edgeMap.getNext(edge);
+				if (!isFaceIgnored(meshEdgeFace(edge)))
+					return edge; // First hit wins; any further duplicates are not examined.
+				edge = m_edgeMap.getNext(key, edge);
 			}
-		} else {
-			for (ColocalVertexIterator it0(this, vertex0); !it0.isDone(); it0.advance()) {
-				for (ColocalVertexIterator it1(this, vertex1); !it1.isDone(); it1.advance()) {
-					EdgeKey key(it0.vertex(), it1.vertex());
+		}
+		// If colocals were created, try every (colocal of vertex0, colocal of vertex1) pair; the first pair repeats the exact key.
+		if (!m_nextColocalVertex.isEmpty()) {
+			uint32_t colocalVertex0 = vertex0;
+			for (;;) {
+				uint32_t colocalVertex1 = vertex1;
+				for (;;) {
+					EdgeKey key(colocalVertex0, colocalVertex1);
 					uint32_t edge = m_edgeMap.get(key);
 					while (edge != UINT32_MAX) {
 						// Don't find edges of ignored faces.
-						if (!isFaceIgnored(meshEdgeFace(edge))) {
-							XA_DEBUG_ASSERT(m_id != UINT32_MAX || (m_id == UINT32_MAX && result == UINT32_MAX)); // duplicate edge - ignore on initial meshes
-							result = edge;
-#if !XA_DEBUG
-							return result;
-#endif
-						}
-						edge = m_edgeMap.getNext(edge);
+						if (!isFaceIgnored(meshEdgeFace(edge)))
+							return edge;
+						edge = m_edgeMap.getNext(key, edge);
 					}
+					colocalVertex1 = m_nextColocalVertex[colocalVertex1]; // Step to the next vertex linked to vertex1.
+					if (colocalVertex1 == vertex1)
+						break; // Back to start: all colocals of vertex1 were tried.
 				}
+				colocalVertex0 = m_nextColocalVertex[colocalVertex0]; // Step to the next vertex linked to vertex0.
+				if (colocalVertex0 == vertex0)
+					break; // Back to start: all colocals of vertex0 were tried.
 			}
 		}
-		return result;
+		return UINT32_MAX; // No edge found on a non-ignored face.
 	}
 
 	// Edge map can be destroyed when no longer used to reduce memory usage. It's used by:
 	//   * Mesh::createBoundaries()
-	//   * Mesh::ColocalEdgeIterator (used by MeshFaceGroups)
-	//   * meshCloseHole()
-	void destroyEdgeMap()
-	{
+	//   * Mesh::edgeMap() (used by MeshFaceGroups)
+	void destroyEdgeMap() {
 		m_edgeMap.destroy();
 	}
 
 #if XA_DEBUG_EXPORT_OBJ
-	void writeObjVertices(FILE *file) const
-	{
+	void writeObjVertices(FILE *file) const {
 		for (uint32_t i = 0; i < m_positions.size(); i++)
 			fprintf(file, "v %g %g %g\n", m_positions[i].x, m_positions[i].y, m_positions[i].z);
 		if (m_flags & MeshFlags::HasNormals) {
@@ -2733,8 +2475,7 @@ public:
 			fprintf(file, "vt %g %g\n", m_texcoords[i].x, m_texcoords[i].y);
 	}
 
-	void writeObjFace(FILE *file, uint32_t face, uint32_t offset = 0) const
-	{
+	void writeObjFace(FILE *file, uint32_t face, uint32_t offset = 0) const {
 		fprintf(file, "f ");
 		for (uint32_t j = 0; j < 3; j++) {
 			const uint32_t index = m_indices[face * 3 + j] + 1 + offset; // 1-indexed
@@ -2742,8 +2483,7 @@ public:
 		}
 	}
 
-	void writeObjBoundaryEges(FILE *file) const
-	{
+	void writeObjBoundaryEges(FILE *file) const {
 		if (m_oppositeEdges.isEmpty())
 			return; // Boundaries haven't been created.
 		fprintf(file, "o boundary_edges\n");
@@ -2754,31 +2494,7 @@ public:
 		}
 	}
 
-	void writeObjLinkedBoundaries(FILE *file) const
-	{
-		if (m_oppositeEdges.isEmpty() || m_nextBoundaryEdges.isEmpty())
-			return; // Boundaries haven't been created and/or linked.
-		Array<uint32_t> boundaryLoops;
-		meshGetBoundaryLoops(*this, boundaryLoops);
-		for (uint32_t i = 0; i < boundaryLoops.size(); i++) {
-			uint32_t edge = boundaryLoops[i];
-			fprintf(file, "o boundary_%04d\n", i);
-			fprintf(file, "l");
-			for (;;) {
-				const uint32_t vertex0 = m_indices[meshEdgeIndex0(edge)];
-				const uint32_t vertex1 = m_indices[meshEdgeIndex1(edge)];
-				fprintf(file, " %d", vertex0 + 1); // 1-indexed
-				edge = m_nextBoundaryEdges[edge];
-				if (edge == boundaryLoops[i] || edge == UINT32_MAX) {
-					fprintf(file, " %d\n", vertex1 + 1); // 1-indexed
-					break;
-				}
-			}
-		}
-	}
-
-	void writeObjFile(const char *filename) const
-	{
+	void writeObjFile(const char *filename) const {
 		FILE *file;
 		XA_FOPEN(file, filename, "w");
 		if (!file)
@@ -2789,13 +2505,11 @@ public:
 		for (uint32_t i = 0; i < faceCount(); i++)
 			writeObjFace(file, i);
 		writeObjBoundaryEges(file);
-		writeObjLinkedBoundaries(file);
 		fclose(file);
 	}
 #endif
 
-	float computeSurfaceArea() const
-	{
+	float computeSurfaceArea() const {
 		float area = 0;
 		for (uint32_t f = 0; f < faceCount(); f++)
 			area += computeFaceArea(f);
@@ -2804,24 +2518,21 @@ public:
 	}
 
 	// Returned value is always positive, even if some triangles are flipped.
-	float computeParametricArea() const
-	{
+	float computeParametricArea() const {
 		float area = 0;
 		for (uint32_t f = 0; f < faceCount(); f++)
 			area += fabsf(computeFaceParametricArea(f)); // May be negative, depends on texcoord winding.
-		return area; 
+		return area;
 	}
 
-	float computeFaceArea(uint32_t face) const
-	{
+	float computeFaceArea(uint32_t face) const {
 		const Vector3 &p0 = m_positions[m_indices[face * 3 + 0]];
 		const Vector3 &p1 = m_positions[m_indices[face * 3 + 1]];
 		const Vector3 &p2 = m_positions[m_indices[face * 3 + 2]];
 		return length(cross(p1 - p0, p2 - p0)) * 0.5f;
 	}
 
-	Vector3 computeFaceCentroid(uint32_t face) const
-	{
+	Vector3 computeFaceCentroid(uint32_t face) const {
 		Vector3 sum(0.0f);
 		for (uint32_t i = 0; i < 3; i++)
 			sum += m_positions[m_indices[face * 3 + i]];
@@ -2830,8 +2541,7 @@ public:
 
 	// Average of the edge midpoints weighted by the edge length.
 	// I want a point inside the triangle, but closer to the cirumcenter.
-	Vector3 computeFaceCenter(uint32_t face) const
-	{
+	Vector3 computeFaceCenter(uint32_t face) const {
 		const Vector3 &p0 = m_positions[m_indices[face * 3 + 0]];
 		const Vector3 &p1 = m_positions[m_indices[face * 3 + 1]];
 		const Vector3 &p2 = m_positions[m_indices[face * 3 + 2]];
@@ -2844,28 +2554,25 @@ public:
 		return m0 + m1 + m2;
 	}
 
-	Vector3 computeFaceNormal(uint32_t face) const
-	{
+	Vector3 computeFaceNormal(uint32_t face) const {
 		const Vector3 &p0 = m_positions[m_indices[face * 3 + 0]];
 		const Vector3 &p1 = m_positions[m_indices[face * 3 + 1]];
 		const Vector3 &p2 = m_positions[m_indices[face * 3 + 2]];
 		const Vector3 e0 = p2 - p0;
 		const Vector3 e1 = p1 - p0;
 		const Vector3 normalAreaScaled = cross(e0, e1);
-		return normalizeSafe(normalAreaScaled, Vector3(0, 0, 1), 0.0f);
+		return normalizeSafe(normalAreaScaled, Vector3(0, 0, 1));
 	}
 
-	float computeFaceParametricArea(uint32_t face) const
-	{
+	float computeFaceParametricArea(uint32_t face) const {
 		const Vector2 &t0 = m_texcoords[m_indices[face * 3 + 0]];
 		const Vector2 &t1 = m_texcoords[m_indices[face * 3 + 1]];
 		const Vector2 &t2 = m_texcoords[m_indices[face * 3 + 2]];
 		return triangleArea(t0, t1, t2);
 	}
-	
+
 	// @@ This is not exactly accurate, we should compare the texture coordinates...
-	bool isSeam(uint32_t edge) const
-	{
+	bool isSeam(uint32_t edge) const {
 		const uint32_t oppositeEdge = m_oppositeEdges[edge];
 		if (oppositeEdge == UINT32_MAX)
 			return false; // boundary edge
@@ -2876,8 +2583,7 @@ public:
 		return m_indices[e0] != m_indices[oe1] || m_indices[e1] != m_indices[oe0];
 	}
 
-	bool isTextureSeam(uint32_t edge) const
-	{
+	bool isTextureSeam(uint32_t edge) const {
 		const uint32_t oppositeEdge = m_oppositeEdges[edge];
 		if (oppositeEdge == UINT32_MAX)
 			return false; // boundary edge
@@ -2888,26 +2594,9 @@ public:
 		return m_texcoords[m_indices[e0]] != m_texcoords[m_indices[oe1]] || m_texcoords[m_indices[e1]] != m_texcoords[m_indices[oe0]];
 	}
 
-	uint32_t firstColocal(uint32_t vertex) const
-	{
-		for (ColocalVertexIterator it(this, vertex); !it.isDone(); it.advance()) {
-			if (it.vertex() < vertex)
-				vertex = it.vertex();
-		}
-		return vertex;
-	}
-
-	bool areColocal(uint32_t vertex0, uint32_t vertex1) const
-	{
-		if (vertex0 == vertex1)
-			return true;
-		if (m_nextColocalVertex.isEmpty())
-			return false;
-		for (ColocalVertexIterator it(this, vertex0); !it.isDone(); it.advance()) {
-			if (it.vertex() == vertex1)
-				return true;
-		}
-		return false;
+	uint32_t firstColocalVertex(uint32_t vertex) const { // Table lookup into m_firstColocalVertex.
+		XA_DEBUG_ASSERT(m_firstColocalVertex.size() == m_positions.size()); // The table must have one entry per vertex (it is resized in createColocalsBVH/createColocalsHash).
+		return m_firstColocalVertex[vertex];
 	}
 
 	XA_INLINE float epsilon() const { return m_epsilon; }
@@ -2919,23 +2608,28 @@ public:
 	XA_INLINE uint32_t vertexCount() const { return m_positions.size(); }
 	XA_INLINE uint32_t vertexAt(uint32_t i) const { return m_indices[i]; }
 	XA_INLINE const Vector3 &position(uint32_t vertex) const { return m_positions[vertex]; }
-	XA_INLINE const Vector3 *positions() const { return m_positions.data(); }
-	XA_INLINE const Vector3 &normal(uint32_t vertex) const { XA_DEBUG_ASSERT(m_flags & MeshFlags::HasNormals); return m_normals[vertex]; }
+	XA_INLINE ConstArrayView<Vector3> positions() const { return m_positions; }
+	XA_INLINE const Vector3 &normal(uint32_t vertex) const {
+		XA_DEBUG_ASSERT(m_flags & MeshFlags::HasNormals);
+		return m_normals[vertex];
+	}
 	XA_INLINE const Vector2 &texcoord(uint32_t vertex) const { return m_texcoords[vertex]; }
 	XA_INLINE Vector2 &texcoord(uint32_t vertex) { return m_texcoords[vertex]; }
-	XA_INLINE const Vector2 *texcoords() const { return m_texcoords.data(); }
-	XA_INLINE Vector2 *texcoords() { return m_texcoords.data(); }
+	XA_INLINE const ConstArrayView<Vector2> texcoords() const { return m_texcoords; }
+	XA_INLINE ArrayView<Vector2> texcoords() { return m_texcoords; }
 	XA_INLINE uint32_t faceCount() const { return m_indices.size() / 3; }
-	XA_INLINE const uint32_t *indices() const { return m_indices.data(); }
+	XA_INLINE ConstArrayView<uint32_t> indices() const { return m_indices; }
 	XA_INLINE uint32_t indexCount() const { return m_indices.size(); }
 	XA_INLINE bool isFaceIgnored(uint32_t face) const { return (m_flags & MeshFlags::HasIgnoredFaces) && m_faceIgnore[face]; }
+	XA_INLINE uint32_t faceMaterial(uint32_t face) const { return (m_flags & MeshFlags::HasMaterials) ? m_faceMaterials[face] : UINT32_MAX; } // UINT32_MAX when the mesh was created without HasMaterials.
+	XA_INLINE const HashMap<EdgeKey, EdgeHash> &edgeMap() const { return m_edgeMap; }
 
 private:
-
 	float m_epsilon;
 	uint32_t m_flags;
 	uint32_t m_id;
 	Array<bool> m_faceIgnore;
+	Array<uint32_t> m_faceMaterials;
 	Array<uint32_t> m_indices;
 	Array<Vector3> m_positions;
 	Array<Vector3> m_normals;
@@ -2943,205 +2637,31 @@ private:
 
 	// Populated by createColocals
 	Array<uint32_t> m_nextColocalVertex; // In: vertex index. Out: the vertex index of the next colocal position.
+	Array<uint32_t> m_firstColocalVertex;
 
 	// Populated by createBoundaries
 	BitArray m_isBoundaryVertex;
 	Array<uint32_t> m_boundaryEdges;
 	Array<uint32_t> m_oppositeEdges; // In: edge index. Out: the index of the opposite edge (i.e. wound the opposite direction). UINT32_MAX if the input edge is a boundary edge.
 
-	// Populated by linkBoundaries
-	Array<uint32_t> m_nextBoundaryEdges; // The index of the next boundary edge. UINT32_MAX if the edge is not a boundary edge.
-
-	struct EdgeKey
-	{
-		EdgeKey(const EdgeKey &k) : v0(k.v0), v1(k.v1) {}
-		EdgeKey(uint32_t v0, uint32_t v1) : v0(v0), v1(v1) {}
-		bool operator==(const EdgeKey &k) const { return v0 == k.v0 && v1 == k.v1; }
-
-		uint32_t v0;
-		uint32_t v1;
-	};
-
-	struct EdgeHash
-	{
-		uint32_t operator()(const EdgeKey &k) const { return k.v0 * 32768u + k.v1; }
-	};
-
 	HashMap<EdgeKey, EdgeHash> m_edgeMap;
 
 public:
-	class BoundaryLoopEdgeIterator
-	{
+	class FaceEdgeIterator {
 	public:
-		BoundaryLoopEdgeIterator(const Mesh *mesh, uint32_t edge) : m_mesh(mesh), m_first(UINT32_MAX), m_current(edge) {}
-
-		void advance()
-		{
-			if (m_first == UINT32_MAX)
-				m_first = m_current;
-			m_current = m_mesh->m_nextBoundaryEdges[m_current];
-		}
-
-		bool isDone() const
-		{
-			return m_first == m_current || m_current == UINT32_MAX;
-		}
-
-		uint32_t edge() const
-		{
-			return m_current;
-		}
-
-		uint32_t nextEdge() const
-		{
-			return m_mesh->m_nextBoundaryEdges[m_current];
-		}
-
-	private:
-		const Mesh *m_mesh;
-		uint32_t m_first;
-		uint32_t m_current;
-	};
-
-	class ColocalVertexIterator
-	{
-	public:
-		ColocalVertexIterator(const Mesh *mesh, uint32_t v) : m_mesh(mesh), m_first(UINT32_MAX), m_current(v) {}
-
-		void advance()
-		{
-			if (m_first == UINT32_MAX)
-				m_first = m_current;
-			if (!m_mesh->m_nextColocalVertex.isEmpty())
-				m_current = m_mesh->m_nextColocalVertex[m_current];
-		}
-
-		bool isDone() const
-		{
-			return m_first == m_current;
-		}
-
-		uint32_t vertex() const
-		{
-			return m_current;
-		}
-
-		const Vector3 *pos() const
-		{
-			return &m_mesh->m_positions[m_current];
-		}
-
-	private:
-		const Mesh *m_mesh;
-		uint32_t m_first;
-		uint32_t m_current;
-	};
-
-	class ColocalEdgeIterator
-	{
-	public:
-		ColocalEdgeIterator(const Mesh *mesh, uint32_t vertex0, uint32_t vertex1) : m_mesh(mesh), m_vertex0It(mesh, vertex0), m_vertex1It(mesh, vertex1), m_vertex1(vertex1)
-		{
-			do {
-				if (!resetElement()) {
-					advanceVertex1();
-				}
-				else {
-					break;
-				}
-			} while (!isDone());
-		}
-
-		void advance()
-		{
-			advanceElement();
-		}
-
-		bool isDone() const
-		{
-			return m_vertex0It.isDone() && m_vertex1It.isDone() && m_edge == UINT32_MAX;
-		}
-
-		uint32_t edge() const
-		{
-			return m_edge;
-		}
-
-	private:
-		bool resetElement()
-		{
-			m_edge = m_mesh->m_edgeMap.get(Mesh::EdgeKey(m_vertex0It.vertex(), m_vertex1It.vertex()));
-			while (m_edge != UINT32_MAX) {
-				if (!isIgnoredFace())
-					break;
-				m_edge = m_mesh->m_edgeMap.getNext(m_edge);
-			}
-			if (m_edge == UINT32_MAX) {
-				return false;
-			}
-			return true;
-		}
-
-		void advanceElement()
-		{
-			for (;;) {
-				m_edge = m_mesh->m_edgeMap.getNext(m_edge);
-				if (m_edge == UINT32_MAX)
-					break;
-				if (!isIgnoredFace())
-					break;
-			}
-			if (m_edge == UINT32_MAX)
-				advanceVertex1();
-		}
-
-		void advanceVertex1()
-		{
-			auto successful = false;
-			while (!successful)	{
-				m_vertex1It.advance();
-				if (m_vertex1It.isDone()) {
-					if (!m_vertex0It.isDone()) {
-						m_vertex0It.advance();
-						m_vertex1It = ColocalVertexIterator(m_mesh, m_vertex1);
-					}
-					else {
-						return;
-					}
-				}
-				successful = resetElement();
-			}
-		}
-
-		bool isIgnoredFace() const
-		{
-			return m_mesh->m_faceIgnore[meshEdgeFace(m_edge)];
-		}
-
-		const Mesh *m_mesh;
-		ColocalVertexIterator m_vertex0It, m_vertex1It;
-		const uint32_t m_vertex1;
-		uint32_t m_edge;
-	};
-
-	class FaceEdgeIterator 
-	{
-	public:
-		FaceEdgeIterator (const Mesh *mesh, uint32_t face) : m_mesh(mesh), m_face(face), m_relativeEdge(0)
-		{
+		FaceEdgeIterator(const Mesh *mesh, uint32_t face) :
+				m_mesh(mesh), m_face(face), m_relativeEdge(0) {
 			m_edge = m_face * 3;
 		}
 
-		void advance()
-		{
+		void advance() {
 			if (m_relativeEdge < 3) {
 				m_edge++;
 				m_relativeEdge++;
 			}
 		}
 
-		bool isDone() const
-		{
+		bool isDone() const {
 			return m_relativeEdge == 3;
 		}
 
@@ -3152,9 +2672,8 @@ public:
 		uint32_t relativeEdge() const { return m_relativeEdge; }
 		uint32_t face() const { return m_face; }
 		uint32_t oppositeEdge() const { return m_mesh->m_oppositeEdges[m_edge]; }
-		
-		uint32_t oppositeFace() const
-		{
+
+		uint32_t oppositeFace() const {
 			const uint32_t oedge = m_mesh->m_oppositeEdges[m_edge];
 			if (oedge == UINT32_MAX)
 				return UINT32_MAX;
@@ -3178,19 +2697,18 @@ public:
 	};
 };
 
-struct MeshFaceGroups
-{
+struct MeshFaceGroups {
 	typedef uint32_t Handle;
 	static constexpr Handle kInvalid = UINT32_MAX;
 
-	MeshFaceGroups(const Mesh *mesh) : m_mesh(mesh), m_groups(MemTag::Mesh), m_firstFace(MemTag::Mesh), m_nextFace(MemTag::Mesh), m_faceCount(MemTag::Mesh) {}
+	MeshFaceGroups(const Mesh *mesh) :
+			m_mesh(mesh), m_groups(MemTag::Mesh), m_firstFace(MemTag::Mesh), m_nextFace(MemTag::Mesh), m_faceCount(MemTag::Mesh) {}
 	XA_INLINE Handle groupAt(uint32_t face) const { return m_groups[face]; }
 	XA_INLINE uint32_t groupCount() const { return m_faceCount.size(); }
 	XA_INLINE uint32_t nextFace(uint32_t face) const { return m_nextFace[face]; }
 	XA_INLINE uint32_t faceCount(uint32_t group) const { return m_faceCount[group]; }
 
-	void compute()
-	{
+	void compute() {
 		m_groups.resize(m_mesh->faceCount());
 		m_groups.fillBytes(0xff); // Set all faces to kInvalid
 		uint32_t firstUnassignedFace = 0;
@@ -3222,57 +2740,25 @@ struct MeshFaceGroups
 					break;
 				const uint32_t f = growFaces.back();
 				growFaces.pop_back();
+				const uint32_t material = m_mesh->faceMaterial(f);
 				for (Mesh::FaceEdgeIterator edgeIt(m_mesh, f); !edgeIt.isDone(); edgeIt.advance()) {
-					// Iterate opposite edges. There may be more than one - non-manifold geometry can have duplicate edges.
-					// Prioritize the one with exact vertex match, not just colocal.
-					// If *any* of the opposite edges are already assigned to this group, don't do anything.
-					bool alreadyAssignedToThisGroup = false;
-					uint32_t bestConnectedFace = UINT32_MAX;
-					for (Mesh::ColocalEdgeIterator oppositeEdgeIt(m_mesh, edgeIt.vertex1(), edgeIt.vertex0()); !oppositeEdgeIt.isDone(); oppositeEdgeIt.advance()) {
-						const uint32_t oppositeEdge = oppositeEdgeIt.edge();
-						const uint32_t oppositeFace = meshEdgeFace(oppositeEdge);
-#if 0
-						// Reject opposite face if dihedral angle >= 90 degrees.
-						{
-							Vector3 a = m_mesh->computeFaceNormal(f);
-							Vector3 b = m_mesh->computeFaceNormal(oppositeFace);
-							if (dot(a, b) <= 0.0f)
-								continue;
-						}
-#endif
-						if (m_mesh->isFaceIgnored(oppositeFace))
-							continue; // Don't add ignored faces to group.
-						if (m_groups[oppositeFace] == group) {
-							alreadyAssignedToThisGroup = true;
-							break;
-						}
-						if (m_groups[oppositeFace] != kInvalid)
-							continue; // Connected face is already assigned to another group.
-						if (faceDuplicatesGroupEdge(group, oppositeFace))
-							continue; // Don't want duplicate edges in a group.
-						const uint32_t oppositeVertex0 = m_mesh->vertexAt(meshEdgeIndex0(oppositeEdge));
-						const uint32_t oppositeVertex1 = m_mesh->vertexAt(meshEdgeIndex1(oppositeEdge));
-						if (bestConnectedFace == UINT32_MAX || (oppositeVertex0 == edgeIt.vertex1() && oppositeVertex1 == edgeIt.vertex0()))
-							bestConnectedFace = oppositeFace;
-#if 0
-						else {
-							// Choose the opposite face with the smallest dihedral angle.
-							const float d1 = 1.0f - dot(computeFaceNormal(f), computeFaceNormal(bestConnectedFace));
-							const float d2 = 1.0f - dot(computeFaceNormal(f), computeFaceNormal(oppositeFace));
-							if (d2 < d1)
-								bestConnectedFace = oppositeFace;
-						}
-#endif
-					}
-					if (!alreadyAssignedToThisGroup && bestConnectedFace != UINT32_MAX) {
-						m_groups[bestConnectedFace] = group;
-						m_nextFace[bestConnectedFace] = UINT32_MAX;
-						if (prevFace != UINT32_MAX)
-							m_nextFace[prevFace] = bestConnectedFace;
-						prevFace = bestConnectedFace;
-						groupFaceCount++;
-						growFaces.push_back(bestConnectedFace);
-					}
+					const uint32_t oppositeEdge = m_mesh->findEdge(edgeIt.vertex1(), edgeIt.vertex0()); // Look up the same edge with its vertices swapped.
+					if (oppositeEdge == UINT32_MAX)
+						continue; // No opposite edge was found, so treat this as a boundary edge.
+					const uint32_t oppositeFace = meshEdgeFace(oppositeEdge);
+					if (m_mesh->isFaceIgnored(oppositeFace))
+						continue; // Don't add ignored faces to group.
+					if (m_mesh->faceMaterial(oppositeFace) != material)
+						continue; // Different material: groups never span materials.
+					if (m_groups[oppositeFace] != kInvalid)
+						continue; // Connected face is already assigned to a group (this one or another).
+					m_groups[oppositeFace] = group;
+					m_nextFace[oppositeFace] = UINT32_MAX; // This face is now the end of the group's face list.
+					if (prevFace != UINT32_MAX)
+						m_nextFace[prevFace] = oppositeFace;
+					prevFace = oppositeFace;
+					groupFaceCount++;
+					growFaces.push_back(oppositeFace); // Queue the face so the group keeps growing from it.
 				}
 			}
 			m_faceCount.push_back(groupFaceCount);
@@ -3281,27 +2767,23 @@ struct MeshFaceGroups
 		}
 	}
 
-	class Iterator
-	{
+	class Iterator {
 	public:
-		Iterator(const MeshFaceGroups *meshFaceGroups, Handle group) : m_meshFaceGroups(meshFaceGroups)
-		{
+		Iterator(const MeshFaceGroups *meshFaceGroups, Handle group) :
+				m_meshFaceGroups(meshFaceGroups) {
 			XA_DEBUG_ASSERT(group != kInvalid);
 			m_current = m_meshFaceGroups->m_firstFace[group];
 		}
 
-		void advance()
-		{
+		void advance() {
 			m_current = m_meshFaceGroups->m_nextFace[m_current];
 		}
 
-		bool isDone() const
-		{
+		bool isDone() const {
 			return m_current == UINT32_MAX;
 		}
 
-		uint32_t face() const
-		{
+		uint32_t face() const {
 			return m_current;
 		}
 
@@ -3311,18 +2793,6 @@ struct MeshFaceGroups
 	};
 
 private:
-	// Check if the face duplicates any edges of any face already in the group.
-	bool faceDuplicatesGroupEdge(Handle group, uint32_t face) const
-	{
-		for (Mesh::FaceEdgeIterator edgeIt(m_mesh, face); !edgeIt.isDone(); edgeIt.advance()) {
-			for (Mesh::ColocalEdgeIterator colocalEdgeIt(m_mesh, edgeIt.vertex0(), edgeIt.vertex1()); !colocalEdgeIt.isDone(); colocalEdgeIt.advance()) {
-				if (m_groups[meshEdgeFace(colocalEdgeIt.edge())] == group)
-					return true;
-			}
-		}
-		return false;
-	}
-
 	const Mesh *m_mesh;
 	Array<Handle> m_groups;
 	Array<uint32_t> m_firstFace;
@@ -3332,243 +2802,27 @@ private:
 
 constexpr MeshFaceGroups::Handle MeshFaceGroups::kInvalid;
 
-static bool meshCloseHole(Mesh *mesh, const Array<uint32_t> &holeVertices, const Vector3 &normal)
-{
-#if XA_CLOSE_HOLES_CHECK_EDGE_INTERSECTION
-	const uint32_t faceCount = mesh->faceCount();
-#endif
-	const bool compareNormal = equal(normal, Vector3(0.0f), FLT_EPSILON);
-	uint32_t frontCount = holeVertices.size();
-	Array<uint32_t> frontVertices;
-	Array<Vector3> frontPoints;
-	Array<float> frontAngles;
-	frontVertices.resize(frontCount);
-	frontPoints.resize(frontCount);
-	for (uint32_t i = 0; i < frontCount; i++) {
-		frontVertices[i] = holeVertices[i];
-		frontPoints[i] = mesh->position(frontVertices[i]);
-	}
-	while (frontCount >= 3) {
-		frontAngles.resize(frontCount);
-		float smallestAngle = kPi2, smallestAngleIgnoringNormal = kPi2;
-		uint32_t smallestAngleIndex = UINT32_MAX, smallestAngleIndexIgnoringNormal = UINT32_MAX;
-		for (uint32_t i = 0; i < frontCount; i++) {
-			const uint32_t i1 = i == 0 ? frontCount - 1 : i - 1;
-			const uint32_t i2 = i;
-			const uint32_t i3 = (i + 1) % frontCount;
-			const Vector3 edge1 = frontPoints[i1] - frontPoints[i2];
-			const Vector3 edge2 = frontPoints[i3] - frontPoints[i2];
-			frontAngles[i] = atan2f(length(cross(edge1, edge2)), dot(edge1, edge2));
-			if (frontAngles[i] >= smallestAngle || isNan(frontAngles[i]))
-				continue;
-			// Don't duplicate edges.
-			if (mesh->findEdge(frontVertices[i1], frontVertices[i2]) != UINT32_MAX)
-				continue;
-			if (mesh->findEdge(frontVertices[i2], frontVertices[i3]) != UINT32_MAX)
-				continue;
-			if (mesh->findEdge(frontVertices[i3], frontVertices[i1]) != UINT32_MAX)
-				continue;
-			/*
-			Make sure he new edge that would be formed by (i3, i1) doesn't intersect any vertices. This often happens when fixing t-junctions.
-
-			       i2
-			       *
-			      / \
-			     /   \
-			 i1 *--*--* i3
-			     \ | /
-				  \|/
-				   *
-			*/
-			bool intersection = false;
-			for (uint32_t j = 0; j < frontCount; j++) {
-				if (j == i1 || j == i2 || j == i3)
-					continue;
-				if (lineIntersectsPoint(frontPoints[j], frontPoints[i3], frontPoints[i1], nullptr, mesh->epsilon())) {
-					intersection = true;
-					break;
-				}
-			}
-			if (intersection)
-				continue;
-			// Don't add the triangle if a boundary point lies on the same plane as the triangle, and is inside it.
-			intersection = false;
-			const Plane plane(frontPoints[i1], frontPoints[i2], frontPoints[i3]);
-			for (uint32_t j = 0; j < frontCount; j++) {
-				if (j == i1 || j == i2 || j == i3)
-					continue;
-				if (!isZero(plane.distance(frontPoints[j]), mesh->epsilon()))
-					continue;
-				if (pointInTriangle(frontPoints[j], frontPoints[i1], frontPoints[i2], frontPoints[i3])) {
-					intersection = true;
-					break;
-				}
-			}
-			if (intersection)
-				continue;
-#if XA_CLOSE_HOLES_CHECK_EDGE_INTERSECTION
-			// Don't add the triangle if the new edge (i3, i1), intersects any other triangle that isn't part of the filled hole.
-			intersection = false;
-			const Vector3 newEdgeVector = frontPoints[i1] - frontPoints[i3];
-			for (uint32_t f = 0; f < faceCount; f++) {
-				Vector3 tri[3];
-				for (uint32_t j = 0; j < 3; j++)
-					tri[j] = mesh->position(mesh->vertexAt(f * 3 + j));
-				float t;
-				if (rayIntersectsTriangle(frontPoints[i3], newEdgeVector, tri, &t)) {
-					intersection = true;
-					break;
-				}
-			}
-			if (intersection)
-				continue;
-#endif
-			// Skip backwards facing triangles.
-			if (compareNormal) {
-				if (frontAngles[i] < smallestAngleIgnoringNormal) {
-					smallestAngleIgnoringNormal = frontAngles[i];
-					smallestAngleIndexIgnoringNormal = i;
-				}
-				const Vector3 e0 = frontPoints[i3] - frontPoints[i1];
-				const Vector3 e1 = frontPoints[i2] - frontPoints[i1];
-				const Vector3 triNormal = normalizeSafe(cross(e0, e1), Vector3(0.0f), mesh->epsilon());
-				if (dot(normal, triNormal) <= 0.0f)
-					continue;
-			}
-			smallestAngle = smallestAngleIgnoringNormal = frontAngles[i];
-			smallestAngleIndex = smallestAngleIndexIgnoringNormal = i;
-		}
-		// Closing holes failed if we don't have a smallest angle.
-		// Fallback to ignoring the backwards facing normal test if possible.
-		if (smallestAngleIndex == UINT32_MAX || smallestAngle <= 0.0f || smallestAngle >= kPi) {
-			if (smallestAngleIgnoringNormal == UINT32_MAX || smallestAngleIgnoringNormal <= 0.0f || smallestAngleIgnoringNormal >= kPi)
-				return false;
-			else
-				smallestAngleIndex = smallestAngleIndexIgnoringNormal;
-		}
-		const uint32_t i1 = smallestAngleIndex == 0 ? frontCount - 1 : smallestAngleIndex - 1;
-		const uint32_t i2 = smallestAngleIndex;
-		const uint32_t i3 = (smallestAngleIndex + 1) % frontCount;
-		const Mesh::AddFaceResult::Enum result = mesh->addFace(frontVertices[i1], frontVertices[i2], frontVertices[i3]);
-		XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK); // Shouldn't happen due to the findEdge calls above.
-		XA_UNUSED(result);
-		frontVertices.removeAt(i2);
-		frontPoints.removeAt(i2);
-		frontCount = frontVertices.size();
-	}
-	return true;
-}
-
-static bool meshCloseHoles(Mesh *mesh, const Array<uint32_t> &boundaryLoops, const Vector3 &normal, uint32_t *holeCount, Array<uint32_t> *holeFaceCounts)
-{
-	if (holeFaceCounts)
-		holeFaceCounts->clear();
-	// Compute lengths.
-	const uint32_t boundaryCount = boundaryLoops.size();
-	Array<float> boundaryLengths;
-	Array<uint32_t> boundaryEdgeCounts;
-	boundaryEdgeCounts.resize(boundaryCount);
-	for (uint32_t i = 0; i < boundaryCount; i++) {
-		float boundaryLength = 0.0f;
-		boundaryEdgeCounts[i] = 0;
-		for (Mesh::BoundaryLoopEdgeIterator it(mesh, boundaryLoops[i]); !it.isDone(); it.advance()) {
-			const Vector3 &t0 = mesh->position(mesh->vertexAt(meshEdgeIndex0(it.edge())));
-			const Vector3 &t1 = mesh->position(mesh->vertexAt(meshEdgeIndex1(it.edge())));
-			boundaryLength += length(t1 - t0);
-			boundaryEdgeCounts[i]++;
-		}
-		boundaryLengths.push_back(boundaryLength);
-	}
-	// Find disk boundary.
-	uint32_t diskBoundary = 0;
-	float maxLength = boundaryLengths[0];
-	for (uint32_t i = 1; i < boundaryCount; i++) {
-		if (boundaryLengths[i] > maxLength) {
-			maxLength = boundaryLengths[i];
-			diskBoundary = i;
-		}
-	}
-	// Close holes.
-	Array<uint32_t> holeVertices;
-	Array<Vector3> holePoints;
-	bool result = true;
-	for (uint32_t i = 0; i < boundaryCount; i++) {
-		if (diskBoundary == i)
-			continue; // Skip disk boundary.
-		holeVertices.resize(boundaryEdgeCounts[i]);
-		holePoints.resize(boundaryEdgeCounts[i]);
-		// Winding is backwards for internal boundaries.
-		uint32_t e = 0;
-		for (Mesh::BoundaryLoopEdgeIterator it(mesh, boundaryLoops[i]); !it.isDone(); it.advance()) {
-			const uint32_t vertex = mesh->vertexAt(meshEdgeIndex0(it.edge()));
-			holeVertices[boundaryEdgeCounts[i] - 1 - e] = vertex;
-			holePoints[boundaryEdgeCounts[i] - 1 - e] = mesh->position(vertex);
-			e++;
-		}
-		const uint32_t oldFaceCount = mesh->faceCount();
-		if (!meshCloseHole(mesh, holeVertices, normal))
-			result = false; // Return false if any hole failed to close, but keep trying to close other holes.
-		if (holeCount)
-			(*holeCount)++;
-		if (holeFaceCounts)
-			holeFaceCounts->push_back(mesh->faceCount() - oldFaceCount);
-	}
-	return result;
-}
-
-static bool meshIsPlanar(const Mesh &mesh)
-{
-	const Vector3 p1 = mesh.position(mesh.vertexAt(0));
-	const Vector3 p2 = mesh.position(mesh.vertexAt(1));
-	const Vector3 p3 = mesh.position(mesh.vertexAt(2));
-	const Plane plane(p1, p2, p3);
-	const uint32_t vertexCount = mesh.vertexCount();
-	for (uint32_t v = 0; v < vertexCount; v++) {
-		const float d = plane.distance(mesh.position(v));
-		if (!isZero(d, mesh.epsilon()))
-			return false;
-	}
-	return true;
-}
-
-/*
-Fixing T-junctions.
-
-- Find T-junctions. Find  vertices that are on an edge.
-- This test is approximate.
-- Insert edges on a spatial index to speedup queries.
-- Consider only open edges, that is edges that have no pairs.
-- Consider only vertices on boundaries.
-- Close T-junction.
-- Split edge.
-
-*/
-struct SplitEdge
-{
-	uint32_t edge;
-	float t;
-	uint32_t vertex;
-
-	bool operator<(const SplitEdge &other) const
-	{
-		if (edge < other.edge)
-			return true;
-		else if (edge == other.edge) {
-			if (t < other.t)
-				return true;
-		}
+#if XA_CHECK_T_JUNCTIONS
+static bool lineIntersectsPoint(const Vector3 &point, const Vector3 &lineStart, const Vector3 &lineEnd, float *t, float epsilon) {
+	float tt;
+	if (!t)
+		t = &tt;
+	*t = 0.0f;
+	if (equal(lineStart, point, epsilon) || equal(lineEnd, point, epsilon))
+		return false; // Point coincides with one of the line's endpoints.
+	const Vector3 v01 = point - lineStart;
+	const Vector3 v21 = lineEnd - lineStart;
+	const float l = length(v21);
+	const float d = length(cross(v01, v21)) / l;
+	if (!isZero(d, epsilon))
 		return false;
-	}
-};
+	*t = dot(v01, v21) / (l * l);
+	return *t > kEpsilon && *t < 1.0f - kEpsilon;
+}
 
-// Returns nullptr if there were no t-junctions to fix.
-static Mesh *meshFixTJunctions(const Mesh &inputMesh, bool *duplicatedEdge, bool *failed, uint32_t *fixedTJunctionsCount)
-{
-	if (duplicatedEdge)
-		*duplicatedEdge = false;
-	if (failed)
-		*failed = false;
-	Array<SplitEdge> splitEdges;
+// Returns the number of T-junctions found.
+static int meshCheckTJunctions(const Mesh &inputMesh) {
+	int count = 0;
 	const uint32_t vertexCount = inputMesh.vertexCount();
 	const uint32_t edgeCount = inputMesh.edgeCount();
 	for (uint32_t v = 0; v < vertexCount; v++) {
@@ -3582,155 +2836,130 @@ static Mesh *meshFixTJunctions(const Mesh &inputMesh, bool *duplicatedEdge, bool
 			const Vector3 &edgePos1 = inputMesh.position(inputMesh.vertexAt(meshEdgeIndex0(e)));
 			const Vector3 &edgePos2 = inputMesh.position(inputMesh.vertexAt(meshEdgeIndex1(e)));
 			float t;
-			if (!lineIntersectsPoint(pos, edgePos1, edgePos2, &t, inputMesh.epsilon()))
-				continue;
-			SplitEdge splitEdge;
-			splitEdge.edge = e;
-			splitEdge.t = t;
-			splitEdge.vertex = v;
-			splitEdges.push_back(splitEdge);
+			if (lineIntersectsPoint(pos, edgePos1, edgePos2, &t, inputMesh.epsilon()))
+				count++;
 		}
 	}
-	if (splitEdges.isEmpty())
-		return nullptr;
-	const uint32_t faceCount = inputMesh.faceCount();
-	Mesh *mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, inputMesh.epsilon(), vertexCount + splitEdges.size(), faceCount);
-	for (uint32_t v = 0; v < vertexCount; v++)
-		mesh->addVertex(inputMesh.position(v));
-	Array<uint32_t> indexArray;
-	indexArray.reserve(4);
-	Array<SplitEdge> faceSplitEdges;
-	faceSplitEdges.reserve(4);
-	for (uint32_t f = 0; f < faceCount; f++) {
-		// Find t-junctions in this face.
-		faceSplitEdges.clear();
-		for (uint32_t i = 0; i < splitEdges.size(); i++) {
-			if (meshEdgeFace(splitEdges[i].edge) == f)
-				faceSplitEdges.push_back(splitEdges[i]);
-		}
-		if (!faceSplitEdges.isEmpty()) {
-			// Need to split edges in winding order when a single edge has multiple t-junctions.
-			insertionSort(faceSplitEdges.data(), faceSplitEdges.size());
-			indexArray.clear();
-			for (Mesh::FaceEdgeIterator it(&inputMesh, f); !it.isDone(); it.advance()) {
-				indexArray.push_back(it.vertex0());
-				for (uint32_t se = 0; se < faceSplitEdges.size(); se++) {
-					const SplitEdge &splitEdge = faceSplitEdges[se];
-					if (splitEdge.edge == it.edge())
-						indexArray.push_back(splitEdge.vertex);
+	return count;
+}
+#endif
+
+// References invalid faces and vertices in a mesh.
+struct InvalidMeshGeometry {
+	// If meshFaceGroups is not null, invalid faces have the face group MeshFaceGroups::kInvalid.
+	// If meshFaceGroups is null, invalid faces are Mesh::isFaceIgnored.
+	void extract(const Mesh *mesh, const MeshFaceGroups *meshFaceGroups) {
+		// Copy invalid faces.
+		m_faces.clear();
+		const uint32_t meshFaceCount = mesh->faceCount();
+		for (uint32_t f = 0; f < meshFaceCount; f++) {
+			if ((meshFaceGroups && meshFaceGroups->groupAt(f) == MeshFaceGroups::kInvalid) || (!meshFaceGroups && mesh->isFaceIgnored(f)))
+				m_faces.push_back(f);
+		}
+		// Create *unique* list of vertices of invalid faces.
+		const uint32_t faceCount = m_faces.size();
+		m_indices.resize(faceCount * 3);
+		const uint32_t approxVertexCount = min(faceCount * 3, mesh->vertexCount());
+		m_vertexToSourceVertexMap.clear();
+		m_vertexToSourceVertexMap.reserve(approxVertexCount);
+		HashMap<uint32_t, PassthroughHash<uint32_t>> sourceVertexToVertexMap(MemTag::Mesh, approxVertexCount);
+		for (uint32_t f = 0; f < faceCount; f++) {
+			const uint32_t face = m_faces[f];
+			for (uint32_t i = 0; i < 3; i++) {
+				const uint32_t vertex = mesh->vertexAt(face * 3 + i);
+				uint32_t newVertex = sourceVertexToVertexMap.get(vertex);
+				if (newVertex == UINT32_MAX) {
+					newVertex = sourceVertexToVertexMap.add(vertex);
+					m_vertexToSourceVertexMap.push_back(vertex);
 				}
-			}
-			if (!meshCloseHole(mesh, indexArray, Vector3(0.0f))) {
-				if (failed)
-					*failed = true;
-			}
-		} else {
-			// No t-junctions in this face. Copy from input mesh.
-			if (mesh->addFace(&inputMesh.indices()[f * 3]) == Mesh::AddFaceResult::DuplicateEdge) {
-				if (duplicatedEdge)
-					*duplicatedEdge = true;
+				m_indices[f * 3 + i] = newVertex;
 			}
 		}
 	}
-	if (fixedTJunctionsCount)
-		*fixedTJunctionsCount = splitEdges.size();
-	return mesh;
-}
 
-// boundaryLoops are the first edges for each boundary loop.
-static void meshGetBoundaryLoops(const Mesh &mesh, Array<uint32_t> &boundaryLoops)
-{
-	const uint32_t edgeCount = mesh.edgeCount();
-	BitArray bitFlags(edgeCount);
-	bitFlags.zeroOutMemory();
-	boundaryLoops.clear();
-	// Search for boundary edges. Mark all the edges that belong to the same boundary.
-	for (uint32_t e = 0; e < edgeCount; e++) {
-		if (bitFlags.get(e) || !mesh.isBoundaryEdge(e))
-			continue;
-		for (Mesh::BoundaryLoopEdgeIterator it(&mesh, e); !it.isDone(); it.advance())
-			bitFlags.set(it.edge());
-		boundaryLoops.push_back(e);
-	}
-}
+	ConstArrayView<uint32_t> faces() const { return m_faces; }
+	ConstArrayView<uint32_t> indices() const { return m_indices; }
+	ConstArrayView<uint32_t> vertices() const { return m_vertexToSourceVertexMap; }
 
-struct Progress
-{
-	Progress(ProgressCategory::Enum category, ProgressFunc func, void *userData, uint32_t maxValue) : value(0), cancel(false), m_category(category), m_func(func), m_userData(userData), m_maxValue(maxValue), m_progress(0)
-	{
+private:
+	Array<uint32_t> m_faces, m_indices;
+	Array<uint32_t> m_vertexToSourceVertexMap; // Map face vertices to vertices of the source mesh.
+};
+
+struct Progress {
+	Progress(ProgressCategory category, ProgressFunc func, void *userData, uint32_t maxValue) :
+			cancel(false), m_category(category), m_func(func), m_userData(userData), m_value(0), m_maxValue(maxValue), m_percent(0) {
 		if (m_func) {
 			if (!m_func(category, 0, userData))
 				cancel = true;
 		}
 	}
 
-	~Progress()
-	{
+	~Progress() {
 		if (m_func) {
 			if (!m_func(m_category, 100, m_userData))
 				cancel = true;
 		}
 	}
 
-	void update()
-	{
-		if (!m_func)
-			return;
-		m_mutex.lock();
-		const uint32_t newProgress = uint32_t(ceilf(value.load() / (float)m_maxValue * 100.0f));
-		if (newProgress != m_progress && newProgress < 100) {
-			m_progress = newProgress;
-			if (!m_func(m_category, m_progress, m_userData))
-				cancel = true;
-		}
-		m_mutex.unlock();
+	void increment(uint32_t value) {
+		m_value += value;
+		update();
 	}
 
-	void setMaxValue(uint32_t maxValue)
-	{
-		m_mutex.lock();
+	void setMaxValue(uint32_t maxValue) {
 		m_maxValue = maxValue;
-		m_mutex.unlock();
+		update();
 	}
 
-	std::atomic<uint32_t> value;
 	std::atomic<bool> cancel;
 
 private:
-	ProgressCategory::Enum m_category;
+	void update() {
+		if (!m_func)
+			return;
+		const uint32_t newPercent = uint32_t(ceilf(m_value.load() / (float)m_maxValue.load() * 100.0f));
+		if (newPercent != m_percent) {
+			// Atomic max.
+			uint32_t oldPercent = m_percent;
+			while (oldPercent < newPercent && !m_percent.compare_exchange_weak(oldPercent, newPercent)) {
+			}
+			if (!m_func(m_category, m_percent, m_userData))
+				cancel = true;
+		}
+	}
+
+	ProgressCategory m_category;
 	ProgressFunc m_func;
 	void *m_userData;
-	uint32_t m_maxValue;
-	uint32_t m_progress;
-	std::mutex m_mutex;
+	std::atomic<uint32_t> m_value, m_maxValue, m_percent;
 };
 
-struct Spinlock
-{
-	void lock() { while(m_lock.test_and_set(std::memory_order_acquire)) {} }
+struct Spinlock {
+	void lock() {
+		while (m_lock.test_and_set(std::memory_order_acquire)) {
+		}
+	}
 	void unlock() { m_lock.clear(std::memory_order_release); }
 
 private:
 	std::atomic_flag m_lock = ATOMIC_FLAG_INIT;
 };
 
-struct TaskGroupHandle
-{
+struct TaskGroupHandle {
 	uint32_t value = UINT32_MAX;
 };
 
-struct Task
-{
-	void (*func)(void *userData);
-	void *userData;
+struct Task {
+	void (*func)(void *groupUserData, void *taskUserData);
+	void *userData; // Passed to func as taskUserData.
 };
 
 #if XA_MULTITHREADED
-class TaskScheduler
-{
+class TaskScheduler {
 public:
-	TaskScheduler() : m_shutdown(false)
-	{
+	TaskScheduler() :
+			m_shutdown(false) {
 		m_threadIndex = 0;
 		// Max with current task scheduler usage is 1 per thread + 1 deep nesting, but allow for some slop.
 		m_maxGroups = std::thread::hardware_concurrency() * 4;
@@ -3739,6 +2968,7 @@ public:
 			new (&m_groups[i]) TaskGroup();
 			m_groups[i].free = true;
 			m_groups[i].ref = 0;
+			m_groups[i].userData = nullptr;
 		}
 		m_workers.resize(std::thread::hardware_concurrency() <= 1 ? 1 : std::thread::hardware_concurrency() - 1);
 		for (uint32_t i = 0; i < m_workers.size(); i++) {
@@ -3748,8 +2978,7 @@ public:
 		}
 	}
 
-	~TaskScheduler()
-	{
+	~TaskScheduler() {
 		m_shutdown = true;
 		for (uint32_t i = 0; i < m_workers.size(); i++) {
 			Worker &worker = m_workers[i];
@@ -3767,13 +2996,12 @@ public:
 		XA_FREE(m_groups);
 	}
 
-	uint32_t threadCount() const
-	{
+	uint32_t threadCount() const {
 		return max(1u, std::thread::hardware_concurrency()); // Including the main thread.
 	}
 
-	TaskGroupHandle createTaskGroup(uint32_t reserveSize = 0)
-	{
+	// userData is passed to Task::func as groupUserData.
+	TaskGroupHandle createTaskGroup(void *userData = nullptr, uint32_t reserveSize = 0) {
 		// Claim the first free group.
 		for (uint32_t i = 0; i < m_maxGroups; i++) {
 			TaskGroup &group = m_groups[i];
@@ -3785,6 +3013,8 @@ public:
 			group.queue.clear();
 			group.queue.reserve(reserveSize);
 			group.queueLock.unlock();
+			group.userData = userData;
+			group.ref = 0;
 			TaskGroupHandle handle;
 			handle.value = i;
 			return handle;
@@ -3795,8 +3025,7 @@ public:
 		return handle;
 	}
 
-	void run(TaskGroupHandle handle, const Task &task)
-	{
+	void run(TaskGroupHandle handle, const Task &task) {
 		XA_DEBUG_ASSERT(handle.value != UINT32_MAX);
 		TaskGroup &group = m_groups[handle.value];
 		group.queueLock.lock();
@@ -3810,8 +3039,7 @@ public:
 		}
 	}
 
-	void wait(TaskGroupHandle *handle)
-	{
+	void wait(TaskGroupHandle *handle) {
 		if (handle->value == UINT32_MAX) {
 			XA_DEBUG_ASSERT(false);
 			return;
@@ -3826,7 +3054,7 @@ public:
 			group.queueLock.unlock();
 			if (!task)
 				break;
-			task->func(task->userData);
+			task->func(group.userData, task->userData);
 			group.ref--;
 		}
 		// Even though the task queue is empty, workers can still be running tasks.
@@ -3839,17 +3067,16 @@ public:
 	static uint32_t currentThreadIndex() { return m_threadIndex; }
 
 private:
-	struct TaskGroup
-	{
+	struct TaskGroup {
 		std::atomic<bool> free;
 		Array<Task> queue; // Items are never removed. queueHead is incremented to pop items.
 		uint32_t queueHead = 0;
 		Spinlock queueLock;
 		std::atomic<uint32_t> ref; // Increment when a task is enqueued, decrement when a task finishes.
+		void *userData;
 	};
 
-	struct Worker
-	{
+	struct Worker {
 		std::thread *thread = nullptr;
 		std::mutex mutex;
 		std::condition_variable cv;
@@ -3862,12 +3089,11 @@ private:
 	uint32_t m_maxGroups;
 	static thread_local uint32_t m_threadIndex;
 
-	static void workerThread(TaskScheduler *scheduler, Worker *worker, uint32_t threadIndex)
-	{
+	static void workerThread(TaskScheduler *scheduler, Worker *worker, uint32_t threadIndex) {
 		m_threadIndex = threadIndex;
 		std::unique_lock<std::mutex> lock(worker->mutex);
 		for (;;) {
-			worker->cv.wait(lock, [=]{ return worker->wakeup.load(); });
+			worker->cv.wait(lock, [=] { return worker->wakeup.load(); });
 			worker->wakeup = false;
 			for (;;) {
 				if (scheduler->m_shutdown)
@@ -3889,7 +3115,7 @@ private:
 				}
 				if (!task)
 					break;
-				task->func(task->userData);
+				task->func(group->userData, task->userData);
 				group->ref--;
 			}
 		}
@@ -3898,44 +3124,39 @@ private:
 
 thread_local uint32_t TaskScheduler::m_threadIndex;
 #else
-class TaskScheduler
-{
+class TaskScheduler {
 public:
-	~TaskScheduler()
-	{
+	~TaskScheduler() {
 		for (uint32_t i = 0; i < m_groups.size(); i++)
 			destroyGroup({ i });
 	}
 
-	uint32_t threadCount() const
-	{
+	uint32_t threadCount() const {
 		return 1;
 	}
 
-	TaskGroupHandle createTaskGroup(uint32_t reserveSize = 0)
-	{
+	TaskGroupHandle createTaskGroup(void *userData = nullptr, uint32_t reserveSize = 0) {
 		TaskGroup *group = XA_NEW(MemTag::Default, TaskGroup);
 		group->queue.reserve(reserveSize);
+		group->userData = userData;
 		m_groups.push_back(group);
 		TaskGroupHandle handle;
 		handle.value = m_groups.size() - 1;
 		return handle;
 	}
 
-	void run(TaskGroupHandle handle, Task task)
-	{
+	void run(TaskGroupHandle handle, Task task) {
 		m_groups[handle.value]->queue.push_back(task);
 	}
 
-	void wait(TaskGroupHandle *handle)
-	{
+	void wait(TaskGroupHandle *handle) {
 		if (handle->value == UINT32_MAX) {
 			XA_DEBUG_ASSERT(false);
 			return;
 		}
 		TaskGroup *group = m_groups[handle->value];
 		for (uint32_t i = 0; i < group->queue.size(); i++)
-			group->queue[i].func(group->queue[i].userData);
+			group->queue[i].func(group->userData, group->queue[i].userData);
 		group->queue.clear();
 		destroyGroup(*handle);
 		handle->value = UINT32_MAX;
@@ -3944,8 +3165,7 @@ public:
 	static uint32_t currentThreadIndex() { return 0; }
 
 private:
-	void destroyGroup(TaskGroupHandle handle)
-	{
+	void destroyGroup(TaskGroupHandle handle) {
 		TaskGroup *group = m_groups[handle.value];
 		if (group) {
 			group->~TaskGroup();
@@ -3954,9 +3174,9 @@ private:
 		}
 	}
 
-	struct TaskGroup
-	{
+	struct TaskGroup {
 		Array<Task> queue;
+		void *userData;
 	};
 
 	Array<TaskGroup *> m_groups;
@@ -3968,8 +3188,7 @@ const uint8_t TGA_TYPE_RGB = 2;
 const uint8_t TGA_ORIGIN_UPPER = 0x20;
 
 #pragma pack(push, 1)
-struct TgaHeader
-{
+struct TgaHeader {
 	uint8_t id_length;
 	uint8_t colormap_type;
 	uint8_t image_type;
@@ -3986,8 +3205,7 @@ struct TgaHeader
 };
 #pragma pack(pop)
 
-static void WriteTga(const char *filename, const uint8_t *data, uint32_t width, uint32_t height)
-{
+static void WriteTga(const char *filename, const uint8_t *data, uint32_t width, uint32_t height) {
 	XA_DEBUG_ASSERT(sizeof(TgaHeader) == TgaHeader::Size);
 	FILE *f;
 	XA_FOPEN(f, filename, "wb");
@@ -4012,12 +3230,10 @@ static void WriteTga(const char *filename, const uint8_t *data, uint32_t width,
 }
 #endif
 
-template<typename T>
-class ThreadLocal
-{
+template <typename T>
+class ThreadLocal {
 public:
-	ThreadLocal()
-	{
+	ThreadLocal() {
 #if XA_MULTITHREADED
 		const uint32_t n = std::thread::hardware_concurrency();
 #else
@@ -4028,8 +3244,7 @@ public:
 			new (&m_array[i]) T;
 	}
 
-	~ThreadLocal()
-	{
+	~ThreadLocal() {
 #if XA_MULTITHREADED
 		const uint32_t n = std::thread::hardware_concurrency();
 #else
@@ -4040,8 +3255,7 @@ public:
 		XA_FREE(m_array);
 	}
 
-	T &get() const
-	{
+	T &get() const {
 		return m_array[TaskScheduler::currentThreadIndex()];
 	}
 
@@ -4049,11 +3263,104 @@ private:
 	T *m_array;
 };
 
-class UniformGrid2
-{
+// Implemented as a struct so the temporary arrays can be reused.
+struct Triangulator {
+	// This is doing a simple ear-clipping algorithm that skips invalid triangles. Ideally, we should
+	// also sort the ears by angle, start with the ones that have the smallest angle and proceed in order.
+	void triangulatePolygon(ConstArrayView<Vector3> vertices, ConstArrayView<uint32_t> inputIndices, Array<uint32_t> &outputIndices) {
+		m_polygonVertices.clear();
+		m_polygonVertices.reserve(inputIndices.length);
+		outputIndices.clear();
+		if (inputIndices.length == 3) {
+			// Simple case for triangles.
+			outputIndices.push_back(inputIndices[0]);
+			outputIndices.push_back(inputIndices[1]);
+			outputIndices.push_back(inputIndices[2]);
+		} else {
+			// Build 2D polygon projecting vertices onto normal plane.
+			// Faces are not necessarily planar, this is for example the case, when the face comes from filling a hole. In such cases
+			// it's much better to use the best fit plane.
+			Basis basis;
+			basis.normal = normalize(cross(vertices[inputIndices[1]] - vertices[inputIndices[0]], vertices[inputIndices[2]] - vertices[inputIndices[1]]));
+			basis.tangent = basis.computeTangent(basis.normal);
+			basis.bitangent = basis.computeBitangent(basis.normal, basis.tangent);
+			const uint32_t edgeCount = inputIndices.length;
+			m_polygonPoints.clear();
+			m_polygonPoints.reserve(edgeCount);
+			m_polygonAngles.clear();
+			m_polygonAngles.reserve(edgeCount);
+			for (uint32_t i = 0; i < inputIndices.length; i++) {
+				m_polygonVertices.push_back(inputIndices[i]);
+				const Vector3 &pos = vertices[inputIndices[i]];
+				m_polygonPoints.push_back(Vector2(dot(basis.tangent, pos), dot(basis.bitangent, pos)));
+			}
+			m_polygonAngles.resize(edgeCount);
+			while (m_polygonVertices.size() > 2) {
+				const uint32_t size = m_polygonVertices.size();
+				// Update polygon angles. @@ Update only those that have changed.
+				float minAngle = kPi2;
+				uint32_t bestEar = 0; // Use first one if none of them is valid.
+				bool bestIsValid = false;
+				for (uint32_t i = 0; i < size; i++) {
+					uint32_t i0 = i;
+					uint32_t i1 = (i + 1) % size; // Use Sean's polygon iteration trick.
+					uint32_t i2 = (i + 2) % size;
+					Vector2 p0 = m_polygonPoints[i0];
+					Vector2 p1 = m_polygonPoints[i1];
+					Vector2 p2 = m_polygonPoints[i2];
+					float d = clamp(dot(p0 - p1, p2 - p1) / (length(p0 - p1) * length(p2 - p1)), -1.0f, 1.0f);
+					float angle = acosf(d);
+					float area = triangleArea(p0, p1, p2);
+					if (area < 0.0f)
+						angle = kPi2 - angle;
+					m_polygonAngles[i1] = angle;
+					if (angle < minAngle || !bestIsValid) {
+						// Make sure this is a valid ear, if not, skip this point.
+						bool valid = true;
+						for (uint32_t j = 0; j < size; j++) {
+							if (j == i0 || j == i1 || j == i2)
+								continue;
+							Vector2 p = m_polygonPoints[j];
+							if (pointInTriangle(p, p0, p1, p2)) {
+								valid = false;
+								break;
+							}
+						}
+						if (valid || !bestIsValid) {
+							minAngle = angle;
+							bestEar = i1;
+							bestIsValid = valid;
+						}
+					}
+				}
+				// Clip best ear:
+				const uint32_t i0 = (bestEar + size - 1) % size;
+				const uint32_t i1 = (bestEar + 0) % size;
+				const uint32_t i2 = (bestEar + 1) % size;
+				outputIndices.push_back(m_polygonVertices[i0]);
+				outputIndices.push_back(m_polygonVertices[i1]);
+				outputIndices.push_back(m_polygonVertices[i2]);
+				m_polygonVertices.removeAt(i1);
+				m_polygonPoints.removeAt(i1);
+				m_polygonAngles.removeAt(i1);
+			}
+		}
+	}
+
+private:
+	static bool pointInTriangle(const Vector2 &p, const Vector2 &a, const Vector2 &b, const Vector2 &c) {
+		return triangleArea(a, b, p) >= kAreaEpsilon && triangleArea(b, c, p) >= kAreaEpsilon && triangleArea(c, a, p) >= kAreaEpsilon;
+	}
+
+	Array<int> m_polygonVertices;
+	Array<float> m_polygonAngles;
+	Array<Vector2> m_polygonPoints;
+};
+
+class UniformGrid2 {
 public:
-	void reset(const Vector2 *positions, const uint32_t *indices = nullptr, uint32_t reserveEdgeCount = 0)
-	{
+	// indices are optional.
+	void reset(ConstArrayView<Vector2> positions, ConstArrayView<uint32_t> indices = ConstArrayView<uint32_t>(), uint32_t reserveEdgeCount = 0) {
 		m_edges.clear();
 		if (reserveEdgeCount > 0)
 			m_edges.reserve(reserveEdgeCount);
@@ -4062,14 +3369,12 @@ public:
 		m_cellDataOffsets.clear();
 	}
 
-	void append(uint32_t edge)
-	{
+	void append(uint32_t edge) {
 		XA_DEBUG_ASSERT(m_cellDataOffsets.isEmpty());
 		m_edges.push_back(edge);
 	}
 
-	bool intersect(Vector2 v1, Vector2 v2, float epsilon)
-	{
+	bool intersect(Vector2 v1, Vector2 v2, float epsilon) {
 		const uint32_t edgeCount = m_edges.size();
 		bool bruteForce = edgeCount <= 20;
 		if (!bruteForce && m_cellDataOffsets.isEmpty())
@@ -4096,8 +3401,7 @@ public:
 	}
 
 	// If edges is empty, checks for intersection with all edges in the grid.
-	bool intersect(float epsilon, ConstArrayView<uint32_t> edges = ConstArrayView<uint32_t>(), ConstArrayView<uint32_t> ignoreEdges = ConstArrayView<uint32_t>())
-	{
+	bool intersect(float epsilon, ConstArrayView<uint32_t> edges = ConstArrayView<uint32_t>(), ConstArrayView<uint32_t> ignoreEdges = ConstArrayView<uint32_t>()) {
 		bool bruteForce = m_edges.size() <= 20;
 		if (!bruteForce && m_cellDataOffsets.isEmpty())
 			bruteForce = !createGrid();
@@ -4167,8 +3471,7 @@ public:
 	}
 
 #if XA_DEBUG_EXPORT_BOUNDARY_GRID
-	void debugExport(const char *filename)
-	{
+	void debugExport(const char *filename) {
 		Array<uint8_t> image;
 		image.resize(m_gridWidth * m_gridHeight * 3);
 		for (uint32_t y = 0; y < m_gridHeight; y++) {
@@ -4190,8 +3493,7 @@ public:
 #endif
 
 private:
-	bool createGrid()
-	{
+	bool createGrid() {
 		// Compute edge extents. Min will be the grid origin.
 		const uint32_t edgeCount = m_edges.size();
 		Extents2 edgeExtents;
@@ -4202,14 +3504,14 @@ private:
 			edgeExtents.add(edgePosition1(edge));
 		}
 		m_gridOrigin = edgeExtents.min;
-		// Size grid to approximately one edge per cell.
+		// Size grid to approximately one edge per cell in the largest dimension.
 		const Vector2 extentsSize(edgeExtents.max - edgeExtents.min);
-		m_cellSize = min(extentsSize.x, extentsSize.y) / sqrtf((float)edgeCount);
+		m_cellSize = max(extentsSize.x, extentsSize.y) / (float)clamp(edgeCount, 32u, 512u);
 		if (m_cellSize <= 0.0f)
 			return false;
 		m_gridWidth = uint32_t(ceilf(extentsSize.x / m_cellSize));
 		m_gridHeight = uint32_t(ceilf(extentsSize.y / m_cellSize));
-		if (m_gridWidth == 0 || m_gridHeight == 0)
+		if (m_gridWidth <= 1 || m_gridHeight <= 1)
 			return false;
 		// Insert edges into cells.
 		m_cellDataOffsets.resize(m_gridWidth * m_gridHeight);
@@ -4243,8 +3545,7 @@ private:
 		return true;
 	}
 
-	void computePotentialEdges(Vector2 p1, Vector2 p2)
-	{
+	void computePotentialEdges(Vector2 p1, Vector2 p2) {
 		m_potentialEdges.clear();
 		traverse(p1, p2);
 		for (uint32_t j = 0; j < m_traversedCellOffsets.size(); j++) {
@@ -4262,10 +3563,9 @@ private:
 	}
 
 	// "A Fast Voxel Traversal Algorithm for Ray Tracing"
-	void traverse(Vector2 p1, Vector2 p2)
-	{
+	void traverse(Vector2 p1, Vector2 p2) {
 		const Vector2 dir = p2 - p1;
-		const Vector2 normal = normalizeSafe(dir, Vector2(0.0f), kEpsilon);
+		const Vector2 normal = normalizeSafe(dir, Vector2(0.0f));
 		const int stepX = dir.x >= 0 ? 1 : -1;
 		const int stepY = dir.y >= 0 ? 1 : -1;
 		const uint32_t firstCell[2] = { cellX(p1.x), cellY(p1.y) };
@@ -4284,14 +3584,12 @@ private:
 		if (normal.x > kEpsilon || normal.x < -kEpsilon) {
 			tMaxX = (distToNextCellX * stepX) / normal.x;
 			tDeltaX = (m_cellSize * stepX) / normal.x;
-		}
-		else
+		} else
 			tMaxX = tDeltaX = FLT_MAX;
 		if (normal.y > kEpsilon || normal.y < -kEpsilon) {
 			tMaxY = (distToNextCellY * stepY) / normal.y;
 			tDeltaY = (m_cellSize * stepY) / normal.y;
-		}
-		else
+		} else
 			tMaxY = tDeltaY = FLT_MAX;
 		m_traversedCellOffsets.clear();
 		m_traversedCellOffsets.push_back(firstCell[0] + firstCell[1] * m_gridWidth);
@@ -4318,34 +3616,29 @@ private:
 		}
 	}
 
-	uint32_t cellX(float x) const
-	{
+	uint32_t cellX(float x) const {
 		return min((uint32_t)max(0.0f, (x - m_gridOrigin.x) / m_cellSize), m_gridWidth - 1u);
 	}
 
-	uint32_t cellY(float y) const
-	{
+	uint32_t cellY(float y) const {
 		return min((uint32_t)max(0.0f, (y - m_gridOrigin.y) / m_cellSize), m_gridHeight - 1u);
 	}
 
-	Vector2 edgePosition0(uint32_t edge) const
-	{
+	Vector2 edgePosition0(uint32_t edge) const {
 		return m_positions[vertexAt(meshEdgeIndex0(edge))];
 	}
 
-	Vector2 edgePosition1(uint32_t edge) const
-	{
+	Vector2 edgePosition1(uint32_t edge) const {
 		return m_positions[vertexAt(meshEdgeIndex1(edge))];
 	}
 
-	uint32_t vertexAt(uint32_t index) const
-	{
-		return m_indices ? m_indices[index] : index;
+	uint32_t vertexAt(uint32_t index) const {
+		return m_indices.length > 0 ? m_indices[index] : index;
 	}
 
 	Array<uint32_t> m_edges;
-	const Vector2 *m_positions;
-	const uint32_t *m_indices; // Optional
+	ConstArrayView<Vector2> m_positions;
+	ConstArrayView<uint32_t> m_indices; // Optional. Empty if unused.
 	float m_cellSize;
 	Vector2 m_gridOrigin;
 	uint32_t m_gridWidth, m_gridHeight; // in cells
@@ -4355,26 +3648,25 @@ private:
 	Array<uint32_t> m_traversedCellOffsets;
 };
 
-struct UvMeshChart
-{
+struct UvMeshChart {
 	Array<uint32_t> faces;
 	Array<uint32_t> indices;
 	uint32_t material;
 };
 
-struct UvMesh
-{
+struct UvMesh {
 	UvMeshDecl decl;
+	BitArray faceIgnore;
+	Array<uint32_t> faceMaterials;
 	Array<uint32_t> indices;
+	Array<Vector2> texcoords; // Copied from input and never modified, UvMeshInstance::texcoords are. Used to restore UvMeshInstance::texcoords so packing can be run multiple times.
 	Array<UvMeshChart *> charts;
 	Array<uint32_t> vertexToChartMap;
 };
 
-struct UvMeshInstance
-{
+struct UvMeshInstance {
 	UvMesh *mesh;
 	Array<Vector2> texcoords;
-	bool rotateCharts;
 };
 
 /*
@@ -4420,27 +3712,30 @@ struct UvMeshInstance
  *     FRANCE
  */
 namespace opennl {
-#define NL_NEW(T)              XA_ALLOC(MemTag::OpenNL, T)
-#define NL_NEW_ARRAY(T,NB)     XA_ALLOC_ARRAY(MemTag::OpenNL, T, NB)
-#define NL_RENEW_ARRAY(T,x,NB) XA_REALLOC(MemTag::OpenNL, x, T, NB)
-#define NL_DELETE(x)           XA_FREE(x); x = nullptr 
-#define NL_DELETE_ARRAY(x)     XA_FREE(x); x = nullptr
-#define NL_CLEAR(x, T)         memset(x, 0, sizeof(T));
-#define NL_CLEAR_ARRAY(T,x,NB) memset(x, 0, (size_t)(NB)*sizeof(T)) 
-#define NL_NEW_VECTOR(dim)     XA_ALLOC_ARRAY(MemTag::OpenNL, double, dim)
-#define NL_DELETE_VECTOR(ptr)  XA_FREE(ptr)
+#define NL_NEW(T) XA_ALLOC(MemTag::OpenNL, T)
+#define NL_NEW_ARRAY(T, NB) XA_ALLOC_ARRAY(MemTag::OpenNL, T, NB)
+#define NL_RENEW_ARRAY(T, x, NB) XA_REALLOC(MemTag::OpenNL, x, T, NB)
+#define NL_DELETE(x) /* frees x and nulls it; one statement, safe under unbraced if/else */ \
+	do { XA_FREE(x); \
+	(x) = nullptr; } while (0)
+#define NL_DELETE_ARRAY(x) /* frees x and nulls it; one statement, safe under unbraced if/else */ \
+	do { XA_FREE(x); \
+	(x) = nullptr; } while (0)
+#define NL_CLEAR(x, T) memset(x, 0, sizeof(T));
+#define NL_CLEAR_ARRAY(T, x, NB) memset(x, 0, (size_t)(NB) * sizeof(T))
+#define NL_NEW_VECTOR(dim) XA_ALLOC_ARRAY(MemTag::OpenNL, double, dim)
+#define NL_DELETE_VECTOR(ptr) XA_FREE(ptr)
 
 struct NLMatrixStruct;
-typedef NLMatrixStruct * NLMatrix;
+typedef NLMatrixStruct *NLMatrix;
 typedef void (*NLDestroyMatrixFunc)(NLMatrix M);
-typedef void (*NLMultMatrixVectorFunc)(NLMatrix M, const double* x, double* y);
+typedef void (*NLMultMatrixVectorFunc)(NLMatrix M, const double *x, double *y);
 
 #define NL_MATRIX_SPARSE_DYNAMIC 0x1001
-#define NL_MATRIX_CRS            0x1002
-#define NL_MATRIX_OTHER          0x1006
+#define NL_MATRIX_CRS 0x1002
+#define NL_MATRIX_OTHER 0x1006
 
-struct NLMatrixStruct
-{
+struct NLMatrixStruct {
 	uint32_t m;
 	uint32_t n;
 	uint32_t type;
@@ -4450,39 +3745,35 @@ struct NLMatrixStruct
 
 /* Dynamic arrays for sparse row/columns */
 
-struct NLCoeff
-{
+struct NLCoeff {
 	uint32_t index;
 	double value;
 };
 
-struct NLRowColumn
-{
+struct NLRowColumn {
 	uint32_t size;
 	uint32_t capacity;
-	NLCoeff* coeff;
+	NLCoeff *coeff;
 };
 
 /* Compressed Row Storage */
 
-struct NLCRSMatrix
-{
+struct NLCRSMatrix {
 	uint32_t m;
 	uint32_t n;
 	uint32_t type;
 	NLDestroyMatrixFunc destroy_func;
 	NLMultMatrixVectorFunc mult_func;
-	double* val;
-	uint32_t* rowptr;
-	uint32_t* colind;
+	double *val;
+	uint32_t *rowptr;
+	uint32_t *colind;
 	uint32_t nslices;
-	uint32_t* sliceptr;
+	uint32_t *sliceptr;
 };
 
 /* SparseMatrix data structure */
 
-struct NLSparseMatrix
-{
+struct NLSparseMatrix {
 	uint32_t m;
 	uint32_t n;
 	uint32_t type;
@@ -4490,25 +3781,23 @@ struct NLSparseMatrix
 	NLMultMatrixVectorFunc mult_func;
 	uint32_t diag_size;
 	uint32_t diag_capacity;
-	NLRowColumn* row;
-	NLRowColumn* column;
-	double*    diag;
+	NLRowColumn *row;
+	NLRowColumn *column;
+	double *diag;
 	uint32_t row_capacity;
 	uint32_t column_capacity;
 };
 
 /* NLContext data structure */
 
-struct NLBufferBinding
-{
-	void* base_address;
+struct NLBufferBinding {
+	void *base_address;
 	uint32_t stride;
 };
 
-#define NL_BUFFER_ITEM(B,i) *(double*)((void*)((char*)((B).base_address)+((i)*(B).stride)))
+#define NL_BUFFER_ITEM(B, i) *(double *)((void *)((char *)((B).base_address) + ((i) * (B).stride)))
 
-struct NLContext
-{
+struct NLContext {
 	NLBufferBinding *variable_buffer;
 	double *variable_value;
 	bool *variable_is_locked;
@@ -4532,35 +3821,30 @@ struct NLContext
 	double error;
 };
 
-static void nlDeleteMatrix(NLMatrix M)
-{
+static void nlDeleteMatrix(NLMatrix M) {
 	if (!M)
 		return;
 	M->destroy_func(M);
 	NL_DELETE(M);
 }
 
-static void nlMultMatrixVector(NLMatrix M, const double* x, double* y)
-{
+static void nlMultMatrixVector(NLMatrix M, const double *x, double *y) {
 	M->mult_func(M, x, y);
 }
 
-static void nlRowColumnConstruct(NLRowColumn* c)
-{
+static void nlRowColumnConstruct(NLRowColumn *c) {
 	c->size = 0;
 	c->capacity = 0;
 	c->coeff = nullptr;
 }
 
-static void nlRowColumnDestroy(NLRowColumn* c)
-{
+static void nlRowColumnDestroy(NLRowColumn *c) {
 	NL_DELETE_ARRAY(c->coeff);
 	c->size = 0;
 	c->capacity = 0;
 }
 
-static void nlRowColumnGrow(NLRowColumn* c)
-{
+static void nlRowColumnGrow(NLRowColumn *c) {
 	if (c->capacity != 0) {
 		c->capacity = 2 * c->capacity;
 		c->coeff = NL_RENEW_ARRAY(NLCoeff, c->coeff, c->capacity);
@@ -4571,8 +3855,7 @@ static void nlRowColumnGrow(NLRowColumn* c)
 	}
 }
 
-static void nlRowColumnAdd(NLRowColumn* c, uint32_t index, double value)
-{
+static void nlRowColumnAdd(NLRowColumn *c, uint32_t index, double value) {
 	for (uint32_t i = 0; i < c->size; i++) {
 		if (c->coeff[i].index == index) {
 			c->coeff[i].value += value;
@@ -4587,8 +3870,7 @@ static void nlRowColumnAdd(NLRowColumn* c, uint32_t index, double value)
 }
 
 /* Does not check whether the index already exists */
-static void nlRowColumnAppend(NLRowColumn* c, uint32_t index, double value)
-{
+static void nlRowColumnAppend(NLRowColumn *c, uint32_t index, double value) {
 	if (c->size == c->capacity)
 		nlRowColumnGrow(c);
 	c->coeff[c->size].index = index;
@@ -4596,32 +3878,27 @@ static void nlRowColumnAppend(NLRowColumn* c, uint32_t index, double value)
 	c->size++;
 }
 
-static void nlRowColumnZero(NLRowColumn* c)
-{
+static void nlRowColumnZero(NLRowColumn *c) {
 	c->size = 0;
 }
 
-static void nlRowColumnClear(NLRowColumn* c)
-{
+static void nlRowColumnClear(NLRowColumn *c) {
 	c->size = 0;
 	c->capacity = 0;
 	NL_DELETE_ARRAY(c->coeff);
 }
 
-static int nlCoeffCompare(const void* p1, const void* p2)
-{
-	return (((NLCoeff*)(p2))->index < ((NLCoeff*)(p1))->index);
+static int nlCoeffCompare(const void *p1, const void *p2) { /* qsort comparator: orders NLCoeff entries by ascending index */
+	return (((NLCoeff *)(p1))->index > ((NLCoeff *)(p2))->index) - (((NLCoeff *)(p1))->index < ((NLCoeff *)(p2))->index); /* three-way result (<0, 0, >0) as qsort requires; a 0/1 result reports "equal" for p1 < p2 */
 }
 
-static void nlRowColumnSort(NLRowColumn* c)
-{
+static void nlRowColumnSort(NLRowColumn *c) {
 	qsort(c->coeff, c->size, sizeof(NLCoeff), nlCoeffCompare);
 }
 
 /* CRSMatrix data structure */
 
-static void nlCRSMatrixDestroy(NLCRSMatrix* M)
-{
+static void nlCRSMatrixDestroy(NLCRSMatrix *M) {
 	NL_DELETE_ARRAY(M->val);
 	NL_DELETE_ARRAY(M->rowptr);
 	NL_DELETE_ARRAY(M->colind);
@@ -4631,8 +3908,7 @@ static void nlCRSMatrixDestroy(NLCRSMatrix* M)
 	M->nslices = 0;
 }
 
-static void nlCRSMatrixMultSlice(NLCRSMatrix* M, const double* x, double* y, uint32_t Ibegin, uint32_t Iend)
-{
+static void nlCRSMatrixMultSlice(NLCRSMatrix *M, const double *x, double *y, uint32_t Ibegin, uint32_t Iend) {
 	for (uint32_t i = Ibegin; i < Iend; ++i) {
 		double sum = 0.0;
 		for (uint32_t j = M->rowptr[i]; j < M->rowptr[i + 1]; ++j)
@@ -4641,15 +3917,13 @@ static void nlCRSMatrixMultSlice(NLCRSMatrix* M, const double* x, double* y, uin
 	}
 }
 
-static void nlCRSMatrixMult(NLCRSMatrix* M, const double* x, double* y)
-{
+static void nlCRSMatrixMult(NLCRSMatrix *M, const double *x, double *y) {
 	int nslices = (int)(M->nslices);
 	for (int slice = 0; slice < nslices; ++slice)
 		nlCRSMatrixMultSlice(M, x, y, M->sliceptr[slice], M->sliceptr[slice + 1]);
 }
 
-static void nlCRSMatrixConstruct(NLCRSMatrix* M, uint32_t m, uint32_t n, uint32_t nnz, uint32_t nslices)
-{
+static void nlCRSMatrixConstruct(NLCRSMatrix *M, uint32_t m, uint32_t n, uint32_t nnz, uint32_t nslices) {
 	M->m = m;
 	M->n = n;
 	M->type = NL_MATRIX_CRS;
@@ -4668,22 +3942,19 @@ static void nlCRSMatrixConstruct(NLCRSMatrix* M, uint32_t m, uint32_t n, uint32_
 
 /* SparseMatrix data structure */
 
-static void nlSparseMatrixDestroyRowColumns(NLSparseMatrix* M)
-{
+static void nlSparseMatrixDestroyRowColumns(NLSparseMatrix *M) {
 	for (uint32_t i = 0; i < M->m; i++)
 		nlRowColumnDestroy(&(M->row[i]));
 	NL_DELETE_ARRAY(M->row);
 }
 
-static void nlSparseMatrixDestroy(NLSparseMatrix* M)
-{
+static void nlSparseMatrixDestroy(NLSparseMatrix *M) {
 	XA_DEBUG_ASSERT(M->type == NL_MATRIX_SPARSE_DYNAMIC);
 	nlSparseMatrixDestroyRowColumns(M);
 	NL_DELETE_ARRAY(M->diag);
 }
 
-static void nlSparseMatrixAdd(NLSparseMatrix* M, uint32_t i, uint32_t j, double value)
-{
+static void nlSparseMatrixAdd(NLSparseMatrix *M, uint32_t i, uint32_t j, double value) {
 	XA_DEBUG_ASSERT(i >= 0 && i <= M->m - 1);
 	XA_DEBUG_ASSERT(j >= 0 && j <= M->n - 1);
 	if (i == j)
@@ -4692,24 +3963,21 @@ static void nlSparseMatrixAdd(NLSparseMatrix* M, uint32_t i, uint32_t j, double
 }
 
 /* Returns the number of non-zero coefficients */
-static uint32_t nlSparseMatrixNNZ(NLSparseMatrix* M)
-{
+static uint32_t nlSparseMatrixNNZ(NLSparseMatrix *M) {
 	uint32_t nnz = 0;
 	for (uint32_t i = 0; i < M->m; i++)
 		nnz += M->row[i].size;
 	return nnz;
 }
 
-static void nlSparseMatrixSort(NLSparseMatrix* M)
-{
+static void nlSparseMatrixSort(NLSparseMatrix *M) {
 	for (uint32_t i = 0; i < M->m; i++)
 		nlRowColumnSort(&(M->row[i]));
 }
 
 /* SparseMatrix x Vector routines, internal helper routines */
 
-static void nlSparseMatrix_mult_rows(NLSparseMatrix* A,	const double* x, double* y)
-{
+static void nlSparseMatrix_mult_rows(NLSparseMatrix *A, const double *x, double *y) {
 	/*
 	 * Note: OpenMP does not like unsigned ints
 	 * (causes some floating point exceptions),
@@ -4717,8 +3985,8 @@ static void nlSparseMatrix_mult_rows(NLSparseMatrix* A,	const double* x, double*
 	 * indices.
 	 */
 	int m = (int)(A->m);
-	NLCoeff* c = nullptr;
-	NLRowColumn* Ri = nullptr;
+	NLCoeff *c = nullptr;
+	NLRowColumn *Ri = nullptr;
 	for (int i = 0; i < m; i++) {
 		Ri = &(A->row[i]);
 		y[i] = 0;
@@ -4729,14 +3997,12 @@ static void nlSparseMatrix_mult_rows(NLSparseMatrix* A,	const double* x, double*
 	}
 }
 
-static void nlSparseMatrixMult(NLSparseMatrix* A, const double* x, double* y)
-{
+static void nlSparseMatrixMult(NLSparseMatrix *A, const double *x, double *y) {
 	XA_DEBUG_ASSERT(A->type == NL_MATRIX_SPARSE_DYNAMIC);
 	nlSparseMatrix_mult_rows(A, x, y);
 }
 
-static void nlSparseMatrixConstruct(NLSparseMatrix* M, uint32_t m, uint32_t n)
-{
+static void nlSparseMatrixConstruct(NLSparseMatrix *M, uint32_t m, uint32_t n) {
 	M->m = m;
 	M->n = n;
 	M->type = NL_MATRIX_SPARSE_DYNAMIC;
@@ -4756,24 +4022,23 @@ static void nlSparseMatrixConstruct(NLSparseMatrix* M, uint32_t m, uint32_t n)
 	NL_CLEAR_ARRAY(double, M->diag, M->diag_size);
 }
 
-static NLMatrix nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M)
-{
+static NLMatrix nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix *M) {
 	uint32_t nnz = nlSparseMatrixNNZ(M);
 	uint32_t nslices = 8; /* TODO: get number of cores */
 	uint32_t slice, cur_bound, cur_NNZ, cur_row;
 	uint32_t k;
 	uint32_t slice_size = nnz / nslices;
-	NLCRSMatrix* CRS = NL_NEW(NLCRSMatrix);
+	NLCRSMatrix *CRS = NL_NEW(NLCRSMatrix);
 	NL_CLEAR(CRS, NLCRSMatrix);
 	nlCRSMatrixConstruct(CRS, M->m, M->n, nnz, nslices);
 	nlSparseMatrixSort(M);
 	/* Convert matrix to CRS format */
 	k = 0;
 	for (uint32_t i = 0; i < M->m; ++i) {
-		NLRowColumn* Ri = &(M->row[i]);
+		NLRowColumn *Ri = &(M->row[i]);
 		CRS->rowptr[i] = k;
 		for (uint32_t ij = 0; ij < Ri->size; ij++) {
-			NLCoeff* c = &(Ri->coeff[ij]);
+			NLCoeff *c = &(Ri->coeff[ij]);
 			CRS->val[k] = c->value;
 			CRS->colind[k] = c->index;
 			++k;
@@ -4799,19 +4064,17 @@ static NLMatrix nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M)
 	return (NLMatrix)CRS;
 }
 
-static void nlMatrixCompress(NLMatrix* M)
-{
+static void nlMatrixCompress(NLMatrix *M) {
 	NLMatrix CRS = nullptr;
 	if ((*M)->type != NL_MATRIX_SPARSE_DYNAMIC)
 		return;
-	CRS = nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)*M);
+	CRS = nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix *)*M);
 	nlDeleteMatrix(*M);
 	*M = CRS;
 }
 
-static NLContext *nlNewContext()
-{
-	NLContext* result = NL_NEW(NLContext);
+static NLContext *nlNewContext() {
+	NLContext *result = NL_NEW(NLContext);
 	NL_CLEAR(result, NLContext);
 	result->max_iterations = 100;
 	result->threshold = 1e-6;
@@ -4820,8 +4083,7 @@ static NLContext *nlNewContext()
 	return result;
 }
 
-static void nlDeleteContext(NLContext *context)
-{
+static void nlDeleteContext(NLContext *context) {
 	nlDeleteMatrix(context->M);
 	context->M = nullptr;
 	nlDeleteMatrix(context->P);
@@ -4839,22 +4101,19 @@ static void nlDeleteContext(NLContext *context)
 	NL_DELETE(context);
 }
 
-static double ddot(int n, const double *x, const double *y)
-{
+static double ddot(int n, const double *x, const double *y) {
 	double sum = 0.0;
 	for (int i = 0; i < n; i++)
 		sum += x[i] * y[i];
 	return sum;
 }
 
-static void daxpy(int n, double a, const double *x, double *y)
-{
+static void daxpy(int n, double a, const double *x, double *y) {
 	for (int i = 0; i < n; i++)
 		y[i] = a * x[i] + y[i];
 }
 
-static void dscal(int n, double a, double *x)
-{
+static void dscal(int n, double a, double *x) {
 	for (int i = 0; i < n; i++)
 		x[i] *= a;
 }
@@ -4877,17 +4136,16 @@ static void dscal(int n, double a, double *x)
  *     versions of matrix x vector product (CPU/GPU, sparse/dense ...)
  */
 
-static uint32_t nlSolveSystem_PRE_CG(NLMatrix M, NLMatrix P, double* b, double* x, double eps, uint32_t max_iter, double *sq_bnorm, double *sq_rnorm)
-{
-	int     N = (int)M->n;
-	double* r = NL_NEW_VECTOR(N);
-	double* d = NL_NEW_VECTOR(N);
-	double* h = NL_NEW_VECTOR(N);
+static uint32_t nlSolveSystem_PRE_CG(NLMatrix M, NLMatrix P, double *b, double *x, double eps, uint32_t max_iter, double *sq_bnorm, double *sq_rnorm) {
+	int N = (int)M->n;
+	double *r = NL_NEW_VECTOR(N);
+	double *d = NL_NEW_VECTOR(N);
+	double *h = NL_NEW_VECTOR(N);
 	double *Ad = h;
 	uint32_t its = 0;
 	double rh, alpha, beta;
 	double b_square = ddot(N, b, b);
-	double err = eps * eps*b_square;
+	double err = eps * eps * b_square;
 	double curr_err;
 	nlMultMatrixVector(M, x, r);
 	daxpy(N, -1., b, r);
@@ -4917,13 +4175,12 @@ static uint32_t nlSolveSystem_PRE_CG(NLMatrix M, NLMatrix P, double* b, double*
 	return its;
 }
 
-static uint32_t nlSolveSystemIterative(NLContext *context, NLMatrix M, NLMatrix P, double* b_in, double* x_in, double eps, uint32_t max_iter)
-{
+static uint32_t nlSolveSystemIterative(NLContext *context, NLMatrix M, NLMatrix P, double *b_in, double *x_in, double eps, uint32_t max_iter) {
 	uint32_t result = 0;
 	double rnorm = 0.0;
 	double bnorm = 0.0;
-	double* b = b_in;
-	double* x = x_in;
+	double *b = b_in;
+	double *x = x_in;
 	XA_DEBUG_ASSERT(M->m == M->n);
 	double sq_bnorm, sq_rnorm;
 	result = nlSolveSystem_PRE_CG(M, P, b, x, eps, max_iter, &sq_bnorm, &sq_rnorm);
@@ -4938,10 +4195,9 @@ static uint32_t nlSolveSystemIterative(NLContext *context, NLMatrix M, NLMatrix
 	return result;
 }
 
-static bool nlSolveIterative(NLContext *context)
-{
-	double* b = context->b;
-	double* x = context->x;
+static bool nlSolveIterative(NLContext *context) {
+	double *b = context->b;
+	double *x = context->x;
 	uint32_t n = context->n;
 	NLMatrix M = context->M;
 	NLMatrix P = context->P;
@@ -4953,34 +4209,30 @@ static bool nlSolveIterative(NLContext *context)
 	return true;
 }
 
-struct NLJacobiPreconditioner
-{
+struct NLJacobiPreconditioner {
 	uint32_t m;
 	uint32_t n;
 	uint32_t type;
 	NLDestroyMatrixFunc destroy_func;
 	NLMultMatrixVectorFunc mult_func;
-	double* diag_inv;
+	double *diag_inv;
 };
 
-static void nlJacobiPreconditionerDestroy(NLJacobiPreconditioner* M)
-{
+static void nlJacobiPreconditionerDestroy(NLJacobiPreconditioner *M) {
 	NL_DELETE_ARRAY(M->diag_inv);
 }
 
-static void nlJacobiPreconditionerMult(NLJacobiPreconditioner* M, const double* x, double* y)
-{
+static void nlJacobiPreconditionerMult(NLJacobiPreconditioner *M, const double *x, double *y) {
 	for (uint32_t i = 0; i < M->n; ++i)
 		y[i] = x[i] * M->diag_inv[i];
 }
 
-static NLMatrix nlNewJacobiPreconditioner(NLMatrix M_in)
-{
-	NLSparseMatrix* M = nullptr;
-	NLJacobiPreconditioner* result = nullptr;
+static NLMatrix nlNewJacobiPreconditioner(NLMatrix M_in) {
+	NLSparseMatrix *M = nullptr;
+	NLJacobiPreconditioner *result = nullptr;
 	XA_DEBUG_ASSERT(M_in->type == NL_MATRIX_SPARSE_DYNAMIC);
 	XA_DEBUG_ASSERT(M_in->m == M_in->n);
-	M = (NLSparseMatrix*)M_in;
+	M = (NLSparseMatrix *)M_in;
 	result = NL_NEW(NLJacobiPreconditioner);
 	NL_CLEAR(result, NLJacobiPreconditioner);
 	result->m = M->m;
@@ -4998,8 +4250,7 @@ static NLMatrix nlNewJacobiPreconditioner(NLMatrix M_in)
 #define NL_NB_VARIABLES 0x101
 #define NL_MAX_ITERATIONS 0x103
 
-static void nlSolverParameteri(NLContext *context, uint32_t pname, int param)
-{
+static void nlSolverParameteri(NLContext *context, uint32_t pname, int param) {
 	if (pname == NL_NB_VARIABLES) {
 		XA_DEBUG_ASSERT(param > 0);
 		context->nb_variables = (uint32_t)param;
@@ -5010,26 +4261,22 @@ static void nlSolverParameteri(NLContext *context, uint32_t pname, int param)
 	}
 }
 
-static void nlSetVariable(NLContext *context, uint32_t index, double value)
-{
+static void nlSetVariable(NLContext *context, uint32_t index, double value) {
 	XA_DEBUG_ASSERT(index >= 0 && index <= context->nb_variables - 1);
 	NL_BUFFER_ITEM(context->variable_buffer[0], index) = value;
 }
 
-static double nlGetVariable(NLContext *context, uint32_t index)
-{
+static double nlGetVariable(NLContext *context, uint32_t index) {
 	XA_DEBUG_ASSERT(index >= 0 && index <= context->nb_variables - 1);
 	return NL_BUFFER_ITEM(context->variable_buffer[0], index);
 }
 
-static void nlLockVariable(NLContext *context, uint32_t index)
-{
+static void nlLockVariable(NLContext *context, uint32_t index) {
 	XA_DEBUG_ASSERT(index >= 0 && index <= context->nb_variables - 1);
 	context->variable_is_locked[index] = true;
 }
 
-static void nlVariablesToVector(NLContext *context)
-{
+static void nlVariablesToVector(NLContext *context) {
 	uint32_t n = context->n;
 	XA_DEBUG_ASSERT(context->x);
 	for (uint32_t k = 0; k < context->nb_systems; ++k) {
@@ -5044,8 +4291,7 @@ static void nlVariablesToVector(NLContext *context)
 	}
 }
 
-static void nlVectorToVariables(NLContext *context)
-{
+static void nlVectorToVariables(NLContext *context) {
 	uint32_t n = context->n;
 	XA_DEBUG_ASSERT(context->x);
 	for (uint32_t k = 0; k < context->nb_systems; ++k) {
@@ -5060,8 +4306,7 @@ static void nlVectorToVariables(NLContext *context)
 	}
 }
 
-static void nlCoefficient(NLContext *context, uint32_t index, double value)
-{
+static void nlCoefficient(NLContext *context, uint32_t index, double value) {
 	XA_DEBUG_ASSERT(index >= 0 && index <= context->nb_variables - 1);
 	if (context->variable_is_locked[index]) {
 		/*
@@ -5078,12 +4323,11 @@ static void nlCoefficient(NLContext *context, uint32_t index, double value)
 	}
 }
 
-#define NL_SYSTEM  0x0
-#define NL_MATRIX  0x1
-#define NL_ROW     0x2
+#define NL_SYSTEM 0x0
+#define NL_MATRIX 0x1
+#define NL_ROW 0x2
 
-static void nlBegin(NLContext *context, uint32_t prim)
-{
+static void nlBegin(NLContext *context, uint32_t prim) {
 	if (prim == NL_SYSTEM) {
 		XA_DEBUG_ASSERT(context->nb_variables > 0);
 		context->variable_buffer = NL_NEW_ARRAY(NLBufferBinding, context->nb_systems);
@@ -5092,8 +4336,8 @@ static void nlBegin(NLContext *context, uint32_t prim)
 		NL_CLEAR_ARRAY(double, context->variable_value, context->nb_variables * context->nb_systems);
 		for (uint32_t k = 0; k < context->nb_systems; ++k) {
 			context->variable_buffer[k].base_address =
-				context->variable_value +
-				k * context->nb_variables;
+					context->variable_value +
+					k * context->nb_variables;
 			context->variable_buffer[k].stride = sizeof(double);
 		}
 		context->variable_is_locked = NL_NEW_ARRAY(bool, context->nb_variables);
@@ -5116,11 +4360,11 @@ static void nlBegin(NLContext *context, uint32_t prim)
 			context->max_iterations = n * 5;
 		context->M = (NLMatrix)(NL_NEW(NLSparseMatrix));
 		NL_CLEAR(context->M, NLSparseMatrix);
-		nlSparseMatrixConstruct((NLSparseMatrix*)(context->M), n, n);
-		context->x = NL_NEW_ARRAY(double, n*context->nb_systems);
-		NL_CLEAR_ARRAY(double, context->x, n*context->nb_systems);
-		context->b = NL_NEW_ARRAY(double, n*context->nb_systems);
-		NL_CLEAR_ARRAY(double, context->b, n*context->nb_systems);
+		nlSparseMatrixConstruct((NLSparseMatrix *)(context->M), n, n);
+		context->x = NL_NEW_ARRAY(double, n * context->nb_systems);
+		NL_CLEAR_ARRAY(double, context->x, n * context->nb_systems);
+		context->b = NL_NEW_ARRAY(double, n * context->nb_systems);
+		NL_CLEAR_ARRAY(double, context->b, n * context->nb_systems);
 		nlVariablesToVector(context);
 		nlRowColumnConstruct(&context->af);
 		nlRowColumnConstruct(&context->al);
@@ -5131,16 +4375,15 @@ static void nlBegin(NLContext *context, uint32_t prim)
 	}
 }
 
-static void nlEnd(NLContext *context, uint32_t prim)
-{
+static void nlEnd(NLContext *context, uint32_t prim) {
 	if (prim == NL_MATRIX) {
 		nlRowColumnClear(&context->af);
 		nlRowColumnClear(&context->al);
 	} else if (prim == NL_ROW) {
-		NLRowColumn*    af = &context->af;
-		NLRowColumn*    al = &context->al;
-		NLSparseMatrix* M = (NLSparseMatrix*)context->M;
-		double* b = context->b;
+		NLRowColumn *af = &context->af;
+		NLRowColumn *al = &context->al;
+		NLSparseMatrix *M = (NLSparseMatrix *)context->M;
+		double *b = context->b;
 		uint32_t nf = af->size;
 		uint32_t nl = al->size;
 		uint32_t n = context->n;
@@ -5161,14 +4404,13 @@ static void nlEnd(NLContext *context, uint32_t prim)
 				S += al->coeff[jj].value * NL_BUFFER_ITEM(context->variable_buffer[k], j);
 			}
 			for (uint32_t jj = 0; jj < nf; jj++)
-				b[k*n + af->coeff[jj].index] -= af->coeff[jj].value * S;
+				b[k * n + af->coeff[jj].index] -= af->coeff[jj].value * S;
 		}
 		context->current_row++;
 	}
 }
 
-static bool nlSolve(NLContext *context)
-{
+static bool nlSolve(NLContext *context) {
 	nlDeleteMatrix(context->P);
 	context->P = nlNewJacobiPreconditioner(context->M);
 	nlMatrixCompress(&context->M);
@@ -5179,11 +4421,9 @@ static bool nlSolve(NLContext *context)
 } // namespace opennl
 
 namespace raster {
-class ClippedTriangle
-{
+class ClippedTriangle {
 public:
-	ClippedTriangle(const Vector2 &a, const Vector2 &b, const Vector2 &c)
-	{
+	ClippedTriangle(const Vector2 &a, const Vector2 &b, const Vector2 &c) {
 		m_numVertices = 3;
 		m_activeVertexBuffer = 0;
 		m_verticesA[0] = a;
@@ -5194,20 +4434,20 @@ public:
 		m_area = 0;
 	}
 
-	void clipHorizontalPlane(float offset, float clipdirection)
-	{
-		Vector2 *v  = m_vertexBuffers[m_activeVertexBuffer];
+	void clipHorizontalPlane(float offset, float clipdirection) {
+		Vector2 *v = m_vertexBuffers[m_activeVertexBuffer];
 		m_activeVertexBuffer ^= 1;
 		Vector2 *v2 = m_vertexBuffers[m_activeVertexBuffer];
 		v[m_numVertices] = v[0];
-		float dy2,   dy1 = offset - v[0].y;
-		int   dy2in, dy1in = clipdirection * dy1 >= 0;
-		uint32_t  p = 0;
+		float dy2, dy1 = offset - v[0].y;
+		int dy2in, dy1in = clipdirection * dy1 >= 0;
+		uint32_t p = 0;
 		for (uint32_t k = 0; k < m_numVertices; k++) {
-			dy2   = offset - v[k + 1].y;
+			dy2 = offset - v[k + 1].y;
 			dy2in = clipdirection * dy2 >= 0;
-			if (dy1in) v2[p++] = v[k];
-			if ( dy1in + dy2in == 1 ) { // not both in/out
+			if (dy1in)
+				v2[p++] = v[k];
+			if (dy1in + dy2in == 1) { // not both in/out
 				float dx = v[k + 1].x - v[k].x;
 				float dy = v[k + 1].y - v[k].y;
 				v2[p++] = Vector2(v[k].x + dy1 * (dx / dy), offset);
@@ -5218,20 +4458,20 @@ public:
 		m_numVertices = p;
 	}
 
-	void clipVerticalPlane(float offset, float clipdirection)
-	{
-		Vector2 *v  = m_vertexBuffers[m_activeVertexBuffer];
+	void clipVerticalPlane(float offset, float clipdirection) {
+		Vector2 *v = m_vertexBuffers[m_activeVertexBuffer];
 		m_activeVertexBuffer ^= 1;
 		Vector2 *v2 = m_vertexBuffers[m_activeVertexBuffer];
 		v[m_numVertices] = v[0];
-		float dx2,   dx1   = offset - v[0].x;
-		int   dx2in, dx1in = clipdirection * dx1 >= 0;
-		uint32_t  p = 0;
+		float dx2, dx1 = offset - v[0].x;
+		int dx2in, dx1in = clipdirection * dx1 >= 0;
+		uint32_t p = 0;
 		for (uint32_t k = 0; k < m_numVertices; k++) {
 			dx2 = offset - v[k + 1].x;
 			dx2in = clipdirection * dx2 >= 0;
-			if (dx1in) v2[p++] = v[k];
-			if ( dx1in + dx2in == 1 ) { // not both in/out
+			if (dx1in)
+				v2[p++] = v[k];
+			if (dx1in + dx2in == 1) { // not both in/out
 				float dx = v[k + 1].x - v[k].x;
 				float dy = v[k + 1].y - v[k].y;
 				v2[p++] = Vector2(offset, v[k].y + dx1 * (dy / dx));
@@ -5242,9 +4482,8 @@ public:
 		m_numVertices = p;
 	}
 
-	void computeArea()
-	{
-		Vector2 *v  = m_vertexBuffers[m_activeVertexBuffer];
+	void computeArea() {
+		Vector2 *v = m_vertexBuffers[m_activeVertexBuffer];
 		v[m_numVertices] = v[0];
 		m_area = 0;
 		float centroidx = 0, centroidy = 0;
@@ -5258,8 +4497,7 @@ public:
 		m_area = 0.5f * fabsf(m_area);
 	}
 
-	void clipAABox(float x0, float y0, float x1, float y1)
-	{
+	void clipAABox(float x0, float y0, float x1, float y1) {
 		clipVerticalPlane(x0, -1);
 		clipHorizontalPlane(y0, -1);
 		clipVerticalPlane(x1, 1);
@@ -5267,8 +4505,7 @@ public:
 		computeArea();
 	}
 
-	float area() const
-	{
+	float area() const {
 		return m_area;
 	}
 
@@ -5285,10 +4522,9 @@ private:
 typedef bool (*SamplingCallback)(void *param, int x, int y);
 
 /// A triangle for rasterization.
-struct Triangle
-{
-	Triangle(const Vector2 &_v0, const Vector2 &_v1, const Vector2 &_v2) : v1(_v0), v2(_v2), v3(_v1)
-	{
+struct Triangle {
+	Triangle(const Vector2 &_v0, const Vector2 &_v1, const Vector2 &_v2) :
+			v1(_v0), v2(_v2), v3(_v1), n1(0.0f), n2(0.0f), n3(0.0f) {
 		// make sure every triangle is front facing.
 		flipBackface();
 		// Compute deltas.
@@ -5296,8 +4532,7 @@ struct Triangle
 			computeUnitInwardNormals();
 	}
 
-	bool isValid()
-	{
+	bool isValid() {
 		const Vector2 e0 = v3 - v1;
 		const Vector2 e1 = v2 - v1;
 		const float area = e0.y * e1.x - e1.y * e0.x;
@@ -5305,18 +4540,17 @@ struct Triangle
 	}
 
 	// extents has to be multiple of BK_SIZE!!
-	bool drawAA(const Vector2 &extents, SamplingCallback cb, void *param)
-	{
-		const float PX_INSIDE = 1.0f/sqrtf(2.0f);
-		const float PX_OUTSIDE = -1.0f/sqrtf(2.0f);
+	bool drawAA(const Vector2 &extents, SamplingCallback cb, void *param) {
+		const float PX_INSIDE = 1.0f / sqrtf(2.0f);
+		const float PX_OUTSIDE = -1.0f / sqrtf(2.0f);
 		const float BK_SIZE = 8;
-		const float BK_INSIDE = sqrtf(BK_SIZE*BK_SIZE/2.0f);
-		const float BK_OUTSIDE = -sqrtf(BK_SIZE*BK_SIZE/2.0f);
+		const float BK_INSIDE = sqrtf(BK_SIZE * BK_SIZE / 2.0f);
+		const float BK_OUTSIDE = -sqrtf(BK_SIZE * BK_SIZE / 2.0f);
 		// Bounding rectangle
 		float minx = floorf(max(min3(v1.x, v2.x, v3.x), 0.0f));
 		float miny = floorf(max(min3(v1.y, v2.y, v3.y), 0.0f));
-		float maxx = ceilf( min(max3(v1.x, v2.x, v3.x), extents.x - 1.0f));
-		float maxy = ceilf( min(max3(v1.y, v2.y, v3.y), extents.y - 1.0f));
+		float maxx = ceilf(min(max3(v1.x, v2.x, v3.x), extents.x - 1.0f));
+		float maxy = ceilf(min(max3(v1.y, v2.y, v3.y), extents.y - 1.0f));
 		// There's no reason to align the blocks to the viewport, instead we align them to the origin of the triangle bounds.
 		minx = floorf(minx);
 		miny = floorf(miny);
@@ -5341,9 +4575,10 @@ struct Triangle
 				float bC = C2 + n2.x * xc + n2.y * yc;
 				float cC = C3 + n3.x * xc + n3.y * yc;
 				// Skip block when outside an edge
-				if ( (aC <= BK_OUTSIDE) || (bC <= BK_OUTSIDE) || (cC <= BK_OUTSIDE) ) continue;
+				if ((aC <= BK_OUTSIDE) || (bC <= BK_OUTSIDE) || (cC <= BK_OUTSIDE))
+					continue;
 				// Accept whole block when totally covered
-				if ( (aC >= BK_INSIDE) && (bC >= BK_INSIDE) && (cC >= BK_INSIDE) ) {
+				if ((aC >= BK_INSIDE) && (bC >= BK_INSIDE) && (cC >= BK_INSIDE)) {
 					for (float y = y0; y < y0 + BK_SIZE; y++) {
 						for (float x = x0; x < x0 + BK_SIZE; x++) {
 							if (!cb(param, (int)x, (int)y))
@@ -5386,10 +4621,9 @@ struct Triangle
 	}
 
 private:
-	void flipBackface()
-	{
+	void flipBackface() {
 		// check if triangle is backfacing, if so, swap two vertices
-		if ( ((v3.x - v1.x) * (v2.y - v1.y) - (v3.y - v1.y) * (v2.x - v1.x)) < 0 ) {
+		if (((v3.x - v1.x) * (v2.y - v1.y) - (v3.y - v1.y) * (v2.x - v1.x)) < 0) {
 			Vector2 hv = v1;
 			v1 = v2;
 			v2 = hv; // swap pos
@@ -5397,8 +4631,7 @@ private:
 	}
 
 	// compute unit inward normals for each edge.
-	void computeUnitInwardNormals()
-	{
+	void computeUnitInwardNormals() {
 		n1 = v1 - v2;
 		n1 = Vector2(-n1.y, n1.x);
 		n1 = n1 * (1.0f / sqrtf(dot(n1, n1)));
@@ -5416,8 +4649,7 @@ private:
 };
 
 // Process the given triangle. Returns false if rasterization was interrupted by the callback.
-static bool drawTriangle(const Vector2 &extents, const Vector2 v[3], SamplingCallback cb, void *param)
-{
+static bool drawTriangle(const Vector2 &extents, const Vector2 v[3], SamplingCallback cb, void *param) {
 	Triangle tri(v[0], v[1], v[2]);
 	// @@ It would be nice to have a conservative drawing mode that enlarges the triangle extents by one texel and is able to handle degenerate triangles.
 	// @@ Maybe the simplest thing to do would be raster triangle edges.
@@ -5432,22 +4664,19 @@ namespace segment {
 
 // - Insertion is o(n)
 // - Smallest element goes at the end, so that popping it is o(1).
-struct CostQueue
-{
-	CostQueue(uint32_t size = UINT32_MAX) : m_maxSize(size), m_pairs(MemTag::SegmentAtlasChartCandidates) {}
+struct CostQueue {
+	CostQueue(uint32_t size = UINT32_MAX) :
+			m_maxSize(size), m_pairs(MemTag::SegmentAtlasChartCandidates) {}
 
-	float peekCost() const
-	{
+	float peekCost() const {
 		return m_pairs.back().cost;
 	}
 
-	uint32_t peekFace() const
-	{
+	uint32_t peekFace() const {
 		return m_pairs.back().face;
 	}
 
-	void push(float cost, uint32_t face)
-	{
+	void push(float cost, uint32_t face) {
 		const Pair p = { cost, face };
 		if (m_pairs.isEmpty() || cost < peekCost())
 			m_pairs.push_back(p);
@@ -5464,29 +4693,25 @@ struct CostQueue
 		}
 	}
 
-	uint32_t pop()
-	{
+	uint32_t pop() {
 		XA_DEBUG_ASSERT(!m_pairs.isEmpty());
 		uint32_t f = m_pairs.back().face;
 		m_pairs.pop_back();
 		return f;
 	}
 
-	XA_INLINE void clear()
-	{
+	XA_INLINE void clear() {
 		m_pairs.clear();
 	}
 
-	XA_INLINE uint32_t count() const
-	{
+	XA_INLINE uint32_t count() const {
 		return m_pairs.size();
 	}
 
 private:
 	const uint32_t m_maxSize;
 
-	struct Pair
-	{
+	struct Pair {
 		float cost;
 		uint32_t face;
 	};
@@ -5494,25 +4719,27 @@ private:
 	Array<Pair> m_pairs;
 };
 
-struct AtlasData
-{
+struct AtlasData {
 	ChartOptions options;
 	const Mesh *mesh = nullptr;
 	Array<float> edgeDihedralAngles;
 	Array<float> edgeLengths;
 	Array<float> faceAreas;
+	Array<float> faceUvAreas; // Can be negative.
 	Array<Vector3> faceNormals;
 	BitArray isFaceInChart;
 
-	AtlasData() : edgeDihedralAngles(MemTag::SegmentAtlasMeshData), edgeLengths(MemTag::SegmentAtlasMeshData), faceAreas(MemTag::SegmentAtlasMeshData), faceNormals(MemTag::SegmentAtlasMeshData) {}
+	AtlasData() :
+			edgeDihedralAngles(MemTag::SegmentAtlasMeshData), edgeLengths(MemTag::SegmentAtlasMeshData), faceAreas(MemTag::SegmentAtlasMeshData), faceNormals(MemTag::SegmentAtlasMeshData) {}
 
-	void compute()
-	{
+	void compute() {
 		const uint32_t faceCount = mesh->faceCount();
 		const uint32_t edgeCount = mesh->edgeCount();
 		edgeDihedralAngles.resize(edgeCount);
 		edgeLengths.resize(edgeCount);
 		faceAreas.resize(faceCount);
+		if (options.useInputMeshUvs)
+			faceUvAreas.resize(faceCount);
 		faceNormals.resize(faceCount);
 		isFaceInChart.resize(faceCount);
 		isFaceInChart.zeroOutMemory();
@@ -5526,6 +4753,8 @@ struct AtlasData
 			}
 			faceAreas[f] = mesh->computeFaceArea(f);
 			XA_DEBUG_ASSERT(faceAreas[f] > 0.0f);
+			if (options.useInputMeshUvs)
+				faceUvAreas[f] = mesh->computeFaceParametricArea(f);
 			faceNormals[f] = mesh->computeFaceNormal(f);
 		}
 		for (uint32_t face = 0; face < faceCount; face++) {
@@ -5543,19 +4772,109 @@ struct AtlasData
 	}
 };
 
+// If MeshDecl::vertexUvData is set on input meshes, find charts by floodfilling faces in world/model space without crossing UV seams.
+struct OriginalUvCharts {
+	OriginalUvCharts(AtlasData &data) :
+			m_data(data) {}
+	uint32_t chartCount() const { return m_charts.size(); }
+	const Basis &chartBasis(uint32_t chartIndex) const { return m_chartBasis[chartIndex]; }
+
+	// Faces of one chart, as a view into the shared m_chartFaces array (each chart owns a contiguous range).
+	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const {
+		const Chart &chart = m_charts[chartIndex];
+		return ConstArrayView<uint32_t>(&m_chartFaces[chart.firstFace], chart.faceCount);
+	}
+
+	// Rebuilds all charts: every face that is not in a chart yet and has a non-zero UV area seeds a chart,
+	// which is then grown by floodfillFaces. Reads m_data.faceUvAreas, which AtlasData::compute only fills
+	// when options.useInputMeshUvs is set. Every chart face gets marked in m_data.isFaceInChart.
+	void compute() {
+		m_charts.clear();
+		m_chartFaces.clear();
+		const Mesh *mesh = m_data.mesh;
+		const uint32_t faceCount = mesh->faceCount();
+		for (uint32_t f = 0; f < faceCount; f++) {
+			if (m_data.isFaceInChart.get(f))
+				continue;
+			if (isZero(m_data.faceUvAreas[f], kAreaEpsilon))
+				continue; // Face must have valid UVs.
+			// Found an unassigned face, create a new chart.
+			Chart chart;
+			chart.firstFace = m_chartFaces.size();
+			chart.faceCount = 1;
+			m_chartFaces.push_back(f);
+			m_data.isFaceInChart.set(f);
+			floodfillFaces(chart);
+			m_charts.push_back(chart);
+		}
+		// Compute basis for each chart.
+		m_chartBasis.resize(m_charts.size());
+		for (uint32_t c = 0; c < m_charts.size(); c++) {
+			const Chart &chart = m_charts[c];
+			m_tempPoints.resize(chart.faceCount * 3);
+			for (uint32_t f = 0; f < chart.faceCount; f++) {
+				const uint32_t face = m_chartFaces[chart.firstFace + f];
+				for (uint32_t i = 0; i < 3; i++)
+					m_tempPoints[f * 3 + i] = m_data.mesh->position(m_data.mesh->vertexAt(face * 3 + i));
+			}
+			Fit::computeBasis(m_tempPoints, &m_chartBasis[c]); // NOTE(review): result is ignored - confirm a failed fit is acceptable here.
+		}
+	}
+
+private:
+	struct Chart {
+		uint32_t firstFace, faceCount; // Range [firstFace, firstFace + faceCount) in m_chartFaces.
+	};
+
+	// Grows the chart from its seed face across edges whose UVs agree on both sides.
+	// Accepted faces are appended to m_chartFaces and bump chart.faceCount, so this single loop reaches them too.
+	// A face is expanded only once: every reason to reject a neighbour is permanent, so rescanning adds nothing.
+	void floodfillFaces(Chart &chart) {
+		const bool isFaceAreaNegative = m_data.faceUvAreas[m_chartFaces[chart.firstFace]] < 0.0f;
+		for (uint32_t f = 0; f < chart.faceCount; f++) {
+			const uint32_t sourceFace = m_chartFaces[chart.firstFace + f];
+			for (Mesh::FaceEdgeIterator edgeIt(m_data.mesh, sourceFace); !edgeIt.isDone(); edgeIt.advance()) {
+				const uint32_t face = edgeIt.oppositeFace();
+				if (face == UINT32_MAX)
+					continue; // Boundary edge.
+				if (m_data.isFaceInChart.get(face))
+					continue; // Already assigned to a chart.
+				if (isZero(m_data.faceUvAreas[face], kAreaEpsilon))
+					continue; // Face must have valid UVs.
+				if ((m_data.faceUvAreas[face] < 0.0f) != isFaceAreaNegative)
+					continue; // Face winding is opposite of the first chart face.
+				const Vector2 &uv0 = m_data.mesh->texcoord(edgeIt.vertex0());
+				const Vector2 &uv1 = m_data.mesh->texcoord(edgeIt.vertex1());
+				const Vector2 &ouv0 = m_data.mesh->texcoord(m_data.mesh->vertexAt(meshEdgeIndex0(edgeIt.oppositeEdge())));
+				const Vector2 &ouv1 = m_data.mesh->texcoord(m_data.mesh->vertexAt(meshEdgeIndex1(edgeIt.oppositeEdge())));
+				if (!equal(uv0, ouv1, m_data.mesh->epsilon()) || !equal(uv1, ouv0, m_data.mesh->epsilon()))
+					continue; // UVs on both sides of the shared edge must match (within the mesh epsilon).
+				m_chartFaces.push_back(face);
+				chart.faceCount++;
+				m_data.isFaceInChart.set(face);
+			}
+		}
+	}
+
+	AtlasData &m_data;
+	Array<Chart> m_charts;
+	Array<Basis> m_chartBasis;
+	Array<uint32_t> m_chartFaces;
+	Array<Vector3> m_tempPoints;
+};
+
 #if XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS
 static uint32_t s_planarRegionsCurrentRegion;
 static uint32_t s_planarRegionsCurrentVertex;
 #endif
 
-struct PlanarCharts
-{
-	PlanarCharts(AtlasData &data) : m_data(data), m_nextRegionFace(MemTag::SegmentAtlasPlanarRegions), m_faceToRegionId(MemTag::SegmentAtlasPlanarRegions) {}
+struct PlanarCharts {
+	PlanarCharts(AtlasData &data) :
+			m_data(data), m_nextRegionFace(MemTag::SegmentAtlasPlanarRegions), m_faceToRegionId(MemTag::SegmentAtlasPlanarRegions) {}
 	const Basis &chartBasis(uint32_t chartIndex) const { return m_chartBasis[chartIndex]; }
 	uint32_t chartCount() const { return m_charts.size(); }
-	
-	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const
-	{
+
+	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const {
 		const Chart &chart = m_charts[chartIndex];
 		return ConstArrayView<uint32_t>(&m_chartFaces[chart.firstFace], chart.faceCount);
 	}
@@ -5564,8 +4883,7 @@ struct PlanarCharts
 	uint32_t nextRegionFace(uint32_t face) const { return m_nextRegionFace[face]; }
 	float regionArea(uint32_t region) const { return m_regionAreas[region]; }
 
-	void compute()
-	{
+	void compute() {
 		const uint32_t faceCount = m_data.mesh->faceCount();
 		// Precompute regions of coplanar incident faces.
 		m_regionFirstFace.clear();
@@ -5581,6 +4899,8 @@ struct PlanarCharts
 		for (uint32_t f = 0; f < faceCount; f++) {
 			if (m_nextRegionFace[f] != f)
 				continue; // Already assigned.
+			if (m_data.isFaceInChart.get(f))
+				continue; // Already in a chart.
 			faceStack.clear();
 			faceStack.push_back(f);
 			for (;;) {
@@ -5595,6 +4915,8 @@ struct PlanarCharts
 						continue;
 					if (m_nextRegionFace[oface] != oface)
 						continue; // Already assigned.
+					if (m_data.isFaceInChart.get(oface))
+						continue; // Already in a chart.
 					if (!equal(dot(m_data.faceNormals[face], m_data.faceNormals[oface]), 1.0f, kEpsilon))
 						continue; // Not coplanar.
 					const uint32_t next = m_nextRegionFace[face];
@@ -5632,8 +4954,11 @@ struct PlanarCharts
 		// Precompute planar region areas.
 		m_regionAreas.resize(regionCount);
 		m_regionAreas.zeroOutMemory();
-		for (uint32_t f = 0; f < faceCount; f++)
+		for (uint32_t f = 0; f < faceCount; f++) {
+			if (m_faceToRegionId[f] == UINT32_MAX)
+				continue;
 			m_regionAreas[m_faceToRegionId[f]] += m_data.faceAreas[f];
+		}
 		// Create charts from suitable planar regions.
 		// The dihedral angle of all boundary edges must be >= 90 degrees.
 		m_charts.clear();
@@ -5658,8 +4983,7 @@ struct PlanarCharts
 				if (!createChart)
 					break;
 				face = m_nextRegionFace[face];
-			}
-			while (face != firstRegionFace);
+			} while (face != firstRegionFace);
 			// Create a chart.
 			if (createChart) {
 				Chart chart;
@@ -5671,15 +4995,13 @@ struct PlanarCharts
 					m_chartFaces.push_back(face);
 					chart.faceCount++;
 					face = m_nextRegionFace[face];
-				}
-				while (face != firstRegionFace);
+				} while (face != firstRegionFace);
 				m_charts.push_back(chart);
 			}
 		}
 		// Compute basis for each chart using the first face normal (all faces have the same normal).
 		m_chartBasis.resize(m_charts.size());
-		for (uint32_t c = 0; c < m_charts.size(); c++)
-		{
+		for (uint32_t c = 0; c < m_charts.size(); c++) {
 			const uint32_t face = m_chartFaces[m_charts[c].firstFace];
 			Basis &basis = m_chartBasis[c];
 			basis.normal = m_data.faceNormals[face];
@@ -5689,8 +5011,7 @@ struct PlanarCharts
 	}
 
 private:
-	struct Chart
-	{
+	struct Chart {
 		uint32_t firstFace, faceCount;
 	};
 
@@ -5704,12 +5025,11 @@ private:
 	Array<Basis> m_chartBasis;
 };
 
-struct ClusteredCharts
-{
-	ClusteredCharts(AtlasData &data, const PlanarCharts &planarCharts) : m_data(data), m_planarCharts(planarCharts), m_texcoords(MemTag::SegmentAtlasMeshData), m_bestTriangles(10), m_placingSeeds(false) {}
+struct ClusteredCharts {
+	ClusteredCharts(AtlasData &data, const PlanarCharts &planarCharts) :
+			m_data(data), m_planarCharts(planarCharts), m_texcoords(MemTag::SegmentAtlasMeshData), m_bestTriangles(10), m_placingSeeds(false) {}
 
-	~ClusteredCharts()
-	{
+	~ClusteredCharts() {
 		const uint32_t chartCount = m_charts.size();
 		for (uint32_t i = 0; i < chartCount; i++) {
 			m_charts[i]->~Chart();
@@ -5721,8 +5041,7 @@ struct ClusteredCharts
 	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const { return m_charts[chartIndex]->faces; }
 	const Basis &chartBasis(uint32_t chartIndex) const { return m_charts[chartIndex]->basis; }
 
-	void compute()
-	{
+	void compute() {
 		const uint32_t faceCount = m_data.mesh->faceCount();
 		m_facesLeft = 0;
 		for (uint32_t i = 0; i < faceCount; i++) {
@@ -5768,9 +5087,9 @@ struct ClusteredCharts
 	}
 
 private:
-	struct Chart
-	{
-		Chart() : faces(MemTag::SegmentAtlasChartFaces) {}
+	struct Chart {
+		Chart() :
+				faces(MemTag::SegmentAtlasChartFaces) {}
 
 		int id = -1;
 		Basis basis; // Best fit normal.
@@ -5784,8 +5103,7 @@ private:
 		uint32_t seed;
 	};
 
-	void placeSeeds(float threshold)
-	{
+	void placeSeeds(float threshold) {
 		XA_PROFILE_START(clusteredChartsPlaceSeeds)
 		m_placingSeeds = true;
 		// Instead of using a predefiened number of seeds:
@@ -5801,8 +5119,7 @@ private:
 	}
 
 	// Returns true if any of the charts can grow more.
-	void growCharts(float threshold)
-	{
+	void growCharts(float threshold) {
 		XA_PROFILE_START(clusteredChartsGrow)
 		for (;;) {
 			if (m_facesLeft == 0)
@@ -5848,8 +5165,7 @@ private:
 		XA_PROFILE_END(clusteredChartsGrow)
 	}
 
-	void resetCharts()
-	{
+	void resetCharts() {
 		XA_PROFILE_START(clusteredChartsReset)
 		const uint32_t faceCount = m_data.mesh->faceCount();
 		for (uint32_t i = 0; i < faceCount; i++) {
@@ -5880,8 +5196,7 @@ private:
 		XA_PROFILE_END(clusteredChartsReset)
 	}
 
-	bool relocateSeeds()
-	{
+	bool relocateSeeds() {
 		XA_PROFILE_START(clusteredChartsRelocateSeeds)
 		bool anySeedChanged = false;
 		const uint32_t chartCount = m_charts.size();
@@ -5894,8 +5209,7 @@ private:
 		return anySeedChanged;
 	}
 
-	void fillHoles(float threshold)
-	{
+	void fillHoles(float threshold) {
 		XA_PROFILE_START(clusteredChartsFillHoles)
 		while (m_facesLeft > 0)
 			createChart(threshold);
@@ -5903,8 +5217,7 @@ private:
 	}
 
 #if XA_MERGE_CHARTS
-	void mergeCharts()
-	{
+	void mergeCharts() {
 		XA_PROFILE_START(clusteredChartsMerge)
 		const uint32_t chartCount = m_charts.size();
 		// Merge charts progressively until there's none left to merge.
@@ -5964,7 +5277,7 @@ private:
 					// Merge if chart2 has a single face.
 					// chart1 must have more than 1 face.
 					// chart2 area must be <= 10% of chart1 area.
-					if (m_sharedBoundaryLengthsNoSeams[cc] > 0.0f && chart->faces.size() > 1 && chart2->faces.size() == 1 && chart2->area <= chart->area * 0.1f) 
+					if (m_sharedBoundaryLengthsNoSeams[cc] > 0.0f && chart->faces.size() > 1 && chart2->faces.size() == 1 && chart2->area <= chart->area * 0.1f)
 						goto merge;
 					// Merge if chart2 has two faces (probably a quad), and chart1 bounds at least 2 of its edges.
 					if (chart2->faces.size() == 2 && m_sharedBoundaryEdgeCountNoSeams[cc] >= 2)
@@ -5972,8 +5285,8 @@ private:
 					// Merge if chart2 is wholely inside chart1, ignoring seams.
 					if (m_sharedBoundaryLengthsNoSeams[cc] > 0.0f && equal(m_sharedBoundaryLengthsNoSeams[cc], chart2->boundaryLength, kEpsilon))
 						goto merge;
-					if (m_sharedBoundaryLengths[cc] > 0.2f * max(0.0f, chart->boundaryLength - externalBoundaryLength) || 
-						m_sharedBoundaryLengths[cc] > 0.75f * chart2->boundaryLength)
+					if (m_sharedBoundaryLengths[cc] > 0.2f * max(0.0f, chart->boundaryLength - externalBoundaryLength) ||
+							m_sharedBoundaryLengths[cc] > 0.75f * chart2->boundaryLength)
 						goto merge;
 					continue;
 				merge:
@@ -6011,8 +5324,7 @@ private:
 #endif
 
 private:
-	void createChart(float threshold)
-	{
+	void createChart(float threshold) {
 		Chart *chart = XA_NEW(MemTag::Default, Chart);
 		chart->id = (int)m_charts.size();
 		m_charts.push_back(chart);
@@ -6043,15 +5355,13 @@ private:
 		}
 	}
 
-	bool isChartBoundaryEdge(const Chart *chart, uint32_t edge) const
-	{
+	bool isChartBoundaryEdge(const Chart *chart, uint32_t edge) const {
 		const uint32_t oppositeEdge = m_data.mesh->oppositeEdge(edge);
 		const uint32_t oppositeFace = meshEdgeFace(oppositeEdge);
 		return oppositeEdge == UINT32_MAX || m_faceCharts[oppositeFace] != chart->id;
 	}
 
-	bool computeChartBasis(Chart *chart, Basis *basis)
-	{
+	bool computeChartBasis(Chart *chart, Basis *basis) {
 		const uint32_t faceCount = chart->faces.size();
 		m_tempPoints.resize(chart->faces.size() * 3);
 		for (uint32_t i = 0; i < faceCount; i++) {
@@ -6059,11 +5369,10 @@ private:
 			for (uint32_t j = 0; j < 3; j++)
 				m_tempPoints[i * 3 + j] = m_data.mesh->position(m_data.mesh->vertexAt(f * 3 + j));
 		}
-		return Fit::computeBasis(m_tempPoints.data(), m_tempPoints.size(), basis);
+		return Fit::computeBasis(m_tempPoints, basis);
 	}
 
-	bool isFaceFlipped(uint32_t face) const
-	{
+	bool isFaceFlipped(uint32_t face) const {
 		const Vector2 &v1 = m_texcoords[face * 3 + 0];
 		const Vector2 &v2 = m_texcoords[face * 3 + 1];
 		const Vector2 &v3 = m_texcoords[face * 3 + 2];
@@ -6071,8 +5380,7 @@ private:
 		return parametricArea < 0.0f;
 	}
 
-	void parameterizeChart(const Chart *chart)
-	{
+	void parameterizeChart(const Chart *chart) {
 		const uint32_t faceCount = chart->faces.size();
 		for (uint32_t i = 0; i < faceCount; i++) {
 			const uint32_t face = chart->faces[i];
@@ -6085,8 +5393,7 @@ private:
 	}
 
 	// m_faceCharts for the chart faces must be set to the chart ID. Needed to compute boundary edges.
-	bool isChartParameterizationValid(const Chart *chart)
-	{
+	bool isChartParameterizationValid(const Chart *chart) {
 		const uint32_t faceCount = chart->faces.size();
 		// Check for flipped faces in the parameterization. OK if all are flipped.
 		uint32_t flippedFaceCount = 0;
@@ -6099,7 +5406,7 @@ private:
 		// Check for boundary intersection in the parameterization.
 		XA_PROFILE_START(clusteredChartsPlaceSeedsBoundaryIntersection)
 		XA_PROFILE_START(clusteredChartsGrowBoundaryIntersection)
-		m_boundaryGrid.reset(m_texcoords.data());
+		m_boundaryGrid.reset(m_texcoords);
 		for (uint32_t i = 0; i < faceCount; i++) {
 			const uint32_t f = chart->faces[i];
 			for (uint32_t j = 0; j < 3; j++) {
@@ -6120,15 +5427,14 @@ private:
 		return true;
 	}
 
-	bool addFaceToChart(Chart *chart, uint32_t face)
-	{
+	bool addFaceToChart(Chart *chart, uint32_t face) {
 		XA_DEBUG_ASSERT(!m_data.isFaceInChart.get(face));
 		const uint32_t oldFaceCount = chart->faces.size();
 		const bool firstFace = oldFaceCount == 0;
 		// Append the face and any coplanar connected faces to the chart faces array.
 		chart->faces.push_back(face);
 		uint32_t coplanarFace = m_planarCharts.nextRegionFace(face);
-		while (coplanarFace != face) { 
+		while (coplanarFace != face) {
 			XA_DEBUG_ASSERT(!m_data.isFaceInChart.get(coplanarFace));
 			chart->faces.push_back(coplanarFace);
 			coplanarFace = m_planarCharts.nextRegionFace(coplanarFace);
@@ -6140,7 +5446,7 @@ private:
 			// Use the first face normal.
 			// Use any edge as the tangent vector.
 			basis.normal = m_data.faceNormals[face];
-			basis.tangent = normalize(m_data.mesh->position(m_data.mesh->vertexAt(face * 3 + 0)) - m_data.mesh->position(m_data.mesh->vertexAt(face * 3 + 1)), kEpsilon);
+			basis.tangent = normalize(m_data.mesh->position(m_data.mesh->vertexAt(face * 3 + 0)) - m_data.mesh->position(m_data.mesh->vertexAt(face * 3 + 1)));
 			basis.bitangent = cross(basis.normal, basis.tangent);
 		} else {
 			// Use best fit normal.
@@ -6199,8 +5505,7 @@ private:
 	}
 
 	// Returns true if the seed has changed.
-	bool relocateSeed(Chart *chart)
-	{
+	bool relocateSeed(Chart *chart) {
 		// Find the first N triangles that fit the proxy best.
 		const uint32_t faceCount = chart->faces.size();
 		m_bestTriangles.clear();
@@ -6230,8 +5535,7 @@ private:
 	}
 
 	// Cost is combined metrics * weights.
-	float computeCost(Chart *chart, uint32_t face) const
-	{
+	float computeCost(Chart *chart, uint32_t face) const {
 		// Estimate boundary length and area:
 		const float newChartArea = computeArea(chart, face);
 		const float newBoundaryLength = computeBoundaryLength(chart, face);
@@ -6267,28 +5571,25 @@ private:
 	// Returns a value in [0-1].
 	// 0 if face normal is coplanar to the chart's best fit normal.
 	// 1 if face normal is perpendicular.
-	float computeNormalDeviationMetric(Chart *chart, uint32_t face) const
-	{
+	float computeNormalDeviationMetric(Chart *chart, uint32_t face) const {
 		// All faces in coplanar regions have the same normal, can use any face.
 		const Vector3 faceNormal = m_data.faceNormals[face];
 		// Use plane fitting metric for now:
 		return min(1.0f - dot(faceNormal, chart->basis.normal), 1.0f); // @@ normal deviations should be weighted by face area
 	}
 
-	float computeRoundnessMetric(Chart *chart, float newBoundaryLength, float newChartArea) const
-	{
+	float computeRoundnessMetric(Chart *chart, float newBoundaryLength, float newChartArea) const {
 		const float oldRoundness = square(chart->boundaryLength) / chart->area;
 		const float newRoundness = square(newBoundaryLength) / newChartArea;
 		return 1.0f - oldRoundness / newRoundness;
 	}
 
-	float computeStraightnessMetric(Chart *chart, uint32_t firstFace) const
-	{
+	float computeStraightnessMetric(Chart *chart, uint32_t firstFace) const {
 		float l_out = 0.0f; // Length of firstFace planar region boundary that doesn't border the chart.
 		float l_in = 0.0f; // Length that does border the chart.
 		const uint32_t planarRegionId = m_planarCharts.regionIdFromFace(firstFace);
 		uint32_t face = firstFace;
-		for (;;) { 
+		for (;;) {
 			for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
 				const float l = m_data.edgeLengths[it.edge()];
 				if (it.isBoundary()) {
@@ -6305,7 +5606,6 @@ private:
 				break;
 		}
 #if 1
-		XA_DEBUG_ASSERT(l_in != 0.0f); // Candidate face must be adjacent to chart. @@ This is not true if the input mesh has zero-length edges.
 		float ratio = (l_out - l_in) / (l_out + l_in);
 		return min(ratio, 0.0f); // Only use the straightness metric to close gaps.
 #else
@@ -6313,8 +5613,7 @@ private:
 #endif
 	}
 
-	bool isNormalSeam(uint32_t edge) const
-	{
+	bool isNormalSeam(uint32_t edge) const {
 		const uint32_t oppositeEdge = m_data.mesh->oppositeEdge(edge);
 		if (oppositeEdge == UINT32_MAX)
 			return false; // boundary edge
@@ -6334,11 +5633,10 @@ private:
 		return !equal(m_data.faceNormals[f0], m_data.faceNormals[f1], kNormalEpsilon);
 	}
 
-	float computeNormalSeamMetric(Chart *chart, uint32_t firstFace) const
-	{
+	float computeNormalSeamMetric(Chart *chart, uint32_t firstFace) const {
 		float seamFactor = 0.0f, totalLength = 0.0f;
 		uint32_t face = firstFace;
-		for (;;) { 
+		for (;;) {
 			for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
 				if (it.isBoundary())
 					continue;
@@ -6375,11 +5673,10 @@ private:
 		return seamFactor / totalLength;
 	}
 
-	float computeTextureSeamMetric(Chart *chart, uint32_t firstFace) const
-	{
+	float computeTextureSeamMetric(Chart *chart, uint32_t firstFace) const {
 		float seamLength = 0.0f, totalLength = 0.0f;
 		uint32_t face = firstFace;
-		for (;;) { 
+		for (;;) {
 			for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
 				if (it.isBoundary())
 					continue;
@@ -6402,11 +5699,10 @@ private:
 		return seamLength / totalLength;
 	}
 
-	float computeArea(Chart *chart, uint32_t firstFace) const
-	{
+	float computeArea(Chart *chart, uint32_t firstFace) const {
 		float area = chart->area;
 		uint32_t face = firstFace;
-		for (;;) { 
+		for (;;) {
 			area += m_data.faceAreas[face];
 			face = m_planarCharts.nextRegionFace(face);
 			if (face == firstFace)
@@ -6415,13 +5711,12 @@ private:
 		return area;
 	}
 
-	float computeBoundaryLength(Chart *chart, uint32_t firstFace) const
-	{
+	float computeBoundaryLength(Chart *chart, uint32_t firstFace) const {
 		float boundaryLength = chart->boundaryLength;
 		// Add new edges, subtract edges shared with the chart.
 		const uint32_t planarRegionId = m_planarCharts.regionIdFromFace(firstFace);
 		uint32_t face = firstFace;
-		for (;;) { 
+		for (;;) {
 			for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
 				const float edgeLength = m_data.edgeLengths[it.edge()];
 				if (it.isBoundary()) {
@@ -6437,11 +5732,10 @@ private:
 			if (face == firstFace)
 				break;
 		}
-		return max(0.0f, boundaryLength);  // @@ Hack!
+		return max(0.0f, boundaryLength); // @@ Hack!
 	}
 
-	bool mergeChart(Chart *owner, Chart *chart, float sharedBoundaryLength)
-	{
+	bool mergeChart(Chart *owner, Chart *chart, float sharedBoundaryLength) {
 		const uint32_t oldOwnerFaceCount = owner->faces.size();
 		const uint32_t chartFaceCount = chart->faces.size();
 		owner->faces.push_back(chart->faces);
@@ -6499,33 +5793,53 @@ private:
 	bool m_placingSeeds;
 };
 
-struct Atlas
-{
-	Atlas() : m_planarCharts(m_data), m_clusteredCharts(m_data, m_planarCharts) {}
+struct ChartGeneratorType {
+	enum Enum {
+		OriginalUv, // Chart index falls in the OriginalUvCharts range (see Atlas::chartGeneratorType).
+		Planar, // Chart index falls in the PlanarCharts range.
+		Clustered, // Any remaining chart index.
+		Piecewise // Not returned by Atlas::chartGeneratorType. NOTE(review): presumably assigned elsewhere - confirm.
+	};
+};
 
-	uint32_t chartCount() const
-	{
-		return m_planarCharts.chartCount() + m_clusteredCharts.chartCount();
+struct Atlas {
+	Atlas() :
+			m_originalUvCharts(m_data), m_planarCharts(m_data), m_clusteredCharts(m_data, m_planarCharts) {}
+
+	uint32_t chartCount() const {
+		return m_originalUvCharts.chartCount() + m_planarCharts.chartCount() + m_clusteredCharts.chartCount();
 	}
 
-	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const
-	{
+	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const {
+		if (chartIndex < m_originalUvCharts.chartCount())
+			return m_originalUvCharts.chartFaces(chartIndex);
+		chartIndex -= m_originalUvCharts.chartCount();
 		if (chartIndex < m_planarCharts.chartCount())
 			return m_planarCharts.chartFaces(chartIndex);
 		chartIndex -= m_planarCharts.chartCount();
 		return m_clusteredCharts.chartFaces(chartIndex);
 	}
 
-	const Basis &chartBasis(uint32_t chartIndex) const
-	{
+	const Basis &chartBasis(uint32_t chartIndex) const {
+		if (chartIndex < m_originalUvCharts.chartCount())
+			return m_originalUvCharts.chartBasis(chartIndex);
+		chartIndex -= m_originalUvCharts.chartCount();
 		if (chartIndex < m_planarCharts.chartCount())
 			return m_planarCharts.chartBasis(chartIndex);
 		chartIndex -= m_planarCharts.chartCount();
 		return m_clusteredCharts.chartBasis(chartIndex);
 	}
 
-	void reset(const Mesh *mesh, const ChartOptions &options)
-	{
+	// Chart indices are laid out as [original UV charts][planar charts][clustered charts].
+	ChartGeneratorType::Enum chartGeneratorType(uint32_t chartIndex) const {
+		const uint32_t originalUvCount = m_originalUvCharts.chartCount();
+		if (chartIndex < originalUvCount)
+			return ChartGeneratorType::OriginalUv;
+		const bool isPlanar = (chartIndex - originalUvCount) < m_planarCharts.chartCount();
+		return isPlanar ? ChartGeneratorType::Planar : ChartGeneratorType::Clustered;
+	}
+
+	void reset(const Mesh *mesh, const ChartOptions &options) {
 		XA_PROFILE_START(buildAtlasInit)
 		m_data.options = options;
 		m_data.mesh = mesh;
@@ -6533,8 +5847,12 @@ struct Atlas
 		XA_PROFILE_END(buildAtlasInit)
 	}
 
-	void compute()
-	{
+	void compute() {
+		if (m_data.options.useInputMeshUvs) {
+			XA_PROFILE_START(originalUvCharts)
+			m_originalUvCharts.compute();
+			XA_PROFILE_END(originalUvCharts)
+		}
 		XA_PROFILE_START(planarCharts)
 		m_planarCharts.compute();
 		XA_PROFILE_END(planarCharts)
@@ -6545,17 +5863,143 @@ struct Atlas
 
 private:
 	AtlasData m_data;
+	OriginalUvCharts m_originalUvCharts;
 	PlanarCharts m_planarCharts;
 	ClusteredCharts m_clusteredCharts;
 };
 
+struct ComputeUvMeshChartsTaskArgs {
+	UvMesh *mesh; // Mesh whose faces the task groups into charts.
+	Progress *progress; // Progress/cancel tracker; computeUvMeshCharts points every task at the same instance.
+};
+
+// Charts are found by floodfilling faces without crossing UV seams.
+struct ComputeUvMeshChartsTask {
+	// Instantiated and run once per task by runComputeUvMeshChartsTask.
+	// The UV -> edge map is sized by the mesh index count, the assigned-face bits by the face count.
+	ComputeUvMeshChartsTask(ComputeUvMeshChartsTaskArgs *args) :
+			m_mesh(args->mesh), m_progress(args->progress), m_uvToEdgeMap(MemTag::Default, m_mesh->indices.size()), m_faceAssigned(m_mesh->indices.size() / 3) {}
+
+	// Groups the faces of m_mesh into charts: appends one UvMeshChart per chart to m_mesh->charts
+	// and records the owning chart of every used vertex in m_mesh->vertexToChartMap.
+	// Stops early, keeping the charts built so far, as soon as m_progress->cancel is set.
+	void run() {
+		const uint32_t numIndices = m_mesh->indices.size();
+		const uint32_t numFaces = numIndices / 3;
+		// A vertex can only be assigned to one chart; UINT32_MAX means "not assigned yet".
+		m_mesh->vertexToChartMap.resize(m_mesh->texcoords.size());
+		m_mesh->vertexToChartMap.fill(UINT32_MAX);
+		// Map vertex UV to edge. Face is then edge / 3.
+		for (uint32_t e = 0; e < numIndices; e++)
+			m_uvToEdgeMap.add(m_mesh->texcoords[m_mesh->indices[e]]);
+		// Find charts.
+		m_faceAssigned.zeroOutMemory();
+		for (uint32_t seedFace = 0; seedFace < numFaces; seedFace++) {
+			if (m_progress->cancel)
+				return;
+			m_progress->increment(1);
+			// A new chart would take the next free index; see if this face can seed it.
+			const uint32_t chartIndex = m_mesh->charts.size();
+			if (!canAddFaceToChart(chartIndex, seedFace))
+				continue;
+			// Face is OK, create a new chart with the face.
+			UvMeshChart *chart = XA_NEW(MemTag::Default, UvMeshChart);
+			m_mesh->charts.push_back(chart);
+			if (m_mesh->faceMaterials.isEmpty())
+				chart->material = 0;
+			else
+				chart->material = m_mesh->faceMaterials[seedFace];
+			addFaceToChart(chartIndex, seedFace);
+			// Walk the chart faces in insertion order. Faces appended along the way extend
+			// chart->faces, so they are reached by this same loop.
+			for (uint32_t visited = 0; visited < chart->faces.size(); visited++) {
+				const uint32_t chartFace = chart->faces[visited];
+				for (uint32_t corner = 0; corner < 3; corner++) {
+					// Add any valid faces with colocal UVs to the chart.
+					const Vector2 &uv = m_mesh->texcoords[m_mesh->indices[chartFace * 3 + corner]];
+					for (uint32_t edge = m_uvToEdgeMap.get(uv); edge != UINT32_MAX; edge = m_uvToEdgeMap.getNext(uv, edge)) {
+						const uint32_t candidate = edge / 3;
+						if (canAddFaceToChart(chartIndex, candidate))
+							addFaceToChart(chartIndex, candidate);
+					}
+				}
+			}
+		}
+	}
+
+private:
+	// Tells whether face may join the chart at chartIndex.
+	// The chart at chartIndex doesn't have to exist yet.
+	bool canAddFaceToChart(uint32_t chartIndex, uint32_t face) const {
+		if (m_faceAssigned.get(face))
+			return false; // Already assigned to a chart.
+		if (m_mesh->faceIgnore.get(face))
+			return false; // Face is ignored (zero area or nan UVs).
+		const bool chartExists = chartIndex < m_mesh->charts.size();
+		if (chartExists && !m_mesh->faceMaterials.isEmpty() && m_mesh->faceMaterials[face] != m_mesh->charts[chartIndex]->material)
+			return false; // Materials don't match.
+		for (uint32_t corner = 0; corner < 3; corner++) {
+			const uint32_t owner = m_mesh->vertexToChartMap[m_mesh->indices[face * 3 + corner]];
+			if (owner != UINT32_MAX && owner != chartIndex)
+				return false; // Vertex already assigned to another chart.
+		}
+		return true;
+	}
+
+	// Marks face as assigned, appends it and its three vertex indices to the chart, and maps those vertices to chartIndex.
+	void addFaceToChart(uint32_t chartIndex, uint32_t face) {
+		UvMeshChart *chart = m_mesh->charts[chartIndex];
+		m_faceAssigned.set(face);
+		chart->faces.push_back(face);
+		for (uint32_t corner = 0; corner < 3; corner++) {
+			const uint32_t vertex = m_mesh->indices[face * 3 + corner];
+			m_mesh->vertexToChartMap[vertex] = chartIndex;
+			chart->indices.push_back(vertex);
+		}
+	}
+
+	// Mesh being charted and the progress tracker, both taken from the task args.
+	UvMesh *const m_mesh;
+	Progress *const m_progress;
+	HashMap<Vector2> m_uvToEdgeMap; // Face is edge / 3.
+	// One bit per face, set once the face has been added to a chart.
+	BitArray m_faceAssigned;
+};
+
+static void runComputeUvMeshChartsTask(void * /*groupUserData*/, void *taskUserData) {
+	XA_PROFILE_START(computeChartsThread)
+	ComputeUvMeshChartsTask chartsTask(static_cast<ComputeUvMeshChartsTaskArgs *>(taskUserData)); // userData is set by computeUvMeshCharts.
+	chartsTask.run();
+	XA_PROFILE_END(computeChartsThread)
+}
+
+// Runs one chart-finding task per UV mesh and waits for all of them. Returns false if progress.cancel ended up set.
+static bool computeUvMeshCharts(TaskScheduler *taskScheduler, ArrayView<UvMesh *> meshes, ProgressFunc progressFunc, void *progressUserData) {
+	uint32_t faceTotal = 0; // Progress is counted in faces across all meshes.
+	for (uint32_t m = 0; m < meshes.length; m++)
+		faceTotal += meshes[m]->indices.size() / 3;
+	Progress progress(ProgressCategory::ComputeCharts, progressFunc, progressUserData, faceTotal);
+	TaskGroupHandle group = taskScheduler->createTaskGroup(nullptr, meshes.length);
+	Array<ComputeUvMeshChartsTaskArgs> perMeshArgs; // Each task gets a pointer into this array, so it is fully sized before any task starts.
+	perMeshArgs.resize(meshes.length);
+	for (uint32_t m = 0; m < meshes.length; m++) {
+		perMeshArgs[m].mesh = meshes[m];
+		perMeshArgs[m].progress = &progress;
+		Task task;
+		task.func = runComputeUvMeshChartsTask;
+		task.userData = &perMeshArgs[m];
+		taskScheduler->run(group, task);
+	}
+	taskScheduler->wait(&group);
+	return !progress.cancel;
+}
+
 } // namespace segment
 
 namespace param {
 
 // Fast sweep in 3 directions
-static bool findApproximateDiameterVertices(Mesh *mesh, uint32_t *a, uint32_t *b)
-{
+static bool findApproximateDiameterVertices(Mesh *mesh, uint32_t *a, uint32_t *b) {
 	XA_DEBUG_ASSERT(a != nullptr);
 	XA_DEBUG_ASSERT(b != nullptr);
 	const uint32_t vertexCount = mesh->vertexCount();
@@ -6612,10 +6056,9 @@ static bool findApproximateDiameterVertices(Mesh *mesh, uint32_t *a, uint32_t *b
 
 // From OpenNL LSCM example.
 // Computes the coordinates of the vertices of a triangle in a local 2D orthonormal basis of the triangle's plane.
-static void projectTriangle(Vector3 p0, Vector3 p1, Vector3 p2, Vector2 *z0, Vector2 *z1, Vector2 *z2, float epsilon)
-{
-	Vector3 X = normalize(p1 - p0, epsilon);
-	Vector3 Z = normalize(cross(X, p2 - p0), epsilon);
+static void projectTriangle(Vector3 p0, Vector3 p1, Vector3 p2, Vector2 *z0, Vector2 *z1, Vector2 *z2) {
+	Vector3 X = normalize(p1 - p0);
+	Vector3 Z = normalize(cross(X, p2 - p0));
 	Vector3 Y = cross(Z, X);
 	Vector3 &O = p0;
 	*z0 = Vector2(0, 0);
@@ -6623,8 +6066,83 @@ static void projectTriangle(Vector3 p0, Vector3 p1, Vector3 p2, Vector2 *z0, Vec
 	*z2 = Vector2(dot(p2 - O, X), dot(p2 - O, Y));
 }
 
-static bool computeLeastSquaresConformalMap(Mesh *mesh)
-{
+// Conformal relations from Brecht Van Lommel (based on ABF):
+
+static float vec_angle_cos(const Vector3 &v1, const Vector3 &v2, const Vector3 &v3) { // Cosine of the angle at v2 between the directions towards v1 and v3.
+	Vector3 d1 = v1 - v2;
+	Vector3 d2 = v3 - v2;
+	return clamp(dot(d1, d2) / (length(d1) * length(d2)), -1.0f, 1.0f); // Clamped to [-1, 1] so vec_angle can pass it to acosf. NOTE(review): no guard for a zero denominator when v1 or v3 coincides with v2 - confirm how clamp treats the result.
+}
+
+static float vec_angle(const Vector3 &v1, const Vector3 &v2, const Vector3 &v3) { // Angle at v2 (acosf of the clamped cosine from vec_angle_cos).
+	float dot = vec_angle_cos(v1, v2, v3); // The local named `dot` hides the dot() function inside this scope; it holds a cosine.
+	return acosf(dot);
+}
+
+static void triangle_angles(const Vector3 &v1, const Vector3 &v2, const Vector3 &v3, float *a1, float *a2, float *a3) { // Writes the triangle's angles: *a1 at v1, *a2 at v2, and *a3 as the remainder of kPi.
+	*a1 = vec_angle(v3, v1, v2); // Angle at v1.
+	*a2 = vec_angle(v1, v2, v3); // Angle at v2.
+	*a3 = kPi - *a2 - *a1; // Derived from the angle sum instead of a third acosf call.
+}
+
+static bool setup_abf_relations(opennl::NLContext *context, int id0, int id1, int id2, const Vector3 &p0, const Vector3 &p1, const Vector3 &p2) { // Emits two NL_ROW blocks (real and imaginary part) for one triangle, with coefficients built from its angles; returns false and emits nothing if any angle is exactly zero.
+	// @@ IC: Wouldn't it be more accurate to return cos and compute 1-cos^2?
+	// It does indeed seem to be a little bit more robust.
+	// @@ Need to revisit this more carefully!
+	float a0, a1, a2;
+	triangle_angles(p0, p1, p2, &a0, &a1, &a2);
+	if (a0 == 0.0f || a1 == 0.0f || a2 == 0.0f) // Exact comparison: only a literal zero angle is rejected.
+		return false;
+	float s0 = sinf(a0);
+	float s1 = sinf(a1);
+	float s2 = sinf(a2);
+	if (s1 > s0 && s1 > s2) { // s1 is strictly the largest sine: rotate so new (0,1,2) = old (2,0,1), moving it into slot 2.
+		swap(s1, s2);
+		swap(s0, s1);
+		swap(a1, a2);
+		swap(a0, a1);
+		swap(id1, id2);
+		swap(id0, id1);
+	} else if (s0 > s1 && s0 > s2) { // s0 is strictly the largest sine: rotate so new (0,1,2) = old (1,2,0), moving it into slot 2.
+		swap(s0, s2);
+		swap(s0, s1);
+		swap(a0, a2);
+		swap(a0, a1);
+		swap(id0, id2);
+		swap(id0, id1);
+	}
+	float c0 = cosf(a0);
+	float ratio = (s2 == 0.0f) ? 1.0f : s1 / s2; // Avoids dividing by an exactly-zero s2 by using 1 instead.
+	float cosine = c0 * ratio;
+	float sine = s0 * ratio;
+	// Note  : 2*id + 0 --> u
+	//         2*id + 1 --> v
+	int u0_id = 2 * id0 + 0;
+	int v0_id = 2 * id0 + 1;
+	int u1_id = 2 * id1 + 0;
+	int v1_id = 2 * id1 + 1;
+	int u2_id = 2 * id2 + 0;
+	int v2_id = 2 * id2 + 1;
+	// Real part
+	opennl::nlBegin(context, NL_ROW);
+	opennl::nlCoefficient(context, u0_id, cosine - 1.0f);
+	opennl::nlCoefficient(context, v0_id, -sine);
+	opennl::nlCoefficient(context, u1_id, -cosine);
+	opennl::nlCoefficient(context, v1_id, sine);
+	opennl::nlCoefficient(context, u2_id, 1);
+	opennl::nlEnd(context, NL_ROW);
+	// Imaginary part
+	opennl::nlBegin(context, NL_ROW);
+	opennl::nlCoefficient(context, u0_id, sine);
+	opennl::nlCoefficient(context, v0_id, cosine - 1.0f);
+	opennl::nlCoefficient(context, u1_id, -sine);
+	opennl::nlCoefficient(context, v1_id, -cosine);
+	opennl::nlCoefficient(context, v2_id, 1);
+	opennl::nlEnd(context, NL_ROW);
+	return true; // Both rows were emitted; the caller skips its projectTriangle fallback.
+}
+
+static bool computeLeastSquaresConformalMap(Mesh *mesh) {
 	uint32_t lockedVertex0, lockedVertex1;
 	if (!findApproximateDiameterVertices(mesh, &lockedVertex0, &lockedVertex1)) {
 		// Mesh has no boundaries.
@@ -6635,55 +6153,57 @@ static bool computeLeastSquaresConformalMap(Mesh *mesh)
 	opennl::nlSolverParameteri(context, NL_NB_VARIABLES, int(2 * vertexCount));
 	opennl::nlSolverParameteri(context, NL_MAX_ITERATIONS, int(5 * vertexCount));
 	opennl::nlBegin(context, NL_SYSTEM);
-	const Vector2 *texcoords = mesh->texcoords();
+	ArrayView<Vector2> texcoords = mesh->texcoords();
 	for (uint32_t i = 0; i < vertexCount; i++) {
 		opennl::nlSetVariable(context, 2 * i, texcoords[i].x);
 		opennl::nlSetVariable(context, 2 * i + 1, texcoords[i].y);
 		if (i == lockedVertex0 || i == lockedVertex1) {
 			opennl::nlLockVariable(context, 2 * i);
 			opennl::nlLockVariable(context, 2 * i + 1);
-		} 
+		}
 	}
 	opennl::nlBegin(context, NL_MATRIX);
 	const uint32_t faceCount = mesh->faceCount();
-	const Vector3 *positions = mesh->positions();
-	const uint32_t *indices = mesh->indices();
+	ConstArrayView<Vector3> positions = mesh->positions();
+	ConstArrayView<uint32_t> indices = mesh->indices();
 	for (uint32_t f = 0; f < faceCount; f++) {
 		const uint32_t v0 = indices[f * 3 + 0];
 		const uint32_t v1 = indices[f * 3 + 1];
 		const uint32_t v2 = indices[f * 3 + 2];
-		Vector2 z0, z1, z2;
-		projectTriangle(positions[v0], positions[v1], positions[v2], &z0, &z1, &z2, mesh->epsilon());
-		double a = z1.x - z0.x;
-		double b = z1.y - z0.y;
-		double c = z2.x - z0.x;
-		double d = z2.y - z0.y;
-		XA_DEBUG_ASSERT(b == 0.0);
-		// Note  : 2*id + 0 --> u
-		//         2*id + 1 --> v
-		uint32_t u0_id = 2 * v0;
-		uint32_t v0_id = 2 * v0 + 1;
-		uint32_t u1_id = 2 * v1;
-		uint32_t v1_id = 2 * v1 + 1;
-		uint32_t u2_id = 2 * v2;
-		uint32_t v2_id = 2 * v2 + 1;
-		// Note : b = 0
-		// Real part
-		opennl::nlBegin(context, NL_ROW);
-		opennl::nlCoefficient(context, u0_id, -a+c) ;
-		opennl::nlCoefficient(context, v0_id, b-d) ;
-		opennl::nlCoefficient(context, u1_id, -c) ;
-		opennl::nlCoefficient(context, v1_id, d) ;
-		opennl::nlCoefficient(context, u2_id, a);
-		opennl::nlEnd(context, NL_ROW);
-		// Imaginary part
-		opennl::nlBegin(context, NL_ROW);
-		opennl::nlCoefficient(context, u0_id, -b+d);
-		opennl::nlCoefficient(context, v0_id, -a+c);
-		opennl::nlCoefficient(context, u1_id, -d);
-		opennl::nlCoefficient(context, v1_id, -c);
-		opennl::nlCoefficient(context, v2_id, a);
-		opennl::nlEnd(context, NL_ROW);
+		if (!setup_abf_relations(context, v0, v1, v2, positions[v0], positions[v1], positions[v2])) {
+			Vector2 z0, z1, z2;
+			projectTriangle(positions[v0], positions[v1], positions[v2], &z0, &z1, &z2);
+			double a = z1.x - z0.x;
+			double b = z1.y - z0.y;
+			double c = z2.x - z0.x;
+			double d = z2.y - z0.y;
+			XA_DEBUG_ASSERT(b == 0.0);
+			// Note  : 2*id + 0 --> u
+			//         2*id + 1 --> v
+			uint32_t u0_id = 2 * v0;
+			uint32_t v0_id = 2 * v0 + 1;
+			uint32_t u1_id = 2 * v1;
+			uint32_t v1_id = 2 * v1 + 1;
+			uint32_t u2_id = 2 * v2;
+			uint32_t v2_id = 2 * v2 + 1;
+			// Note : b = 0
+			// Real part
+			opennl::nlBegin(context, NL_ROW);
+			opennl::nlCoefficient(context, u0_id, -a + c);
+			opennl::nlCoefficient(context, v0_id, b - d);
+			opennl::nlCoefficient(context, u1_id, -c);
+			opennl::nlCoefficient(context, v1_id, d);
+			opennl::nlCoefficient(context, u2_id, a);
+			opennl::nlEnd(context, NL_ROW);
+			// Imaginary part
+			opennl::nlBegin(context, NL_ROW);
+			opennl::nlCoefficient(context, u0_id, -b + d);
+			opennl::nlCoefficient(context, v0_id, -a + c);
+			opennl::nlCoefficient(context, u1_id, -d);
+			opennl::nlCoefficient(context, v1_id, -c);
+			opennl::nlCoefficient(context, v2_id, a);
+			opennl::nlEnd(context, NL_ROW);
+		}
 	}
 	opennl::nlEnd(context, NL_MATRIX);
 	opennl::nlEnd(context, NL_SYSTEM);
@@ -6694,7 +6214,7 @@ static bool computeLeastSquaresConformalMap(Mesh *mesh)
 	for (uint32_t i = 0; i < vertexCount; i++) {
 		const double u = opennl::nlGetVariable(context, 2 * i);
 		const double v = opennl::nlGetVariable(context, 2 * i + 1);
-		mesh->texcoord(i) = Vector2((float)u, (float)v);
+		texcoords[i] = Vector2((float)u, (float)v);
 		XA_DEBUG_ASSERT(!isNan(mesh->texcoord(i).x));
 		XA_DEBUG_ASSERT(!isNan(mesh->texcoord(i).y));
 	}
@@ -6702,30 +6222,26 @@ static bool computeLeastSquaresConformalMap(Mesh *mesh)
 	return true;
 }
 
-#if XA_RECOMPUTE_CHARTS
-struct PiecewiseParam
-{
-	void reset(const Mesh *mesh, uint32_t faceCount)
-	{
+struct PiecewiseParam {
+	void reset(const Mesh *mesh) {
 		m_mesh = mesh;
-		m_faceCount = faceCount;
+		const uint32_t faceCount = m_mesh->faceCount();
 		const uint32_t vertexCount = m_mesh->vertexCount();
 		m_texcoords.resize(vertexCount);
-		m_patch.reserve(m_faceCount);
-		m_candidates.reserve(m_faceCount);
-		m_faceInAnyPatch.resize(m_faceCount);
+		m_patch.reserve(faceCount);
+		m_candidates.reserve(faceCount);
+		m_faceInAnyPatch.resize(faceCount);
 		m_faceInAnyPatch.zeroOutMemory();
-		m_faceInvalid.resize(m_faceCount);
-		m_faceInPatch.resize(m_faceCount);
+		m_faceInvalid.resize(faceCount);
+		m_faceInPatch.resize(faceCount);
 		m_vertexInPatch.resize(vertexCount);
-		m_faceToCandidate.resize(m_faceCount);
+		m_faceToCandidate.resize(faceCount);
 	}
 
 	ConstArrayView<uint32_t> chartFaces() const { return m_patch; }
-	const Vector2 *texcoords() const { return m_texcoords.data(); }
+	ConstArrayView<Vector2> texcoords() const { return m_texcoords; }
 
-	bool computeChart()
-	{
+	bool computeChart() {
 		// Clear per-patch state.
 		m_patch.clear();
 		m_candidates.clear();
@@ -6734,8 +6250,9 @@ struct PiecewiseParam
 		m_faceInPatch.zeroOutMemory();
 		m_vertexInPatch.zeroOutMemory();
 		// Add the seed face (first unassigned face) to the patch.
+		const uint32_t faceCount = m_mesh->faceCount();
 		uint32_t seed = UINT32_MAX;
-		for (uint32_t f = 0; f < m_faceCount; f++) {
+		for (uint32_t f = 0; f < faceCount; f++) {
 			if (m_faceInAnyPatch.get(f))
 				continue;
 			seed = f;
@@ -6749,7 +6266,7 @@ struct PiecewiseParam
 			}
 			addFaceToPatch(seed);
 			// Initialize the boundary grid.
-			m_boundaryGrid.reset(m_texcoords.data(), m_mesh->indices());
+			m_boundaryGrid.reset(m_texcoords, m_mesh->indices());
 			for (Mesh::FaceEdgeIterator it(m_mesh, seed); !it.isDone(); it.advance())
 				m_boundaryGrid.append(it.edge());
 			break;
@@ -6793,22 +6310,34 @@ struct PiecewiseParam
 					break;
 				}
 			}
+			// Check for zero area and flipped faces (using area).
+			for (CandidateIterator it(bestCandidate); !it.isDone(); it.advance()) {
+				const Vector2 a = m_texcoords[m_mesh->vertexAt(it.current()->face * 3 + 0)];
+				const Vector2 b = m_texcoords[m_mesh->vertexAt(it.current()->face * 3 + 1)];
+				const Vector2 c = m_texcoords[m_mesh->vertexAt(it.current()->face * 3 + 2)];
+				const float area = triangleArea(a, b, c);
+				if (area <= 0.0f) {
+					invalid = true;
+					break;
+				}
+			}
 			// Check for boundary intersection.
 			if (!invalid) {
 				XA_PROFILE_START(parameterizeChartsPiecewiseBoundaryIntersection)
 				// Test candidate edges that would form part of the new patch boundary.
 				// Ignore boundary edges that would become internal if the candidate faces were added to the patch.
-				Array<uint32_t> newBoundaryEdges, ignoreEdges;
+				m_newBoundaryEdges.clear();
+				m_ignoreBoundaryEdges.clear();
 				for (CandidateIterator candidateIt(bestCandidate); !candidateIt.isDone(); candidateIt.advance()) {
 					for (Mesh::FaceEdgeIterator it(m_mesh, candidateIt.current()->face); !it.isDone(); it.advance()) {
 						const uint32_t oface = it.oppositeFace();
-						if (oface == UINT32_MAX || oface >= m_faceCount || !m_faceInPatch.get(oface))
-							newBoundaryEdges.push_back(it.edge());
-						if (oface != UINT32_MAX && oface < m_faceCount && m_faceInPatch.get(oface))
-							ignoreEdges.push_back(it.oppositeEdge());
+						if (oface == UINT32_MAX || !m_faceInPatch.get(oface))
+							m_newBoundaryEdges.push_back(it.edge());
+						if (oface != UINT32_MAX && m_faceInPatch.get(oface))
+							m_ignoreBoundaryEdges.push_back(it.oppositeEdge());
 					}
 				}
-				invalid = m_boundaryGrid.intersect(m_mesh->epsilon(), newBoundaryEdges, ignoreEdges);
+				invalid = m_boundaryGrid.intersect(m_mesh->epsilon(), m_newBoundaryEdges, m_ignoreBoundaryEdges);
 				XA_PROFILE_END(parameterizeChartsPiecewiseBoundaryIntersection)
 			}
 			if (invalid) {
@@ -6826,11 +6355,11 @@ struct PiecewiseParam
 				removeLinkedCandidates(bestCandidate);
 				// Reset the grid with all edges on the patch boundary.
 				XA_PROFILE_START(parameterizeChartsPiecewiseBoundaryIntersection)
-				m_boundaryGrid.reset(m_texcoords.data(), m_mesh->indices());
+				m_boundaryGrid.reset(m_texcoords, m_mesh->indices());
 				for (uint32_t i = 0; i < m_patch.size(); i++) {
 					for (Mesh::FaceEdgeIterator it(m_mesh, m_patch[i]); !it.isDone(); it.advance()) {
 						const uint32_t oface = it.oppositeFace();
-						if (oface == UINT32_MAX || oface >= m_faceCount || !m_faceInPatch.get(oface))
+						if (oface == UINT32_MAX || !m_faceInPatch.get(oface))
 							m_boundaryGrid.append(it.edge());
 					}
 				}
@@ -6841,8 +6370,7 @@ struct PiecewiseParam
 	}
 
 private:
-	struct Candidate
-	{
+	struct Candidate {
 		uint32_t face, vertex;
 		Candidate *prev, *next; // The previous/next candidate with the same vertex.
 		Vector2 position;
@@ -6852,10 +6380,14 @@ private:
 		float patchVertexOrient;
 	};
 
-	struct CandidateIterator
-	{
-		CandidateIterator(Candidate *head) : m_current(head) { XA_DEBUG_ASSERT(!head->prev); }
-		void advance() { if (m_current != nullptr) { m_current = m_current->next; } }
+	struct CandidateIterator {
+		CandidateIterator(Candidate *head) :
+				m_current(head) { XA_DEBUG_ASSERT(!head->prev); }
+		void advance() {
+			if (m_current != nullptr) {
+				m_current = m_current->next;
+			}
+		}
 		bool isDone() const { return !m_current; }
 		Candidate *current() { return m_current; }
 
@@ -6864,7 +6396,6 @@ private:
 	};
 
 	const Mesh *m_mesh;
-	uint32_t m_faceCount;
 	Array<Vector2> m_texcoords;
 	BitArray m_faceInAnyPatch; // Face is in a previous chart patch or the current patch.
 	Array<Candidate *> m_candidates; // Incident faces to the patch.
@@ -6873,9 +6404,9 @@ private:
 	BitArray m_faceInPatch, m_vertexInPatch; // Face/vertex is in the current patch.
 	BitArray m_faceInvalid; // Face cannot be added to the patch - flipped, cost too high or causes boundary intersection.
 	UniformGrid2 m_boundaryGrid;
+	Array<uint32_t> m_newBoundaryEdges, m_ignoreBoundaryEdges; // Temp arrays used when testing for boundary intersection.
 
-	void addFaceToPatch(uint32_t face)
-	{
+	void addFaceToPatch(uint32_t face) {
 		XA_DEBUG_ASSERT(!m_faceInPatch.get(face));
 		XA_DEBUG_ASSERT(!m_faceInAnyPatch.get(face));
 		m_patch.push_back(face);
@@ -6884,7 +6415,7 @@ private:
 		// Find new candidate faces on the patch incident to the newly added face.
 		for (Mesh::FaceEdgeIterator it(m_mesh, face); !it.isDone(); it.advance()) {
 			const uint32_t oface = it.oppositeFace();
-			if (oface == UINT32_MAX || oface >= m_faceCount || m_faceInAnyPatch.get(oface) || m_faceToCandidate[oface])
+			if (oface == UINT32_MAX || m_faceInAnyPatch.get(oface) || m_faceToCandidate[oface])
 				continue;
 			// Found an active edge on the patch front.
 			// Find the free vertex (the vertex that isn't on the active edge).
@@ -6900,12 +6431,14 @@ private:
 				}
 			}
 			XA_DEBUG_ASSERT(freeVertex != UINT32_MAX);
-			// If the free vertex is already in the patch, the face is enclosed by the patch. Add the face to the patch - don't need to assign texcoords.
-			/*if (m_vertexInPatch.get(freeVertex)) {
+			if (m_vertexInPatch.get(freeVertex)) {
+#if 0
+				// If the free vertex is already in the patch, the face is enclosed by the patch. Add the face to the patch - don't need to assign texcoords.
 				freeVertex = UINT32_MAX;
-				addFaceToPatch(oface, false);
+				addFaceToPatch(oface);
+#endif
 				continue;
-			}*/
+			}
 			// Check this here rather than above so faces enclosed by the patch are always added.
 			if (m_faceInvalid.get(oface))
 				continue;
@@ -6913,8 +6446,7 @@ private:
 		}
 	}
 
-	void addCandidateFace(uint32_t patchEdge, float patchVertexOrient, uint32_t face, uint32_t edge, uint32_t freeVertex)
-	{
+	void addCandidateFace(uint32_t patchEdge, float patchVertexOrient, uint32_t face, uint32_t edge, uint32_t freeVertex) {
 		XA_DEBUG_ASSERT(!m_faceToCandidate[face]);
 		Vector2 texcoords[3];
 		orthoProjectFace(face, texcoords);
@@ -6960,8 +6492,10 @@ private:
 			uv.x = x + texcoords[localVertex0].x;
 			uv.y = y + texcoords[localVertex0].y;
 		}
-		if (isNan(texcoords[localFreeVertex].x) || isNan(texcoords[localFreeVertex].y))
+		if (isNan(texcoords[localFreeVertex].x) || isNan(texcoords[localFreeVertex].y)) {
+			m_faceInvalid.set(face);
 			return;
+		}
 		// Check for local overlap (flipped triangle).
 		// The patch face vertex that isn't on the active edge and the free vertex should be oriented on opposite sides to the active edge.
 		const float freeVertexOrient = orientToEdge(m_texcoords[vertex0], m_texcoords[vertex1], texcoords[localFreeVertex]);
@@ -6975,12 +6509,10 @@ private:
 			return;
 		}
 		const float cost = fabsf(stretch - 1.0f);
-#if 0
-		if (cost > 0.25f) {
+		if (cost > 0.5f) {
 			m_faceInvalid.set(face);
 			return;
 		}
-#endif
 		// Add the candidate.
 		Candidate *candidate = XA_ALLOC(MemTag::Default, Candidate);
 		candidate->face = face;
@@ -7017,8 +6549,7 @@ private:
 			it.current()->maxCost = maxCost;
 	}
 
-	Candidate *linkedCandidateHead(Candidate *candidate)
-	{
+	Candidate *linkedCandidateHead(Candidate *candidate) {
 		Candidate *current = candidate;
 		for (;;) {
 			if (!current->prev)
@@ -7028,8 +6559,7 @@ private:
 		return current;
 	}
 
-	void removeLinkedCandidates(Candidate *head)
-	{
+	void removeLinkedCandidates(Candidate *head) {
 		XA_DEBUG_ASSERT(!head->prev);
 		Candidate *current = head;
 		while (current) {
@@ -7046,10 +6576,9 @@ private:
 		}
 	}
 
-	void orthoProjectFace(uint32_t face, Vector2 *texcoords) const
-	{
-		const Vector3 normal = m_mesh->computeFaceNormal(face);
-		const Vector3 tangent = normalize(m_mesh->position(m_mesh->vertexAt(face * 3 + 1)) - m_mesh->position(m_mesh->vertexAt(face * 3 + 0)), kEpsilon);
+	void orthoProjectFace(uint32_t face, Vector2 *texcoords) const {
+		const Vector3 normal = -m_mesh->computeFaceNormal(face);
+		const Vector3 tangent = normalize(m_mesh->position(m_mesh->vertexAt(face * 3 + 1)) - m_mesh->position(m_mesh->vertexAt(face * 3 + 0)));
 		const Vector3 bitangent = cross(normal, tangent);
 		for (uint32_t i = 0; i < 3; i++) {
 			const Vector3 &pos = m_mesh->position(m_mesh->vertexAt(face * 3 + i));
@@ -7057,16 +6586,14 @@ private:
 		}
 	}
 
-	float parametricArea(const Vector2 *texcoords) const
-	{
+	float parametricArea(const Vector2 *texcoords) const {
 		const Vector2 &v1 = texcoords[0];
 		const Vector2 &v2 = texcoords[1];
 		const Vector2 &v3 = texcoords[2];
 		return ((v2.x - v1.x) * (v3.y - v1.y) - (v3.x - v1.x) * (v2.y - v1.y)) * 0.5f;
 	}
 
-	float computeStretch(Vector3 p1, Vector3 p2, Vector3 p3, Vector2 t1, Vector2 t2, Vector2 t3) const
-	{
+	float computeStretch(Vector3 p1, Vector3 p2, Vector3 p3, Vector2 t1, Vector2 t2, Vector2 t3) const {
 		float parametricArea = ((t2.y - t1.y) * (t3.x - t1.x) - (t3.y - t1.y) * (t2.x - t1.x)) * 0.5f;
 		if (isZero(parametricArea, kAreaEpsilon))
 			return FLT_MAX;
@@ -7080,16 +6607,13 @@ private:
 	}
 
 	// Return value is positive if the point is one side of the edge, negative if on the other side.
-	float orientToEdge(Vector2 edgeVertex0, Vector2 edgeVertex1, Vector2 point) const
-	{
+	float orientToEdge(Vector2 edgeVertex0, Vector2 edgeVertex1, Vector2 point) const {
 		return (edgeVertex0.x - point.x) * (edgeVertex1.y - point.y) - (edgeVertex0.y - point.y) * (edgeVertex1.x - point.x);
 	}
 };
-#endif
 
 // Estimate quality of existing parameterization.
-struct Quality
-{
+struct Quality {
 	// computeBoundaryIntersection
 	bool boundaryIntersection = false;
 
@@ -7106,8 +6630,7 @@ struct Quality
 	float conformalMetric = 0.0f;
 	float authalicMetric = 0.0f;
 
-	void computeBoundaryIntersection(const Mesh *mesh, UniformGrid2 &boundaryGrid)
-	{
+	void computeBoundaryIntersection(const Mesh *mesh, UniformGrid2 &boundaryGrid) {
 		const Array<uint32_t> &boundaryEdges = mesh->boundaryEdges();
 		const uint32_t boundaryEdgeCount = boundaryEdges.size();
 		boundaryGrid.reset(mesh->texcoords(), mesh->indices(), boundaryEdgeCount);
@@ -7123,11 +6646,11 @@ struct Quality
 #endif
 	}
 
-	void computeFlippedFaces(const Mesh *mesh, uint32_t faceCount, Array<uint32_t> *flippedFaces)
-	{
+	void computeFlippedFaces(const Mesh *mesh, Array<uint32_t> *flippedFaces) {
 		totalTriangleCount = flippedTriangleCount = zeroAreaTriangleCount = 0;
 		if (flippedFaces)
 			flippedFaces->clear();
+		const uint32_t faceCount = mesh->faceCount();
 		for (uint32_t f = 0; f < faceCount; f++) {
 			Vector2 texcoord[3];
 			for (int i = 0; i < 3; i++) {
@@ -7159,8 +6682,7 @@ struct Quality
 				flippedFaces->clear();
 			flippedTriangleCount = 0;
 		}
-		if (flippedTriangleCount > totalTriangleCount / 2)
-		{
+		if (flippedTriangleCount > totalTriangleCount / 2) {
 			// If more than half the triangles are flipped, reverse the flipped / not flipped classification.
 			flippedTriangleCount = totalTriangleCount - flippedTriangleCount;
 			if (flippedFaces) {
@@ -7182,10 +6704,10 @@ struct Quality
 		}
 	}
 
-	void computeMetrics(const Mesh *mesh, uint32_t faceCount)
-	{
+	void computeMetrics(const Mesh *mesh) {
 		totalGeometricArea = totalParametricArea = 0.0f;
 		stretchMetric = maxStretchMetric = conformalMetric = authalicMetric = 0.0f;
+		const uint32_t faceCount = mesh->faceCount();
 		for (uint32_t f = 0; f < faceCount; f++) {
 			Vector3 pos[3];
 			Vector2 texcoord[3];
@@ -7214,7 +6736,7 @@ struct Quality
 			const float a = dot(Ss, Ss); // E
 			const float b = dot(Ss, St); // F
 			const float c = dot(St, St); // G
-										 // Compute eigen-values of the first fundamental form:
+					// Compute eigen-values of the first fundamental form:
 			const float sigma1 = sqrtf(0.5f * max(0.0f, a + c - sqrtf(square(a - c) + 4 * square(b)))); // gamma uppercase, min eigenvalue.
 			const float sigma2 = sqrtf(0.5f * max(0.0f, a + c + sqrtf(square(a - c) + 4 * square(b)))); // gamma lowercase, max eigenvalue.
 			XA_ASSERT(sigma2 > sigma1 || equal(sigma1, sigma2, kEpsilon));
@@ -7245,347 +6767,261 @@ struct Quality
 		if (totalGeometricArea > 0.0f) {
 			const float normFactor = sqrtf(totalParametricArea / totalGeometricArea);
 			stretchMetric = sqrtf(stretchMetric / totalGeometricArea) * normFactor;
-			maxStretchMetric  *= normFactor;
+			maxStretchMetric *= normFactor;
 			conformalMetric = sqrtf(conformalMetric / totalGeometricArea);
 			authalicMetric = sqrtf(authalicMetric / totalGeometricArea);
 		}
 	}
 };
 
-struct ChartWarningFlags
-{
-	enum Enum
-	{
-		CloseHolesFailed = 1<<1,
-		FixTJunctionsDuplicatedEdge = 1<<2,
-		FixTJunctionsFailed = 1<<3,
-		TriangulateDuplicatedEdge = 1<<4,
-	};
-};
-
-struct ChartCtorBuffers
-{
+struct ChartCtorBuffers {
 	Array<uint32_t> chartMeshIndices;
 	Array<uint32_t> unifiedMeshIndices;
-	Array<uint32_t> boundaryLoops;
 };
 
-class Chart
-{
+class Chart {
 public:
-	Chart(ChartCtorBuffers &buffers, const ParameterizeOptions &options, const Basis &basis, ConstArrayView<uint32_t> faces, const Mesh *sourceMesh, uint32_t chartGroupId, uint32_t chartId) : m_basis(basis), m_mesh(nullptr), m_unifiedMesh(nullptr), m_unmodifiedUnifiedMesh(nullptr), m_type(ChartType::LSCM), m_warningFlags(0), m_closedHolesCount(0), m_fixedTJunctionsCount(0), m_isInvalid(false)
-	{
+	Chart(const Basis &basis, segment::ChartGeneratorType::Enum generatorType, ConstArrayView<uint32_t> faces, const Mesh *sourceMesh, uint32_t chartGroupId, uint32_t chartId) :
+			m_basis(basis), m_unifiedMesh(nullptr), m_type(ChartType::LSCM), m_generatorType(generatorType), m_tjunctionCount(0), m_originalVertexCount(0), m_isInvalid(false) {
 		XA_UNUSED(chartGroupId);
 		XA_UNUSED(chartId);
 		m_faceToSourceFaceMap.copyFrom(faces.data, faces.length);
 		const uint32_t approxVertexCount = min(faces.length * 3, sourceMesh->vertexCount());
-		m_mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, sourceMesh->epsilon(), approxVertexCount, faces.length);
 		m_unifiedMesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, sourceMesh->epsilon(), approxVertexCount, faces.length);
 		HashMap<uint32_t, PassthroughHash<uint32_t>> sourceVertexToUnifiedVertexMap(MemTag::Mesh, approxVertexCount), sourceVertexToChartVertexMap(MemTag::Mesh, approxVertexCount);
-		// Add vertices.
-		const uint32_t faceCount = m_initialFaceCount = faces.length;
+		m_originalIndices.resize(faces.length * 3);
+		// Add geometry.
+		const uint32_t faceCount = faces.length;
 		for (uint32_t f = 0; f < faceCount; f++) {
+			uint32_t unifiedIndices[3];
 			for (uint32_t i = 0; i < 3; i++) {
 				const uint32_t sourceVertex = sourceMesh->vertexAt(m_faceToSourceFaceMap[f] * 3 + i);
-				const uint32_t sourceUnifiedVertex = sourceMesh->firstColocal(sourceVertex);
+				uint32_t sourceUnifiedVertex = sourceMesh->firstColocalVertex(sourceVertex);
+				if (m_generatorType == segment::ChartGeneratorType::OriginalUv && sourceVertex != sourceUnifiedVertex) {
+					// Original UVs: don't unify vertices with different UVs; we want to preserve UVs.
+					if (!equal(sourceMesh->texcoord(sourceVertex), sourceMesh->texcoord(sourceUnifiedVertex), sourceMesh->epsilon()))
+						sourceUnifiedVertex = sourceVertex;
+				}
 				uint32_t unifiedVertex = sourceVertexToUnifiedVertexMap.get(sourceUnifiedVertex);
 				if (unifiedVertex == UINT32_MAX) {
 					unifiedVertex = sourceVertexToUnifiedVertexMap.add(sourceUnifiedVertex);
-					m_unifiedMesh->addVertex(sourceMesh->position(sourceVertex));
+					m_unifiedMesh->addVertex(sourceMesh->position(sourceVertex), Vector3(0.0f), sourceMesh->texcoord(sourceVertex));
 				}
 				if (sourceVertexToChartVertexMap.get(sourceVertex) == UINT32_MAX) {
 					sourceVertexToChartVertexMap.add(sourceVertex);
 					m_vertexToSourceVertexMap.push_back(sourceVertex);
 					m_chartVertexToUnifiedVertexMap.push_back(unifiedVertex);
-					m_mesh->addVertex(sourceMesh->position(sourceVertex), Vector3(0.0f), sourceMesh->texcoord(sourceVertex));
+					m_originalVertexCount++;
 				}
-			}
-		}
-		// Add faces.
-		for (uint32_t f = 0; f < faceCount; f++) {
-			uint32_t indices[3], unifiedIndices[3];
-			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t sourceVertex = sourceMesh->vertexAt(m_faceToSourceFaceMap[f] * 3 + i);
-				const uint32_t sourceUnifiedVertex = sourceMesh->firstColocal(sourceVertex);
-				indices[i] = sourceVertexToChartVertexMap.get(sourceVertex);
-				XA_DEBUG_ASSERT(indices[i] != UINT32_MAX);
+				m_originalIndices[f * 3 + i] = sourceVertexToChartVertexMap.get(sourceVertex);
+				;
+				XA_DEBUG_ASSERT(m_originalIndices[f * 3 + i] != UINT32_MAX);
 				unifiedIndices[i] = sourceVertexToUnifiedVertexMap.get(sourceUnifiedVertex);
 				XA_DEBUG_ASSERT(unifiedIndices[i] != UINT32_MAX);
 			}
-			Mesh::AddFaceResult::Enum result = m_mesh->addFace(indices);
-			XA_UNUSED(result);
-			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
-#if XA_DEBUG
-			// Unifying colocals may create degenerate edges. e.g. if two triangle vertices are colocal.
-			for (int i = 0; i < 3; i++) {
-				const uint32_t index1 = unifiedIndices[i];
-				const uint32_t index2 = unifiedIndices[(i + 1) % 3];
-				XA_DEBUG_ASSERT(index1 != index2);
-			}
-#endif
-			result = m_unifiedMesh->addFace(unifiedIndices);
-			XA_UNUSED(result);
-			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
+			m_unifiedMesh->addFace(unifiedIndices);
 		}
-		m_mesh->createBoundaries(); // For AtlasPacker::computeBoundingBox
-		m_mesh->destroyEdgeMap(); // Only needed it for createBoundaries.
 		m_unifiedMesh->createBoundaries();
-		if (meshIsPlanar(*m_unifiedMesh)) {
+		if (m_generatorType == segment::ChartGeneratorType::Planar) {
 			m_type = ChartType::Planar;
 			return;
 		}
-		m_unifiedMesh->linkBoundaries();
-#if XA_DEBUG_EXPORT_OBJ_BEFORE_FIX_TJUNCTION
-		m_unifiedMesh->writeObjFile("debug_before_fix_tjunction.obj");
-#endif
-		bool duplicatedEdge = false, failed = false;
-		if (options.fixTJunctions) {
-			XA_PROFILE_START(fixChartMeshTJunctions)
-			Mesh *fixedUnifiedMesh = meshFixTJunctions(*m_unifiedMesh, &duplicatedEdge, &failed, &m_fixedTJunctionsCount);
-			XA_PROFILE_END(fixChartMeshTJunctions)
-			if (fixedUnifiedMesh) {
-				if (duplicatedEdge)
-					m_warningFlags |= ChartWarningFlags::FixTJunctionsDuplicatedEdge;
-				if (failed)
-					m_warningFlags |= ChartWarningFlags::FixTJunctionsFailed;
-				m_unmodifiedUnifiedMesh = m_unifiedMesh;
-				m_unifiedMesh = fixedUnifiedMesh;
-				m_unifiedMesh->createBoundaries();
-				m_unifiedMesh->linkBoundaries();
-				m_initialFaceCount = m_unifiedMesh->faceCount(); // Fixing t-junctions rewrites faces.
-			}
-		}
-		if (options.closeHoles) {
-			// See if there are any holes that need closing.
-			Array<uint32_t> &boundaryLoops = buffers.boundaryLoops;
-			meshGetBoundaryLoops(*m_unifiedMesh, boundaryLoops);
-			if (boundaryLoops.size() > 1) {
-#if XA_DEBUG_EXPORT_OBJ_CLOSE_HOLES_ERROR
-				const uint32_t faceCountBeforeHolesClosed = m_unifiedMesh->faceCount();
+#if XA_CHECK_T_JUNCTIONS
+		m_tjunctionCount = meshCheckTJunctions(*m_unifiedMesh);
+#if XA_DEBUG_EXPORT_OBJ_TJUNCTION
+		if (m_tjunctionCount > 0) {
+			char filename[256];
+			XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_chart_%03u_tjunction.obj", sourceMesh->id(), chartGroupId, chartId);
+			m_unifiedMesh->writeObjFile(filename);
+		}
 #endif
-				// Closing the holes is not always the best solution and does not fix all the problems.
-				// We need to do some analysis of the holes and the genus to:
-				// - Find cuts that reduce genus.
-				// - Find cuts to connect holes.
-				// - Use minimal spanning trees or seamster.
-				XA_PROFILE_START(closeChartMeshHoles)
-				uint32_t holeCount = 0;
-#if XA_DEBUG_EXPORT_OBJ_CLOSE_HOLES_ERROR
-				Array<uint32_t> holeFaceCounts;
-				failed = !meshCloseHoles(m_unifiedMesh, boundaryLoops, m_basis.normal, &holeFaceCounts);
-#else
-				failed = !meshCloseHoles(m_unifiedMesh, boundaryLoops, m_basis.normal, &holeCount, nullptr);
 #endif
-				XA_PROFILE_END(closeChartMeshHoles)
-				m_unifiedMesh->createBoundaries();
-				m_unifiedMesh->linkBoundaries();
-				meshGetBoundaryLoops(*m_unifiedMesh, boundaryLoops);
-				if (failed || boundaryLoops.size() > 1)
-					m_warningFlags |= ChartWarningFlags::CloseHolesFailed;
-				m_closedHolesCount = holeCount;
-#if XA_DEBUG_EXPORT_OBJ_CLOSE_HOLES_ERROR
-				if (m_warningFlags & ChartWarningFlags::CloseHolesFailed) {
-					char filename[256];
-					XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_chart_%03u_close_holes_error.obj", sourceMesh->id(), chartGroupId, chartId);
-					FILE *file;
-					XA_FOPEN(file, filename, "w");
-					if (file) {
-						m_unifiedMesh->writeObjVertices(file);
-						fprintf(file, "s off\n");
-						fprintf(file, "o object\n");
-						for (uint32_t i = 0; i < faceCountBeforeHolesClosed; i++)
-							m_unifiedMesh->writeObjFace(file, i);
-						uint32_t face = faceCountBeforeHolesClosed;
-						for (uint32_t i = 0; i < holeFaceCounts.size(); i++) {
-							fprintf(file, "s off\n");
-							fprintf(file, "o hole%u\n", i);
-							for (uint32_t j = 0; j < holeFaceCounts[i]; j++) {
-								m_unifiedMesh->writeObjFace(file, face);
-								face++;
-							}
-						}
-						m_unifiedMesh->writeObjBoundaryEges(file);
-						m_unifiedMesh->writeObjLinkedBoundaries(file);
-						fclose(file);
-					}
-				}
-#endif
-			}
-		}
 	}
 
-#if XA_RECOMPUTE_CHARTS
-	Chart(ChartCtorBuffers &buffers, const Chart *parent, const Mesh *parentMesh, ConstArrayView<uint32_t> faces, const Vector2 *texcoords, const Mesh *sourceMesh) : m_mesh(nullptr), m_unifiedMesh(nullptr), m_unmodifiedUnifiedMesh(nullptr), m_type(ChartType::Piecewise), m_warningFlags(0), m_closedHolesCount(0), m_fixedTJunctionsCount(0), m_isInvalid(false)
-	{
-		const uint32_t faceCount = m_initialFaceCount = faces.length;
+	Chart(ChartCtorBuffers &buffers, const Chart *parent, const Mesh *parentMesh, ConstArrayView<uint32_t> faces, ConstArrayView<Vector2> texcoords, const Mesh *sourceMesh) :
+			m_unifiedMesh(nullptr), m_type(ChartType::Piecewise), m_generatorType(segment::ChartGeneratorType::Piecewise), m_tjunctionCount(0), m_originalVertexCount(0), m_isInvalid(false) {
+		const uint32_t faceCount = faces.length;
 		m_faceToSourceFaceMap.resize(faceCount);
 		for (uint32_t i = 0; i < faceCount; i++)
 			m_faceToSourceFaceMap[i] = parent->m_faceToSourceFaceMap[faces[i]]; // Map faces to parent chart source mesh.
 		// Copy face indices.
-		m_mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, sourceMesh->epsilon(), m_faceToSourceFaceMap.size() * 3, m_faceToSourceFaceMap.size());
 		Array<uint32_t> &chartMeshIndices = buffers.chartMeshIndices;
 		chartMeshIndices.resize(sourceMesh->vertexCount());
 		chartMeshIndices.fillBytes(0xff);
+		m_unifiedMesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, sourceMesh->epsilon(), m_faceToSourceFaceMap.size() * 3, m_faceToSourceFaceMap.size());
+		HashMap<uint32_t, PassthroughHash<uint32_t>> sourceVertexToUnifiedVertexMap(MemTag::Mesh, m_faceToSourceFaceMap.size() * 3);
 		// Add vertices.
 		for (uint32_t f = 0; f < faceCount; f++) {
 			for (uint32_t i = 0; i < 3; i++) {
 				const uint32_t vertex = sourceMesh->vertexAt(m_faceToSourceFaceMap[f] * 3 + i);
+				const uint32_t sourceUnifiedVertex = sourceMesh->firstColocalVertex(vertex);
 				const uint32_t parentVertex = parentMesh->vertexAt(faces[f] * 3 + i);
-				if (chartMeshIndices[vertex] == (uint32_t)~0) {
-					chartMeshIndices[vertex] = m_mesh->vertexCount();
+				uint32_t unifiedVertex = sourceVertexToUnifiedVertexMap.get(sourceUnifiedVertex);
+				if (unifiedVertex == UINT32_MAX) {
+					unifiedVertex = sourceVertexToUnifiedVertexMap.add(sourceUnifiedVertex);
+					m_unifiedMesh->addVertex(sourceMesh->position(vertex), Vector3(0.0f), texcoords[parentVertex]);
+				}
+				if (chartMeshIndices[vertex] == UINT32_MAX) {
+					chartMeshIndices[vertex] = m_originalVertexCount;
+					m_originalVertexCount++;
 					m_vertexToSourceVertexMap.push_back(vertex);
-					m_mesh->addVertex(sourceMesh->position(vertex), Vector3(0.0f), texcoords[parentVertex]);
+					m_chartVertexToUnifiedVertexMap.push_back(unifiedVertex);
 				}
 			}
 		}
 		// Add faces.
+		m_originalIndices.resize(faceCount * 3);
 		for (uint32_t f = 0; f < faceCount; f++) {
-			uint32_t indices[3];
+			uint32_t unifiedIndices[3];
 			for (uint32_t i = 0; i < 3; i++) {
 				const uint32_t vertex = sourceMesh->vertexAt(m_faceToSourceFaceMap[f] * 3 + i);
-				indices[i] = chartMeshIndices[vertex];
+				m_originalIndices[f * 3 + i] = chartMeshIndices[vertex];
+				const uint32_t unifiedVertex = sourceMesh->firstColocalVertex(vertex);
+				unifiedIndices[i] = sourceVertexToUnifiedVertexMap.get(unifiedVertex);
 			}
-			Mesh::AddFaceResult::Enum result = m_mesh->addFace(indices);
-			XA_UNUSED(result);
-			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
+			m_unifiedMesh->addFace(unifiedIndices);
 		}
-		m_mesh->createBoundaries(); // For AtlasPacker::computeBoundingBox
-		m_mesh->destroyEdgeMap(); // Only needed it for createBoundaries.
+		m_unifiedMesh->createBoundaries();
 		// Need to store texcoords for backup/restore so packing can be run multiple times.
 		backupTexcoords();
 	}
-#endif
 
-	~Chart()
-	{
-		if (m_mesh) {
-			m_mesh->~Mesh();
-			XA_FREE(m_mesh);
+	~Chart() {
+		if (m_unifiedMesh) {
+			m_unifiedMesh->~Mesh();
+			XA_FREE(m_unifiedMesh);
+			m_unifiedMesh = nullptr;
 		}
-		destroyUnifiedMesh();
 	}
 
 	bool isInvalid() const { return m_isInvalid; }
-	ChartType::Enum type() const { return m_type; }
-	uint32_t warningFlags() const { return m_warningFlags; }
-	uint32_t closedHolesCount() const { return m_closedHolesCount; }
-	uint32_t fixedTJunctionsCount() const { return m_fixedTJunctionsCount; }
+	ChartType type() const { return m_type; }
+	segment::ChartGeneratorType::Enum generatorType() const { return m_generatorType; }
+	uint32_t tjunctionCount() const { return m_tjunctionCount; }
 	const Quality &quality() const { return m_quality; }
-	uint32_t initialFaceCount() const { return m_initialFaceCount; }
 #if XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION
 	const Array<uint32_t> &paramFlippedFaces() const { return m_paramFlippedFaces; }
 #endif
 	uint32_t mapFaceToSourceFace(uint32_t i) const { return m_faceToSourceFaceMap[i]; }
 	uint32_t mapChartVertexToSourceVertex(uint32_t i) const { return m_vertexToSourceVertexMap[i]; }
-	const Mesh *mesh() const { return m_mesh; }
-	Mesh *mesh() { return m_mesh; }
 	const Mesh *unifiedMesh() const { return m_unifiedMesh; }
-	const Mesh *unmodifiedUnifiedMesh() const { return m_unmodifiedUnifiedMesh; }
+	Mesh *unifiedMesh() { return m_unifiedMesh; }
 
-	void parameterize(const ParameterizeOptions &options, UniformGrid2 &boundaryGrid)
-	{
-		XA_PROFILE_START(parameterizeChartsOrthogonal)
-		{
+	// Vertex count of the chart mesh before unifying vertices.
+	uint32_t originalVertexCount() const { return m_originalVertexCount; }
+
+	uint32_t originalVertexToUnifiedVertex(uint32_t v) const { return m_chartVertexToUnifiedVertexMap[v]; } // Maps an original (pre-unification) chart vertex to its unified mesh vertex.
+
+	ConstArrayView<uint32_t> originalVertices() const { return m_originalIndices; } // Despite the name, this is the per-face index list (3 entries per face) into the original chart vertices.
+
+	void parameterize(const ChartOptions &options, UniformGrid2 &boundaryGrid) {
+		const uint32_t unifiedVertexCount = m_unifiedMesh->vertexCount();
+		if (m_generatorType == segment::ChartGeneratorType::OriginalUv) {
+		} else {
 			// Project vertices to plane.
-			const uint32_t vertexCount = m_unifiedMesh->vertexCount();
-			for (uint32_t i = 0; i < vertexCount; i++)
+			XA_PROFILE_START(parameterizeChartsOrthogonal)
+			for (uint32_t i = 0; i < unifiedVertexCount; i++)
 				m_unifiedMesh->texcoord(i) = Vector2(dot(m_basis.tangent, m_unifiedMesh->position(i)), dot(m_basis.bitangent, m_unifiedMesh->position(i)));
-		}
-		XA_PROFILE_END(parameterizeChartsOrthogonal)
-		// Computing charts checks for flipped triangles and boundary intersection. Don't need to do that again here if chart is planar.
-		if (m_type != ChartType::Planar) {
-			XA_PROFILE_START(parameterizeChartsEvaluateQuality)
-			m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
-			m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, nullptr);
-			m_quality.computeMetrics(m_unifiedMesh, m_initialFaceCount);
-			XA_PROFILE_END(parameterizeChartsEvaluateQuality)
-			// Use orthogonal parameterization if quality is acceptable.
-			if (!m_quality.boundaryIntersection && m_quality.flippedTriangleCount == 0 && m_quality.totalGeometricArea > 0.0f && m_quality.stretchMetric <= 1.1f && m_quality.maxStretchMetric <= 1.25f)
-				m_type = ChartType::Ortho;
-		}
-		if (m_type == ChartType::LSCM) {
-			XA_PROFILE_START(parameterizeChartsLSCM)
-			if (options.func) {
-				options.func(&m_unifiedMesh->position(0).x, &m_unifiedMesh->texcoord(0).x, m_unifiedMesh->vertexCount(), m_unifiedMesh->indices(), m_unifiedMesh->indexCount());
-			}
-			else
-				computeLeastSquaresConformalMap(m_unifiedMesh);
-			XA_PROFILE_END(parameterizeChartsLSCM)
-			XA_PROFILE_START(parameterizeChartsEvaluateQuality)
-			m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
+			XA_PROFILE_END(parameterizeChartsOrthogonal)
+			// Computing charts checks for flipped triangles and boundary intersection. Don't need to do that again here if chart is planar.
+			if (m_type != ChartType::Planar && m_generatorType != segment::ChartGeneratorType::OriginalUv) {
+				XA_PROFILE_START(parameterizeChartsEvaluateQuality)
+				m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
+				m_quality.computeFlippedFaces(m_unifiedMesh, nullptr);
+				m_quality.computeMetrics(m_unifiedMesh);
+				XA_PROFILE_END(parameterizeChartsEvaluateQuality)
+				// Use orthogonal parameterization if quality is acceptable.
+				if (!m_quality.boundaryIntersection && m_quality.flippedTriangleCount == 0 && m_quality.zeroAreaTriangleCount == 0 && m_quality.totalGeometricArea > 0.0f && m_quality.stretchMetric <= 1.1f && m_quality.maxStretchMetric <= 1.25f)
+					m_type = ChartType::Ortho;
+			}
+			if (m_type == ChartType::LSCM) {
+				XA_PROFILE_START(parameterizeChartsLSCM)
+				if (options.paramFunc) {
+					options.paramFunc(&m_unifiedMesh->position(0).x, &m_unifiedMesh->texcoord(0).x, m_unifiedMesh->vertexCount(), m_unifiedMesh->indices().data, m_unifiedMesh->indexCount());
+				} else
+					computeLeastSquaresConformalMap(m_unifiedMesh);
+				XA_PROFILE_END(parameterizeChartsLSCM)
+				XA_PROFILE_START(parameterizeChartsEvaluateQuality)
+				m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
 #if XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION
-			m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, &m_paramFlippedFaces);
+				m_quality.computeFlippedFaces(m_unifiedMesh, &m_paramFlippedFaces);
 #else
-			m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, nullptr);
+				m_quality.computeFlippedFaces(m_unifiedMesh, nullptr);
 #endif
-			// Don't need to call computeMetrics here, that's only used in evaluateOrthoQuality to determine if quality is acceptable enough to use ortho projection.
-			if (m_quality.boundaryIntersection || m_quality.flippedTriangleCount > 0)
-				m_isInvalid = true;
-			XA_PROFILE_END(parameterizeChartsEvaluateQuality)
+				// Don't need to call computeMetrics here, that's only used in evaluateOrthoQuality to determine if quality is acceptable enough to use ortho projection.
+				if (m_quality.boundaryIntersection || m_quality.flippedTriangleCount > 0 || m_quality.zeroAreaTriangleCount > 0)
+					m_isInvalid = true;
+				XA_PROFILE_END(parameterizeChartsEvaluateQuality)
+			}
 		}
+		if (options.fixWinding && m_unifiedMesh->computeFaceParametricArea(0) < 0.0f) {
+			for (uint32_t i = 0; i < unifiedVertexCount; i++)
+				m_unifiedMesh->texcoord(i).x *= -1.0f;
+		}
+#if XA_CHECK_PARAM_WINDING
+		const uint32_t faceCount = m_unifiedMesh->faceCount();
+		uint32_t flippedCount = 0;
+		for (uint32_t i = 0; i < faceCount; i++) {
+			const float area = m_unifiedMesh->computeFaceParametricArea(i);
+			if (area < 0.0f)
+				flippedCount++;
+		}
+		if (flippedCount == faceCount) {
+			XA_PRINT_WARNING("param: all faces flipped\n");
+		} else if (flippedCount > 0) {
+			XA_PRINT_WARNING("param: %u / %u faces flipped\n", flippedCount, faceCount);
+		}
+#endif
+
 #if XA_DEBUG_ALL_CHARTS_INVALID
 		m_isInvalid = true;
 #endif
-		// Transfer parameterization from unified mesh to chart mesh.
-		const uint32_t vertexCount = m_mesh->vertexCount();
-		for (uint32_t v = 0; v < vertexCount; v++)
-			m_mesh->texcoord(v) = m_unifiedMesh->texcoord(m_chartVertexToUnifiedVertexMap[v]);
-		// Can destroy unified mesh now.
-		// But not if the parameterization is invalid, the unified mesh will be needed for PiecewiseParameterization.
-		if (!m_isInvalid)
-			destroyUnifiedMesh();
 		// Need to store texcoords for backup/restore so packing can be run multiple times.
 		backupTexcoords();
 	}
 
-	Vector2 computeParametricBounds() const
-	{
+	Vector2 computeParametricBounds() const {
 		Vector2 minCorner(FLT_MAX, FLT_MAX);
 		Vector2 maxCorner(-FLT_MAX, -FLT_MAX);
-		const uint32_t vertexCount = m_mesh->vertexCount();
+		const uint32_t vertexCount = m_unifiedMesh->vertexCount();
 		for (uint32_t v = 0; v < vertexCount; v++) {
-			minCorner = min(minCorner, m_mesh->texcoord(v));
-			maxCorner = max(maxCorner, m_mesh->texcoord(v));
+			minCorner = min(minCorner, m_unifiedMesh->texcoord(v));
+			maxCorner = max(maxCorner, m_unifiedMesh->texcoord(v));
 		}
 		return (maxCorner - minCorner) * 0.5f;
 	}
 
-	void restoreTexcoords()
-	{
-		memcpy(m_mesh->texcoords(), m_backupTexcoords.data(), m_mesh->vertexCount() * sizeof(Vector2));
+#if XA_CHECK_PIECEWISE_CHART_QUALITY
+	void evaluateQuality(UniformGrid2 &boundaryGrid) { // Runs the boundary-intersection and flipped-face checks on the unified mesh; never clears m_isInvalid, only sets it.
+		m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
+#if XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION
+		m_quality.computeFlippedFaces(m_unifiedMesh, &m_paramFlippedFaces);
+#else
+		m_quality.computeFlippedFaces(m_unifiedMesh, nullptr);
+#endif
+		if (m_quality.boundaryIntersection || m_quality.flippedTriangleCount > 0 || m_quality.zeroAreaTriangleCount > 0) // Same invalidity criteria as the LSCM path in parameterize().
+			m_isInvalid = true;
 	}
+#endif
 
-private:
-	void backupTexcoords()
-	{
-		m_backupTexcoords.resize(m_mesh->vertexCount());
-		memcpy(m_backupTexcoords.data(), m_mesh->texcoords(), m_mesh->vertexCount() * sizeof(Vector2));
+	void restoreTexcoords() {
+		memcpy(m_unifiedMesh->texcoords().data, m_backupTexcoords.data(), m_unifiedMesh->vertexCount() * sizeof(Vector2));
 	}
 
-	void destroyUnifiedMesh()
-	{
-		if (m_unifiedMesh) {
-			m_unifiedMesh->~Mesh();
-			XA_FREE(m_unifiedMesh);
-			m_unifiedMesh = nullptr;
-		}
-		if (m_unmodifiedUnifiedMesh) {
-			m_unmodifiedUnifiedMesh->~Mesh();
-			XA_FREE(m_unmodifiedUnifiedMesh);
-			m_unmodifiedUnifiedMesh = nullptr;
-		}
-		// Don't need this when unified meshes are destroyed.
-		m_chartVertexToUnifiedVertexMap.destroy();
+private:
+	void backupTexcoords() {
+		m_backupTexcoords.resize(m_unifiedMesh->vertexCount());
+		memcpy(m_backupTexcoords.data(), m_unifiedMesh->texcoords().data, m_unifiedMesh->vertexCount() * sizeof(Vector2));
 	}
 
 	Basis m_basis;
-	Mesh *m_mesh;
 	Mesh *m_unifiedMesh;
-	Mesh *m_unmodifiedUnifiedMesh; // Unified mesh before fixing t-junctions. Null if no t-junctions were fixed
-	ChartType::Enum m_type;
-	uint32_t m_warningFlags;
-	uint32_t m_initialFaceCount; // Before fixing T-junctions and/or closing holes.
-	uint32_t m_closedHolesCount, m_fixedTJunctionsCount;
+	ChartType m_type;
+	segment::ChartGeneratorType::Enum m_generatorType;
+	uint32_t m_tjunctionCount;
+
+	uint32_t m_originalVertexCount;
+	Array<uint32_t> m_originalIndices;
 
 	// List of faces of the source mesh that belong to this chart.
 	Array<uint32_t> m_faceToSourceFaceMap;
@@ -7604,47 +7040,46 @@ private:
 	bool m_isInvalid;
 };
 
-struct CreateAndParameterizeChartTaskArgs
-{
-	const Basis *basis;
+struct CreateAndParameterizeChartTaskGroupArgs { // Shared by all tasks of one create-and-parameterize task group (received as groupUserData).
+	Progress *progress; // Tasks add their face count here when they finish.
 	ThreadLocal<UniformGrid2> *boundaryGrid;
+	ThreadLocal<ChartCtorBuffers> *chartBuffers; // Scratch buffers handed to the piecewise Chart constructor.
+	const ChartOptions *options; // Passed to Chart::parameterize.
+	ThreadLocal<PiecewiseParam> *pp; // Used to recompute charts whose parameterization is invalid.
+};
+
+struct CreateAndParameterizeChartTaskArgs { // Per-chart task arguments (received as taskUserData).
+	const Basis *basis; // Chart basis handed to the Chart constructor.
 	Chart *chart; // output
 	Array<Chart *> charts; // output (if more than one chart)
-	ThreadLocal<ChartCtorBuffers> *chartBuffers;
+	segment::ChartGeneratorType::Enum chartGeneratorType; // Forwarded to the Chart constructor.
 	const Mesh *mesh;
-	const ParameterizeOptions *options;
-#if XA_RECOMPUTE_CHARTS
-	ThreadLocal<PiecewiseParam> *pp;
-#endif
 	ConstArrayView<uint32_t> faces;
 	uint32_t chartGroupId;
 	uint32_t chartId;
 };
 
-static void runCreateAndParameterizeChartTask(void *userData)
-{
-	auto args = (CreateAndParameterizeChartTaskArgs *)userData;
+static void runCreateAndParameterizeChartTask(void *groupUserData, void *taskUserData) {
+	XA_PROFILE_START(createChartMeshAndParameterizeThread)
+	auto groupArgs = (CreateAndParameterizeChartTaskGroupArgs *)groupUserData;
+	auto args = (CreateAndParameterizeChartTaskArgs *)taskUserData;
 	XA_PROFILE_START(createChartMesh)
-	args->chart = XA_NEW_ARGS(MemTag::Default, Chart, args->chartBuffers->get(), *args->options, *args->basis, args->faces, args->mesh, args->chartGroupId, args->chartId);
+	args->chart = XA_NEW_ARGS(MemTag::Default, Chart, *args->basis, args->chartGeneratorType, args->faces, args->mesh, args->chartGroupId, args->chartId);
 	XA_PROFILE_END(createChartMesh)
-	args->chart->parameterize(*args->options, args->boundaryGrid->get());
+	XA_PROFILE_START(parameterizeCharts)
+	args->chart->parameterize(*groupArgs->options, groupArgs->boundaryGrid->get());
+	XA_PROFILE_END(parameterizeCharts)
 #if XA_RECOMPUTE_CHARTS
-	if (!args->chart->isInvalid())
+	if (!args->chart->isInvalid()) {
+		XA_PROFILE_END(createChartMeshAndParameterizeThread)
 		return;
+	}
 	// Recompute charts with invalid parameterizations.
 	XA_PROFILE_START(parameterizeChartsRecompute)
 	Chart *invalidChart = args->chart;
-	// Fixing t-junctions rewrites unified mesh faces, and we need to map faces back to input mesh. So use the unmodified unified mesh.
-	const Mesh *invalidMesh = invalidChart->unmodifiedUnifiedMesh();
-	uint32_t faceCount = 0;
-	if (invalidMesh) {
-		faceCount = invalidMesh->faceCount();
-	} else {
-		invalidMesh = invalidChart->unifiedMesh();
-		faceCount = invalidChart->initialFaceCount(); // Not invalidMesh->faceCount(). Don't want faces added by hole closing.
-	}
-	PiecewiseParam &pp = args->pp->get();
-	pp.reset(invalidMesh, faceCount);
+	const Mesh *invalidMesh = invalidChart->unifiedMesh();
+	PiecewiseParam &pp = groupArgs->pp->get();
+	pp.reset(invalidMesh);
 #if XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS
 	char filename[256];
 	XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_chart_%03u_recomputed.obj", args->mesh->id(), args->chartGroupId, args->chartId);
@@ -7658,7 +7093,10 @@ static void runCreateAndParameterizeChartTask(void *userData)
 		XA_PROFILE_END(parameterizeChartsPiecewise)
 		if (!facesRemaining)
 			break;
-		Chart *chart = XA_NEW_ARGS(MemTag::Default, Chart, args->chartBuffers->get(), invalidChart, invalidMesh, pp.chartFaces(), pp.texcoords(), args->mesh);
+		Chart *chart = XA_NEW_ARGS(MemTag::Default, Chart, groupArgs->chartBuffers->get(), invalidChart, invalidMesh, pp.chartFaces(), pp.texcoords(), args->mesh);
+#if XA_CHECK_PIECEWISE_CHART_QUALITY
+		chart->evaluateQuality(groupArgs->boundaryGrid->get()); // boundaryGrid lives in the task group args; the per-task args no longer have it.
+#endif
 		args->charts.push_back(chart);
 #if XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS
 		if (file) {
@@ -7686,50 +7124,63 @@ static void runCreateAndParameterizeChartTask(void *userData)
 #endif
 	XA_PROFILE_END(parameterizeChartsRecompute)
 #endif // XA_RECOMPUTE_CHARTS
+	XA_PROFILE_END(createChartMeshAndParameterizeThread)
+	// Update progress.
+	groupArgs->progress->increment(args->faces.length);
 }
 
 // Set of charts corresponding to mesh faces in the same face group.
-class ChartGroup
-{
+class ChartGroup {
 public:
-	ChartGroup(uint32_t id, const Mesh *sourceMesh, const MeshFaceGroups *sourceMeshFaceGroups, MeshFaceGroups::Handle faceGroup) : m_id(id), m_sourceMesh(sourceMesh), m_sourceMeshFaceGroups(sourceMeshFaceGroups), m_faceGroup(faceGroup), m_faceCount(0), m_paramAddedChartsCount(0), m_paramDeletedChartsCount(0)
-	{
+	ChartGroup(uint32_t id, const Mesh *sourceMesh, const MeshFaceGroups *sourceMeshFaceGroups, MeshFaceGroups::Handle faceGroup) :
+			m_id(id), m_sourceMesh(sourceMesh), m_sourceMeshFaceGroups(sourceMeshFaceGroups), m_faceGroup(faceGroup) {
 	}
 
-	~ChartGroup()
-	{
+	~ChartGroup() {
 		for (uint32_t i = 0; i < m_charts.size(); i++) {
 			m_charts[i]->~Chart();
 			XA_FREE(m_charts[i]);
 		}
 	}
 
-	uint32_t segmentChartCount() const { return m_chartBasis.size(); }
 	uint32_t chartCount() const { return m_charts.size(); }
 	Chart *chartAt(uint32_t i) const { return m_charts[i]; }
-	uint32_t faceCount() const { return m_faceCount; }
-	uint32_t paramAddedChartsCount() const { return m_paramAddedChartsCount; }
-	uint32_t paramDeletedChartsCount() const { return m_paramDeletedChartsCount; }
+	uint32_t faceCount() const { return m_sourceMeshFaceGroups->faceCount(m_faceGroup); }
 
-	void computeChartFaces(const ChartOptions &options, segment::Atlas &atlas)
-	{
+	void computeCharts(TaskScheduler *taskScheduler, const ChartOptions &options, Progress *progress, segment::Atlas &atlas, ThreadLocal<UniformGrid2> *boundaryGrid, ThreadLocal<ChartCtorBuffers> *chartBuffers, ThreadLocal<PiecewiseParam> *piecewiseParam) {
+		// This function may be called multiple times, so destroy existing charts.
+		for (uint32_t i = 0; i < m_charts.size(); i++) {
+			m_charts[i]->~Chart();
+			XA_FREE(m_charts[i]);
+		}
 		// Create mesh from source mesh, using only the faces in this face group.
 		XA_PROFILE_START(createChartGroupMesh)
 		Mesh *mesh = createMesh();
 		XA_PROFILE_END(createChartGroupMesh)
 		// Segment mesh into charts (arrays of faces).
 #if XA_DEBUG_SINGLE_CHART
-		m_chartBasis.resize(1);
-		Fit::computeBasis(&mesh->position(0), mesh->vertexCount(), &m_chartBasis[0]);
-		m_chartFaces.resize(1 + mesh->faceCount());
-		m_chartFaces[0] = mesh->faceCount();
-		for (uint32_t i = 0; i < m_chartFaces.size(); i++)
-			m_chartFaces[i + 1] = i;
+		XA_UNUSED(options);
+		XA_UNUSED(atlas);
+		const uint32_t chartCount = 1;
+		uint32_t offset;
+		Basis chartBasis;
+		Fit::computeBasis(&mesh->position(0), mesh->vertexCount(), &chartBasis);
+		Array<uint32_t> chartFaces;
+		chartFaces.resize(1 + mesh->faceCount());
+		chartFaces[0] = mesh->faceCount();
+		for (uint32_t i = 0; i < chartFaces.size() - 1; i++)
+			chartFaces[i + 1] = m_faceToSourceFaceMap[i];
+		// Destroy mesh.
+		const uint32_t faceCount = mesh->faceCount();
+		mesh->~Mesh();
+		XA_FREE(mesh);
 #else
 		XA_PROFILE_START(buildAtlas)
 		atlas.reset(mesh, options);
 		atlas.compute();
 		XA_PROFILE_END(buildAtlas)
+		// Update progress.
+		progress->increment(faceCount());
 #if XA_DEBUG_EXPORT_OBJ_CHARTS
 		char filename[256];
 		XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_charts.obj", m_sourceMesh->id(), m_id);
@@ -7745,7 +7196,6 @@ public:
 					mesh->writeObjFace(file, faces[f]);
 			}
 			mesh->writeObjBoundaryEges(file);
-			mesh->writeObjLinkedBoundaries(file);
 			fclose(file);
 		}
 #endif
@@ -7754,65 +7204,57 @@ public:
 		mesh->~Mesh();
 		XA_FREE(mesh);
 		XA_PROFILE_START(copyChartFaces)
-		// Copy basis.
-		const uint32_t chartCount = atlas.chartCount();
-		m_chartBasis.resize(chartCount);
-		for (uint32_t i = 0; i < chartCount; i++)
-			m_chartBasis[i] = atlas.chartBasis(i);
+		if (progress->cancel)
+			return;
 		// Copy faces from segment::Atlas to m_chartFaces array with <chart 0 face count> <face 0> <face n> <chart 1 face count> etc. encoding.
 		// segment::Atlas faces refer to the chart group mesh. Map them to the input mesh instead.
-		m_chartFaces.resize(chartCount + faceCount);
+		const uint32_t chartCount = atlas.chartCount();
+		Array<uint32_t> chartFaces;
+		chartFaces.resize(chartCount + faceCount);
 		uint32_t offset = 0;
 		for (uint32_t i = 0; i < chartCount; i++) {
 			ConstArrayView<uint32_t> faces = atlas.chartFaces(i);
-			m_chartFaces[offset++] = faces.length;
+			chartFaces[offset++] = faces.length;
 			for (uint32_t j = 0; j < faces.length; j++)
-				m_chartFaces[offset++] = m_faceToSourceFaceMap[faces[j]];
+				chartFaces[offset++] = m_faceToSourceFaceMap[faces[j]];
 		}
 		XA_PROFILE_END(copyChartFaces)
 #endif
-	}
-
-#if XA_RECOMPUTE_CHARTS
-	void parameterizeCharts(TaskScheduler *taskScheduler, const ParameterizeOptions &options, ThreadLocal<UniformGrid2> *boundaryGrid, ThreadLocal<ChartCtorBuffers> *chartBuffers, ThreadLocal<PiecewiseParam> *piecewiseParam)
-#else
-	void parameterizeCharts(TaskScheduler* taskScheduler, const ParameterizeOptions &options, ThreadLocal<UniformGrid2>* boundaryGrid, ThreadLocal<ChartCtorBuffers>* chartBuffers)
-#endif
-	{
-		// This function may be called multiple times, so destroy existing charts.
-		for (uint32_t i = 0; i < m_charts.size(); i++) {
-			m_charts[i]->~Chart();
-			XA_FREE(m_charts[i]);
-		}
-		m_paramAddedChartsCount = 0;
-		const uint32_t chartCount = m_chartBasis.size();
+		XA_PROFILE_START(createChartMeshAndParameterizeReal)
+		CreateAndParameterizeChartTaskGroupArgs groupArgs;
+		groupArgs.progress = progress;
+		groupArgs.boundaryGrid = boundaryGrid;
+		groupArgs.chartBuffers = chartBuffers;
+		groupArgs.options = &options;
+		groupArgs.pp = piecewiseParam;
+		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(&groupArgs, chartCount);
 		Array<CreateAndParameterizeChartTaskArgs> taskArgs;
 		taskArgs.resize(chartCount);
 		taskArgs.runCtors(); // Has Array member.
-		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartCount);
-		uint32_t offset = 0;
+		offset = 0;
 		for (uint32_t i = 0; i < chartCount; i++) {
 			CreateAndParameterizeChartTaskArgs &args = taskArgs[i];
-			args.basis = &m_chartBasis[i];
-			args.boundaryGrid = boundaryGrid;
+#if XA_DEBUG_SINGLE_CHART
+			args.basis = &chartBasis;
+			args.chartGeneratorType = segment::ChartGeneratorType::Piecewise; // Replaces args.isPlanar, which is not a member of the task args. NOTE(review): picked as a non-OriginalUv type so parameterize() still computes UVs - confirm the preferred enum value.
+#else
+			args.basis = &atlas.chartBasis(i);
+			args.chartGeneratorType = atlas.chartGeneratorType(i);
+#endif
 			args.chart = nullptr;
 			args.chartGroupId = m_id;
 			args.chartId = i;
-			args.chartBuffers = chartBuffers;
-			const uint32_t faceCount = m_chartFaces[offset++];
-			args.faces = ConstArrayView<uint32_t>(&m_chartFaces[offset], faceCount);
-			offset += faceCount;
+			const uint32_t chartFaceCount = chartFaces[offset++];
+			args.faces = ConstArrayView<uint32_t>(&chartFaces[offset], chartFaceCount);
+			offset += chartFaceCount;
 			args.mesh = m_sourceMesh;
-			args.options = &options;
-#if XA_RECOMPUTE_CHARTS
-			args.pp = piecewiseParam;
-#endif
 			Task task;
 			task.userData = &args;
 			task.func = runCreateAndParameterizeChartTask;
 			taskScheduler->run(taskGroup, task);
 		}
 		taskScheduler->wait(&taskGroup);
+		XA_PROFILE_END(createChartMeshAndParameterizeReal)
 #if XA_RECOMPUTE_CHARTS
 		// Count charts. Skip invalid ones and include new ones added by recomputing.
 		uint32_t newChartCount = 0;
@@ -7830,7 +7272,6 @@ public:
 			if (chart->isInvalid()) {
 				chart->~Chart();
 				XA_FREE(chart);
-				m_paramDeletedChartsCount++;
 				continue;
 			}
 			m_charts[current++] = chart;
@@ -7838,10 +7279,8 @@ public:
 		// Now add new charts.
 		for (uint32_t i = 0; i < chartCount; i++) {
 			CreateAndParameterizeChartTaskArgs &args = taskArgs[i];
-			for (uint32_t j = 0; j < args.charts.size(); j++) {
+			for (uint32_t j = 0; j < args.charts.size(); j++)
 				m_charts[current++] = args.charts[j];
-				m_paramAddedChartsCount++;
-			}
 		}
 #else // XA_RECOMPUTE_CHARTS
 		m_charts.resize(chartCount);
@@ -7852,15 +7291,14 @@ public:
 	}
 
 private:
-	Mesh *createMesh()
-	{
+	Mesh *createMesh() {
 		XA_DEBUG_ASSERT(m_faceGroup != MeshFaceGroups::kInvalid);
 		// Create new mesh from the source mesh, using faces that belong to this group.
 		m_faceToSourceFaceMap.reserve(m_sourceMeshFaceGroups->faceCount(m_faceGroup));
 		for (MeshFaceGroups::Iterator it(m_sourceMeshFaceGroups, m_faceGroup); !it.isDone(); it.advance())
 			m_faceToSourceFaceMap.push_back(it.face());
 		// Only initial meshes has ignored faces. The only flag we care about is HasNormals.
-		const uint32_t faceCount = m_faceCount = m_faceToSourceFaceMap.size();
+		const uint32_t faceCount = m_faceToSourceFaceMap.size();
 		XA_DEBUG_ASSERT(faceCount > 0);
 		const uint32_t approxVertexCount = min(faceCount * 3, m_sourceMesh->vertexCount());
 		Mesh *mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, m_sourceMesh->epsilon(), approxVertexCount, faceCount, m_sourceMesh->flags() & MeshFlags::HasNormals);
@@ -7889,9 +7327,7 @@ private:
 				XA_DEBUG_ASSERT(indices[i] != UINT32_MAX);
 			}
 			// Don't copy flags - ignored faces aren't used by chart groups, they are handled by InvalidMeshGeometry.
-			Mesh::AddFaceResult::Enum result = mesh->addFace(indices);
-			XA_UNUSED(result);
-			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
+			mesh->addFace(indices);
 		}
 		XA_PROFILE_START(createChartGroupMeshColocals)
 		mesh->createColocals();
@@ -7909,98 +7345,57 @@ private:
 	}
 
 	const uint32_t m_id;
-	const Mesh * const m_sourceMesh;
-	const MeshFaceGroups * const m_sourceMeshFaceGroups;
+	const Mesh *const m_sourceMesh;
+	const MeshFaceGroups *const m_sourceMeshFaceGroups;
 	const MeshFaceGroups::Handle m_faceGroup;
 	Array<uint32_t> m_faceToSourceFaceMap; // List of faces of the source mesh that belong to this chart group.
-	Array<Basis> m_chartBasis; // Copied from segment::Atlas.
-	Array<uint32_t> m_chartFaces; // Copied from segment::Atlas. Encoding: <chart 0 face count> <face 0> <face n> <chart 1 face count> etc.
 	Array<Chart *> m_charts;
-	uint32_t m_faceCount; // Set by createMesh(). Used for sorting.
-	uint32_t m_paramAddedChartsCount; // Number of new charts added by recomputing charts with invalid parameterizations.
-	uint32_t m_paramDeletedChartsCount; // Number of charts with invalid parameterizations that were deleted, after charts were recomputed.
-};
-
-// References invalid faces and vertices in a mesh.
-struct InvalidMeshGeometry
-{
-	// Invalid faces have the face groups MeshFaceGroups::kInvalid.
-	void extract(const Mesh *mesh, const MeshFaceGroups *meshFaceGroups)
-	{
-		// Copy invalid faces.
-		m_faces.clear();
-		const uint32_t meshFaceCount = mesh->faceCount();
-		for (uint32_t f = 0; f < meshFaceCount; f++) {
-			if (meshFaceGroups->groupAt(f) == MeshFaceGroups::kInvalid)
-				m_faces.push_back(f);
-		}
-		// Create *unique* list of vertices of invalid faces.
-		const uint32_t faceCount = m_faces.size();
-		m_indices.resize(faceCount * 3);
-		const uint32_t approxVertexCount = min(faceCount * 3, mesh->vertexCount());
-		m_vertexToSourceVertexMap.clear();
-		m_vertexToSourceVertexMap.reserve(approxVertexCount);
-		HashMap<uint32_t, PassthroughHash<uint32_t>> sourceVertexToVertexMap(MemTag::Mesh, approxVertexCount);
-		for (uint32_t f = 0; f < faceCount; f++) {
-			const uint32_t face = m_faces[f];
-			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t vertex = mesh->vertexAt(face * 3 + i);
-				uint32_t newVertex = sourceVertexToVertexMap.get(vertex);
-				if (newVertex == UINT32_MAX) {
-					newVertex = sourceVertexToVertexMap.add(vertex);
-					m_vertexToSourceVertexMap.push_back(vertex);
-				}
-				m_indices[f * 3 + i] = newVertex;
-			}
-		}
-	}
-
-	ConstArrayView<uint32_t> faces() const { return m_faces; }
-	ConstArrayView<uint32_t> indices() const { return m_indices; }
-	ConstArrayView<uint32_t> vertices() const { return m_vertexToSourceVertexMap; }
-
-private:
-	Array<uint32_t> m_faces, m_indices;
-	Array<uint32_t> m_vertexToSourceVertexMap; // Map face vertices to vertices of the source mesh.
 };
 
-struct ChartGroupComputeChartFacesTaskArgs
-{
+struct ChartGroupComputeChartsTaskGroupArgs { // Shared by every chart group task; the ChartGroup itself arrives as taskUserData.
 	ThreadLocal<segment::Atlas> *atlas;
-	ChartGroup *chartGroup;
 	const ChartOptions *options;
 	Progress *progress;
+	TaskScheduler *taskScheduler; // Handed to ChartGroup::computeCharts, which creates its per-chart task group on it.
+	ThreadLocal<UniformGrid2> *boundaryGrid; // Forwarded to ChartGroup::computeCharts.
+	ThreadLocal<ChartCtorBuffers> *chartBuffers; // Forwarded to ChartGroup::computeCharts.
+	ThreadLocal<PiecewiseParam> *piecewiseParam; // Forwarded to ChartGroup::computeCharts.
 };
 
-static void runChartGroupComputeChartFacesJob(void *userData)
-{
-	auto args = (ChartGroupComputeChartFacesTaskArgs *)userData;
+static void runChartGroupComputeChartsTask(void *groupUserData, void *taskUserData) {
+	auto args = (ChartGroupComputeChartsTaskGroupArgs *)groupUserData;
+	auto chartGroup = (ChartGroup *)taskUserData;
 	if (args->progress->cancel)
 		return;
 	XA_PROFILE_START(chartGroupComputeChartsThread)
-	args->chartGroup->computeChartFaces(*args->options, args->atlas->get());
+	chartGroup->computeCharts(args->taskScheduler, *args->options, args->progress, args->atlas->get(), args->boundaryGrid, args->chartBuffers, args->piecewiseParam);
 	XA_PROFILE_END(chartGroupComputeChartsThread)
 }
 
-struct MeshComputeChartFacesTaskArgs
-{
-	Array<ChartGroup *> *chartGroups; // output
-	InvalidMeshGeometry *invalidMeshGeometry; // output
+struct MeshComputeChartsTaskGroupArgs { // Shared by every per-mesh task (groupUserData of runMeshComputeChartsTask).
 	ThreadLocal<segment::Atlas> *atlas;
 	const ChartOptions *options;
 	Progress *progress;
-	const Mesh *sourceMesh;
 	TaskScheduler *taskScheduler;
+	ThreadLocal<UniformGrid2> *boundaryGrid; // Copied into the chart group task group args.
+	ThreadLocal<ChartCtorBuffers> *chartBuffers; // Copied into the chart group task group args.
+	ThreadLocal<PiecewiseParam> *piecewiseParam; // NOTE(review): presumably forwarded like the two fields above - the copy is outside this excerpt.
+};
+
+struct MeshComputeChartsTaskArgs { // Per-mesh task arguments (taskUserData of runMeshComputeChartsTask).
+	const Mesh *sourceMesh; // Passed to each new ChartGroup and to InvalidMeshGeometry::extract.
+	Array<ChartGroup *> *chartGroups; // output
+	InvalidMeshGeometry *invalidMeshGeometry; // output
 };
 
 #if XA_DEBUG_EXPORT_OBJ_FACE_GROUPS
 static uint32_t s_faceGroupsCurrentVertex = 0;
 #endif
 
-static void runMeshComputeChartFacesJob(void *userData)
-{
-	auto args = (MeshComputeChartFacesTaskArgs *)userData;
-	if (args->progress->cancel)
+static void runMeshComputeChartsTask(void *groupUserData, void *taskUserData) {
+	auto groupArgs = (MeshComputeChartsTaskGroupArgs *)groupUserData;
+	auto args = (MeshComputeChartsTaskArgs *)taskUserData;
+	if (groupArgs->progress->cancel)
 		return;
 	XA_PROFILE_START(computeChartsThread)
 	// Create face groups.
@@ -8009,7 +7404,7 @@ static void runMeshComputeChartFacesJob(void *userData)
 	meshFaceGroups->compute();
 	const uint32_t chartGroupCount = meshFaceGroups->groupCount();
 	XA_PROFILE_END(createFaceGroups)
-	if (args->progress->cancel)
+	if (groupArgs->progress->cancel)
 		goto cleanup;
 #if XA_DEBUG_EXPORT_OBJ_FACE_GROUPS
 	{
@@ -8053,33 +7448,41 @@ static void runMeshComputeChartFacesJob(void *userData)
 	for (uint32_t i = 0; i < chartGroupCount; i++)
 		(*args->chartGroups)[i] = XA_NEW_ARGS(MemTag::Default, ChartGroup, i, args->sourceMesh, meshFaceGroups, MeshFaceGroups::Handle(i));
 	// Extract invalid geometry via the invalid face group (MeshFaceGroups::kInvalid).
-	XA_PROFILE_START(extractInvalidMeshGeometry)
-	args->invalidMeshGeometry->extract(args->sourceMesh, meshFaceGroups);
-	XA_PROFILE_END(extractInvalidMeshGeometry)
-	// One task for each chart group - compute chart faces.
+	{
+		XA_PROFILE_START(extractInvalidMeshGeometry)
+		args->invalidMeshGeometry->extract(args->sourceMesh, meshFaceGroups);
+		XA_PROFILE_END(extractInvalidMeshGeometry)
+	}
+	// One task for each chart group - compute charts.
 	{
 		XA_PROFILE_START(chartGroupComputeChartsReal)
-		Array<ChartGroupComputeChartFacesTaskArgs> taskArgs;
-		taskArgs.resize(chartGroupCount);
-		for (uint32_t i = 0; i < chartGroupCount; i++) {
-			taskArgs[i].atlas = args->atlas;
-			taskArgs[i].chartGroup = (*args->chartGroups)[i];
-			taskArgs[i].options = args->options;
-			taskArgs[i].progress = args->progress;
-		}
-		TaskGroupHandle taskGroup = args->taskScheduler->createTaskGroup(chartGroupCount);
+		// Sort chart groups by face count.
+		Array<float> chartGroupSortData;
+		chartGroupSortData.resize(chartGroupCount);
+		for (uint32_t i = 0; i < chartGroupCount; i++)
+			chartGroupSortData[i] = (float)(*args->chartGroups)[i]->faceCount();
+		RadixSort chartGroupSort;
+		chartGroupSort.sort(chartGroupSortData);
+		// Larger chart groups are added first to reduce the chance of thread starvation.
+		ChartGroupComputeChartsTaskGroupArgs taskGroupArgs;
+		taskGroupArgs.atlas = groupArgs->atlas;
+		taskGroupArgs.options = groupArgs->options;
+		taskGroupArgs.progress = groupArgs->progress;
+		taskGroupArgs.taskScheduler = groupArgs->taskScheduler;
+		taskGroupArgs.boundaryGrid = groupArgs->boundaryGrid;
+		taskGroupArgs.chartBuffers = groupArgs->chartBuffers;
+		taskGroupArgs.piecewiseParam = groupArgs->piecewiseParam;
+		TaskGroupHandle taskGroup = groupArgs->taskScheduler->createTaskGroup(&taskGroupArgs, chartGroupCount);
 		for (uint32_t i = 0; i < chartGroupCount; i++) {
 			Task task;
-			task.userData = &taskArgs[i];
-			task.func = runChartGroupComputeChartFacesJob;
-			args->taskScheduler->run(taskGroup, task);
+			task.userData = (*args->chartGroups)[chartGroupSort.ranks()[chartGroupCount - i - 1]]; // Index via the face-count sort, walking ranks from the end, so submission order follows the sort.
+			task.func = runChartGroupComputeChartsTask;
+			groupArgs->taskScheduler->run(taskGroup, task);
 		}
-		args->taskScheduler->wait(&taskGroup);
+		groupArgs->taskScheduler->wait(&taskGroup);
 		XA_PROFILE_END(chartGroupComputeChartsReal)
 	}
 	XA_PROFILE_END(computeChartsThread)
-	args->progress->value++;
-	args->progress->update();
 cleanup:
 	if (meshFaceGroups) {
 		meshFaceGroups->~MeshFaceGroups();
@@ -8087,43 +7490,13 @@ cleanup:
 	}
 }
 
-struct ParameterizeChartsTaskArgs
-{
-	TaskScheduler *taskScheduler;
-	ChartGroup *chartGroup;
-	const ParameterizeOptions *options;
-	ThreadLocal<UniformGrid2> *boundaryGrid;
-	ThreadLocal<ChartCtorBuffers> *chartBuffers;
-#if XA_RECOMPUTE_CHARTS
-	ThreadLocal<PiecewiseParam> *piecewiseParam;
-#endif
-	Progress *progress;
-};
-
-static void runParameterizeChartsJob(void *userData)
-{
-	auto args = (ParameterizeChartsTaskArgs *)userData;
-	if (args->progress->cancel)
-		return;
-	XA_PROFILE_START(parameterizeChartsThread)
-#if XA_RECOMPUTE_CHARTS
-	args->chartGroup->parameterizeCharts(args->taskScheduler, *args->options, args->boundaryGrid, args->chartBuffers, args->piecewiseParam);
-#else
-	args->chartGroup->parameterizeCharts(args->taskScheduler, *args->options, args->boundaryGrid, args->chartBuffers);
-#endif
-	XA_PROFILE_END(parameterizeChartsThread)
-	args->progress->value++;
-	args->progress->update();
-}
-
 /// An atlas is a set of chart groups.
-class Atlas
-{
+class Atlas {
 public:
-	Atlas() : m_chartsComputed(false), m_chartsParameterized(false) {}
+	Atlas() :
+			m_chartsComputed(false) {}
 
-	~Atlas()
-	{
+	~Atlas() {
 		for (uint32_t i = 0; i < m_meshChartGroups.size(); i++) {
 			for (uint32_t j = 0; j < m_meshChartGroups[i].size(); j++) {
 				m_meshChartGroups[i][j]->~ChartGroup();
@@ -8137,22 +7510,25 @@ public:
 	uint32_t meshCount() const { return m_meshes.size(); }
 	const InvalidMeshGeometry &invalidMeshGeometry(uint32_t meshIndex) const { return m_invalidMeshGeometry[meshIndex]; }
 	bool chartsComputed() const { return m_chartsComputed; }
-	bool chartsParameterized() const { return m_chartsParameterized; }
 	uint32_t chartGroupCount(uint32_t mesh) const { return m_meshChartGroups[mesh].size(); }
 	const ChartGroup *chartGroupAt(uint32_t mesh, uint32_t group) const { return m_meshChartGroups[mesh][group]; }
 
-	void addMesh(const Mesh *mesh)
-	{
+	void addMesh(const Mesh *mesh) {
 		m_meshes.push_back(mesh);
 	}
 
-	bool computeCharts(TaskScheduler *taskScheduler, const ChartOptions &options, ProgressFunc progressFunc, void *progressUserData)
-	{
+	bool computeCharts(TaskScheduler *taskScheduler, const ChartOptions &options, ProgressFunc progressFunc, void *progressUserData) {
+		XA_PROFILE_START(computeChartsReal)
 #if XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS
 		segment::s_planarRegionsCurrentRegion = segment::s_planarRegionsCurrentVertex = 0;
 #endif
+		// Progress is per-face x 2 (1 for chart faces, 1 for parameterized chart faces).
+		const uint32_t meshCount = m_meshes.size();
+		uint32_t totalFaceCount = 0;
+		for (uint32_t i = 0; i < meshCount; i++)
+			totalFaceCount += m_meshes[i]->faceCount();
+		Progress progress(ProgressCategory::ComputeCharts, progressFunc, progressUserData, totalFaceCount * 2);
 		m_chartsComputed = false;
-		m_chartsParameterized = false;
 		// Clear chart groups, since this function may be called multiple times.
 		if (!m_meshChartGroups.isEmpty()) {
 			for (uint32_t i = 0; i < m_meshChartGroups.size(); i++) {
@@ -8162,27 +7538,20 @@ public:
 				}
 				m_meshChartGroups[i].clear();
 			}
-			XA_ASSERT(m_meshChartGroups.size() == m_meshes.size()); // The number of meshes shouldn't have changed.
+			XA_ASSERT(m_meshChartGroups.size() == meshCount); // The number of meshes shouldn't have changed.
 		}
-		m_meshChartGroups.resize(m_meshes.size());
+		m_meshChartGroups.resize(meshCount);
 		m_meshChartGroups.runCtors();
-		m_invalidMeshGeometry.resize(m_meshes.size());
+		m_invalidMeshGeometry.resize(meshCount);
 		m_invalidMeshGeometry.runCtors();
 		// One task per mesh.
-		const uint32_t meshCount = m_meshes.size();
-		Progress progress(ProgressCategory::ComputeCharts, progressFunc, progressUserData, meshCount);
-		ThreadLocal<segment::Atlas> atlas;
-		Array<MeshComputeChartFacesTaskArgs> taskArgs;
+		Array<MeshComputeChartsTaskArgs> taskArgs;
 		taskArgs.resize(meshCount);
 		for (uint32_t i = 0; i < meshCount; i++) {
-			MeshComputeChartFacesTaskArgs &args = taskArgs[i];
-			args.atlas = &atlas;
+			MeshComputeChartsTaskArgs &args = taskArgs[i];
+			args.sourceMesh = m_meshes[i];
 			args.chartGroups = &m_meshChartGroups[i];
 			args.invalidMeshGeometry = &m_invalidMeshGeometry[i];
-			args.options = &options;
-			args.progress = &progress;
-			args.sourceMesh = m_meshes[i];
-			args.taskScheduler = taskScheduler;
 		}
 		// Sort meshes by indexCount.
 		Array<float> meshSortData;
@@ -8192,105 +7561,53 @@ public:
 		RadixSort meshSort;
 		meshSort.sort(meshSortData);
 		// Larger meshes are added first to reduce the chance of thread starvation.
-		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(meshCount);
-		for (uint32_t i = 0; i < meshCount; i++) {
-			Task task;
-			task.userData = &taskArgs[meshSort.ranks()[meshCount - i - 1]];
-			task.func = runMeshComputeChartFacesJob;
-			taskScheduler->run(taskGroup, task);
-		}
-		taskScheduler->wait(&taskGroup);
-		if (progress.cancel)
-			return false;
-		m_chartsComputed = true;
-		return true;
-	}
-
-	bool parameterizeCharts(TaskScheduler *taskScheduler, const ParameterizeOptions &options, ProgressFunc progressFunc, void *progressUserData)
-	{
-		m_chartsParameterized = false;
-		uint32_t chartGroupCount = 0;
-		for (uint32_t i = 0; i < m_meshChartGroups.size(); i++)
-			chartGroupCount += m_meshChartGroups[i].size();
-		Progress progress(ProgressCategory::ParameterizeCharts, progressFunc, progressUserData, chartGroupCount);
+		ThreadLocal<segment::Atlas> atlas;
 		ThreadLocal<UniformGrid2> boundaryGrid; // For Quality boundary intersection.
 		ThreadLocal<ChartCtorBuffers> chartBuffers;
-#if XA_RECOMPUTE_CHARTS
 		ThreadLocal<PiecewiseParam> piecewiseParam;
-#endif
-		Array<ParameterizeChartsTaskArgs> taskArgs;
-		taskArgs.resize(chartGroupCount);
-		{
-			uint32_t k = 0;
-			for (uint32_t i = 0; i < m_meshChartGroups.size(); i++) {
-				const uint32_t count = m_meshChartGroups[i].size();
-				for (uint32_t j = 0; j < count; j++) {
-					ParameterizeChartsTaskArgs &args = taskArgs[k];
-					args.taskScheduler = taskScheduler;
-					args.chartGroup = m_meshChartGroups[i][j];
-					args.options = &options;
-					args.boundaryGrid = &boundaryGrid;
-					args.chartBuffers = &chartBuffers;
-#if XA_RECOMPUTE_CHARTS
-					args.piecewiseParam = &piecewiseParam;
-#endif
-					args.progress = &progress;
-					k++;
-				}
-			}
-		}
-		// Sort chart groups by face count.
-		Array<float> chartGroupSortData;
-		chartGroupSortData.resize(chartGroupCount);
-		{
-			uint32_t k = 0;
-			for (uint32_t i = 0; i < m_meshChartGroups.size(); i++) {
-				const uint32_t count = m_meshChartGroups[i].size();
-				for (uint32_t j = 0; j < count; j++) {
-					chartGroupSortData[k++] = (float)m_meshChartGroups[i][j]->faceCount();
-				}
-			}
-		}
-		RadixSort chartGroupSort;
-		chartGroupSort.sort(chartGroupSortData);
-		// Larger chart groups are added first to reduce the chance of thread starvation.
-		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartGroupCount);
-		for (uint32_t i = 0; i < chartGroupCount; i++) {
+		MeshComputeChartsTaskGroupArgs taskGroupArgs;
+		taskGroupArgs.atlas = &atlas;
+		taskGroupArgs.options = &options;
+		taskGroupArgs.progress = &progress;
+		taskGroupArgs.taskScheduler = taskScheduler;
+		taskGroupArgs.boundaryGrid = &boundaryGrid;
+		taskGroupArgs.chartBuffers = &chartBuffers;
+		taskGroupArgs.piecewiseParam = &piecewiseParam;
+		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(&taskGroupArgs, meshCount);
+		for (uint32_t i = 0; i < meshCount; i++) {
 			Task task;
-			task.userData = &taskArgs[chartGroupSort.ranks()[chartGroupCount - i - 1]];
-			task.func = runParameterizeChartsJob;
+			task.userData = &taskArgs[meshSort.ranks()[meshCount - i - 1]];
+			task.func = runMeshComputeChartsTask;
 			taskScheduler->run(taskGroup, task);
 		}
 		taskScheduler->wait(&taskGroup);
+		XA_PROFILE_END(computeChartsReal)
 		if (progress.cancel)
 			return false;
-		m_chartsParameterized = true;
+		m_chartsComputed = true;
 		return true;
 	}
 
 private:
 	Array<const Mesh *> m_meshes;
 	Array<InvalidMeshGeometry> m_invalidMeshGeometry; // 1 per mesh.
-	Array<Array<ChartGroup *> > m_meshChartGroups;
+	Array<Array<ChartGroup *>> m_meshChartGroups;
 	bool m_chartsComputed;
-	bool m_chartsParameterized;
 };
 
 } // namespace param
 
 namespace pack {
 
-class AtlasImage
-{
+class AtlasImage {
 public:
-	AtlasImage(uint32_t width, uint32_t height) : m_width(width), m_height(height)
-	{
+	AtlasImage(uint32_t width, uint32_t height) :
+			m_width(width), m_height(height) {
 		m_data.resize(m_width * m_height);
 		memset(m_data.data(), 0, sizeof(uint32_t) * m_data.size());
 	}
 
-	void resize(uint32_t width, uint32_t height)
-	{
+	void resize(uint32_t width, uint32_t height) {
 		Array<uint32_t> data;
 		data.resize(width * height);
 		memset(data.data(), 0, sizeof(uint32_t) * data.size());
@@ -8301,8 +7618,7 @@ public:
 		data.moveTo(m_data);
 	}
 
-	void addChart(uint32_t chartIndex, const BitImage *image, const BitImage *imageBilinear, const BitImage *imagePadding, int atlas_w, int atlas_h, int offset_x, int offset_y)
-	{
+	void addChart(uint32_t chartIndex, const BitImage *image, const BitImage *imageBilinear, const BitImage *imagePadding, int atlas_w, int atlas_h, int offset_x, int offset_y) {
 		const int w = image->width();
 		const int h = image->height();
 		for (int y = 0; y < h; y++) {
@@ -8328,15 +7644,13 @@ public:
 		}
 	}
 
-	void copyTo(uint32_t *dest, uint32_t destWidth, uint32_t destHeight, int padding) const
-	{
+	void copyTo(uint32_t *dest, uint32_t destWidth, uint32_t destHeight, int padding) const {
 		for (uint32_t y = 0; y < destHeight; y++)
 			memcpy(&dest[y * destWidth], &m_data[padding + (y + padding) * m_width], destWidth * sizeof(uint32_t));
 	}
 
 #if XA_DEBUG_EXPORT_ATLAS_IMAGES
-	void writeTga(const char *filename, uint32_t width, uint32_t height) const
-	{
+	void writeTga(const char *filename, uint32_t width, uint32_t height) const {
 		Array<uint8_t> image;
 		image.resize(width * height * 3);
 		for (uint32_t y = 0; y < height; y++) {
@@ -8378,18 +7692,14 @@ private:
 	Array<uint32_t> m_data;
 };
 
-struct Chart
-{
+struct Chart {
 	int32_t atlasIndex;
 	uint32_t material;
-	uint32_t indexCount;
-	const uint32_t *indices;
+	ConstArrayView<uint32_t> indices;
 	float parametricArea;
 	float surfaceArea;
-	Vector2 *vertices;
-	uint32_t vertexCount;
+	ArrayView<Vector2> vertices;
 	Array<uint32_t> uniqueVertices;
-	bool allowRotate;
 	// bounding box
 	Vector2 majorAxis, minorAxis, minCorner, maxCorner;
 	// Mesh only
@@ -8398,29 +7708,26 @@ struct Chart
 	Array<uint32_t> faces;
 
 	Vector2 &uniqueVertexAt(uint32_t v) { return uniqueVertices.isEmpty() ? vertices[v] : vertices[uniqueVertices[v]]; }
-	uint32_t uniqueVertexCount() const { return uniqueVertices.isEmpty() ? vertexCount : uniqueVertices.size(); }
+	uint32_t uniqueVertexCount() const { return uniqueVertices.isEmpty() ? vertices.length : uniqueVertices.size(); }
 };
 
-struct AddChartTaskArgs
-{
-	ThreadLocal<BoundingBox2D> *boundingBox;
+struct AddChartTaskArgs {
 	param::Chart *paramChart;
 	Chart *chart; // out
 };
 
-static void runAddChartTask(void *userData)
-{
+static void runAddChartTask(void *groupUserData, void *taskUserData) {
 	XA_PROFILE_START(packChartsAddChartsThread)
-	auto args = (AddChartTaskArgs *)userData;
+	auto boundingBox = (ThreadLocal<BoundingBox2D> *)groupUserData;
+	auto args = (AddChartTaskArgs *)taskUserData;
 	param::Chart *paramChart = args->paramChart;
 	XA_PROFILE_START(packChartsAddChartsRestoreTexcoords)
 	paramChart->restoreTexcoords();
 	XA_PROFILE_END(packChartsAddChartsRestoreTexcoords)
-	Mesh *mesh = paramChart->mesh();
+	Mesh *mesh = paramChart->unifiedMesh();
 	Chart *chart = args->chart = XA_NEW(MemTag::Default, Chart);
 	chart->atlasIndex = -1;
 	chart->material = 0;
-	chart->indexCount = mesh->indexCount();
 	chart->indices = mesh->indices();
 	chart->parametricArea = mesh->computeParametricArea();
 	if (chart->parametricArea < kAreaEpsilon) {
@@ -8430,17 +7737,15 @@ static void runAddChartTask(void *userData)
 	}
 	chart->surfaceArea = mesh->computeSurfaceArea();
 	chart->vertices = mesh->texcoords();
-	chart->vertexCount = mesh->vertexCount();
-	chart->allowRotate = true;
 	chart->boundaryEdges = &mesh->boundaryEdges();
 	// Compute bounding box of chart.
-	BoundingBox2D &bb = args->boundingBox->get();
+	BoundingBox2D &bb = boundingBox->get();
 	bb.clear();
-	for (uint32_t v = 0; v < chart->vertexCount; v++) {
+	for (uint32_t v = 0; v < chart->vertices.length; v++) {
 		if (mesh->isBoundaryVertex(v))
 			bb.appendBoundaryVertex(mesh->texcoord(v));
 	}
-	bb.compute(mesh->texcoords(), mesh->vertexCount());
+	bb.compute(mesh->texcoords());
 	chart->majorAxis = bb.majorAxis;
 	chart->minorAxis = bb.minorAxis;
 	chart->minCorner = bb.minCorner;
@@ -8448,10 +7753,8 @@ static void runAddChartTask(void *userData)
 	XA_PROFILE_END(packChartsAddChartsThread)
 }
 
-struct Atlas
-{
-	~Atlas()
-	{
+struct Atlas {
+	~Atlas() {
 		for (uint32_t i = 0; i < m_atlasImages.size(); i++) {
 			m_atlasImages[i]->~AtlasImage();
 			XA_FREE(m_atlasImages[i]);
@@ -8475,8 +7778,7 @@ struct Atlas
 	const Array<AtlasImage *> &getImages() const { return m_atlasImages; }
 	float getUtilization(uint32_t atlas) const { return m_utilization[atlas]; }
 
-	void addCharts(TaskScheduler *taskScheduler, param::Atlas *paramAtlas)
-	{
+	void addCharts(TaskScheduler *taskScheduler, param::Atlas *paramAtlas) {
 		// Count charts.
 		uint32_t chartCount = 0;
 		for (uint32_t i = 0; i < paramAtlas->meshCount(); i++) {
@@ -8489,11 +7791,11 @@ struct Atlas
 		if (chartCount == 0)
 			return;
 		// Run one task per chart.
+		ThreadLocal<BoundingBox2D> boundingBox;
+		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(&boundingBox, chartCount);
 		Array<AddChartTaskArgs> taskArgs;
 		taskArgs.resize(chartCount);
-		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartCount);
 		uint32_t chartIndex = 0;
-		ThreadLocal<BoundingBox2D> boundingBox;
 		for (uint32_t i = 0; i < paramAtlas->meshCount(); i++) {
 			const uint32_t chartGroupsCount = paramAtlas->chartGroupCount(i);
 			for (uint32_t j = 0; j < chartGroupsCount; j++) {
@@ -8501,7 +7803,6 @@ struct Atlas
 				const uint32_t count = chartGroup->chartCount();
 				for (uint32_t k = 0; k < count; k++) {
 					AddChartTaskArgs &args = taskArgs[chartIndex];
-					args.boundingBox = &boundingBox;
 					args.paramChart = chartGroup->chartAt(k);
 					Task task;
 					task.userData = &taskArgs[chartIndex];
@@ -8518,8 +7819,10 @@ struct Atlas
 			m_charts[i] = taskArgs[i].chart;
 	}
 
-	void addUvMeshCharts(UvMeshInstance *mesh)
-	{
+	void addUvMeshCharts(UvMeshInstance *mesh) {
+		// Copy texcoords from mesh.
+		mesh->texcoords.resize(mesh->mesh->texcoords.size());
+		memcpy(mesh->texcoords.data(), mesh->mesh->texcoords.data(), mesh->texcoords.size() * sizeof(Vector2));
 		BitArray vertexUsed(mesh->texcoords.size());
 		BoundingBox2D boundingBox;
 		for (uint32_t c = 0; c < mesh->mesh->charts.size(); c++) {
@@ -8527,17 +7830,14 @@ struct Atlas
 			Chart *chart = XA_NEW(MemTag::Default, Chart);
 			chart->atlasIndex = -1;
 			chart->material = uvChart->material;
-			chart->indexCount = uvChart->indices.size();
-			chart->indices = uvChart->indices.data();
-			chart->vertices = mesh->texcoords.data();
-			chart->vertexCount = mesh->texcoords.size();
-			chart->allowRotate = mesh->rotateCharts;
+			chart->indices = uvChart->indices;
+			chart->vertices = mesh->texcoords;
 			chart->boundaryEdges = nullptr;
 			chart->faces.resize(uvChart->faces.size());
 			memcpy(chart->faces.data(), uvChart->faces.data(), sizeof(uint32_t) * uvChart->faces.size());
 			// Find unique vertices.
 			vertexUsed.zeroOutMemory();
-			for (uint32_t i = 0; i < chart->indexCount; i++) {
+			for (uint32_t i = 0; i < chart->indices.length; i++) {
 				const uint32_t vertex = chart->indices[i];
 				if (!vertexUsed.get(vertex)) {
 					vertexUsed.set(vertex);
@@ -8546,14 +7846,13 @@ struct Atlas
 			}
 			// Compute parametric and surface areas.
 			chart->parametricArea = 0.0f;
-			for (uint32_t f = 0; f < chart->indexCount / 3; f++) {
+			for (uint32_t f = 0; f < chart->indices.length / 3; f++) {
 				const Vector2 &v1 = chart->vertices[chart->indices[f * 3 + 0]];
 				const Vector2 &v2 = chart->vertices[chart->indices[f * 3 + 1]];
 				const Vector2 &v3 = chart->vertices[chart->indices[f * 3 + 2]];
 				chart->parametricArea += fabsf(triangleArea(v1, v2, v3));
 			}
 			chart->parametricArea *= 0.5f;
-			chart->surfaceArea = chart->parametricArea; // Identical for UV meshes.
 			if (chart->parametricArea < kAreaEpsilon) {
 				// When the parametric area is too small we use a rough approximation to prevent divisions by very small numbers.
 				Vector2 minCorner(FLT_MAX, FLT_MAX);
@@ -8565,6 +7864,9 @@ struct Atlas
 				const Vector2 bounds = (maxCorner - minCorner) * 0.5f;
 				chart->parametricArea = bounds.x * bounds.y;
 			}
+			XA_DEBUG_ASSERT(isFinite(chart->parametricArea));
+			XA_DEBUG_ASSERT(!isNan(chart->parametricArea));
+			chart->surfaceArea = chart->parametricArea; // Identical for UV meshes.
 			// Compute bounding box of chart.
 			// Using all unique vertices for simplicity, can compute real boundaries if this is too slow.
 			boundingBox.clear();
@@ -8580,8 +7882,7 @@ struct Atlas
 	}
 
 	// Pack charts in the smallest possible rectangle.
-	bool packCharts(const PackOptions &options, ProgressFunc progressFunc, void *progressUserData)
-	{
+	bool packCharts(const PackOptions &options, ProgressFunc progressFunc, void *progressUserData) {
 		if (progressFunc) {
 			if (!progressFunc(ProgressCategory::PackCharts, 0, progressUserData))
 				return false;
@@ -8627,19 +7928,19 @@ struct Atlas
 			// Compute chart scale
 			float scale = 1.0f;
 			if (chart->parametricArea != 0.0f) {
-				scale = (chart->surfaceArea / chart->parametricArea) * m_texelsPerUnit;
+				scale = sqrtf(chart->surfaceArea / chart->parametricArea) * m_texelsPerUnit;
 				XA_ASSERT(isFinite(scale));
 			}
 			// Translate, rotate and scale vertices. Compute extents.
 			Vector2 minCorner(FLT_MAX, FLT_MAX);
-			if (!chart->allowRotate) {
+			if (!options.rotateChartsToAxis) {
 				for (uint32_t i = 0; i < chart->uniqueVertexCount(); i++)
 					minCorner = min(minCorner, chart->uniqueVertexAt(i));
 			}
 			Vector2 extents(0.0f);
 			for (uint32_t i = 0; i < chart->uniqueVertexCount(); i++) {
 				Vector2 &texcoord = chart->uniqueVertexAt(i);
-				if (chart->allowRotate) {
+				if (options.rotateChartsToAxis) {
 					const float x = dot(texcoord, chart->majorAxis);
 					const float y = dot(texcoord, chart->minorAxis);
 					texcoord.x = x;
@@ -8750,27 +8051,27 @@ struct Atlas
 			// Resize and clear (discard = true) chart images.
 			// Leave room for padding at extents.
 			chartImage.resize(ftoi_ceil(chartExtents[c].x) + options.padding, ftoi_ceil(chartExtents[c].y) + options.padding, true);
-			if (chart->allowRotate)
+			if (options.rotateCharts)
 				chartImageRotated.resize(chartImage.height(), chartImage.width(), true);
 			if (options.bilinear) {
 				chartImageBilinear.resize(chartImage.width(), chartImage.height(), true);
-				if (chart->allowRotate)
+				if (options.rotateCharts)
 					chartImageBilinearRotated.resize(chartImage.height(), chartImage.width(), true);
 			}
 			// Rasterize chart faces.
-			const uint32_t faceCount = chart->indexCount / 3;
+			const uint32_t faceCount = chart->indices.length / 3;
 			for (uint32_t f = 0; f < faceCount; f++) {
 				Vector2 vertices[3];
 				for (uint32_t v = 0; v < 3; v++)
 					vertices[v] = chart->vertices[chart->indices[f * 3 + v]];
 				DrawTriangleCallbackArgs args;
 				args.chartBitImage = &chartImage;
-				args.chartBitImageRotated = chart->allowRotate ? &chartImageRotated : nullptr;
+				args.chartBitImageRotated = options.rotateCharts ? &chartImageRotated : nullptr;
 				raster::drawTriangle(Vector2((float)chartImage.width(), (float)chartImage.height()), vertices, drawTriangleCallback, &args);
 			}
 			// Expand chart by pixels sampled by bilinear interpolation.
 			if (options.bilinear)
-				bilinearExpand(chart, &chartImage, &chartImageBilinear, chart->allowRotate ? &chartImageBilinearRotated : nullptr, boundaryEdgeGrid);
+				bilinearExpand(chart, &chartImage, &chartImageBilinear, options.rotateCharts ? &chartImageBilinearRotated : nullptr, boundaryEdgeGrid);
 			// Expand chart by padding pixels (dilation).
 			if (options.padding > 0) {
 				// Copy into the same BitImage instances for every chart to avoid reallocating BitImage buffers (largest chart is packed first).
@@ -8780,7 +8081,7 @@ struct Atlas
 				else
 					chartImage.copyTo(chartImagePadding);
 				chartImagePadding.dilate(options.padding);
-				if (chart->allowRotate) {
+				if (options.rotateCharts) {
 					if (options.bilinear)
 						chartImageBilinearRotated.copyTo(chartImagePaddingRotated);
 					else
@@ -8815,23 +8116,25 @@ struct Atlas
 			int best_x = 0, best_y = 0;
 			int best_cw = 0, best_ch = 0;
 			int best_r = 0;
-			for (;;)
-			{
+			for (;;) {
+#if XA_DEBUG
 				bool firstChartInBitImage = false;
-				XA_UNUSED(firstChartInBitImage);
+#endif
 				if (currentAtlas + 1 > m_bitImages.size()) {
 					// Chart doesn't fit in the current bitImage, create a new one.
 					BitImage *bi = XA_NEW_ARGS(MemTag::Default, BitImage, resolution, resolution);
 					m_bitImages.push_back(bi);
 					atlasSizes.push_back(Vector2i(0, 0));
+#if XA_DEBUG
 					firstChartInBitImage = true;
+#endif
 					if (createImage)
 						m_atlasImages.push_back(XA_NEW_ARGS(MemTag::Default, AtlasImage, resolution, resolution));
 					// Start positions are per-atlas, so create a new one of those too.
 					chartStartPositions.push_back(Vector2i(0, 0));
 				}
 				XA_PROFILE_START(packChartsFindLocation)
-				const bool foundLocation = findChartLocation(chartStartPositions[currentAtlas], options.bruteForce, m_bitImages[currentAtlas], chartImageToPack, chartImageToPackRotated, atlasSizes[currentAtlas].x, atlasSizes[currentAtlas].y, &best_x, &best_y, &best_cw, &best_ch, &best_r, options.blockAlign, maxResolution, chart->allowRotate);
+				const bool foundLocation = findChartLocation(options, chartStartPositions[currentAtlas], m_bitImages[currentAtlas], chartImageToPack, chartImageToPackRotated, atlasSizes[currentAtlas].x, atlasSizes[currentAtlas].y, &best_x, &best_y, &best_cw, &best_ch, &best_r, maxResolution);
 				XA_PROFILE_END(packChartsFindLocation)
 				XA_DEBUG_ASSERT(!(firstChartInBitImage && !foundLocation)); // Chart doesn't fit in an empty, newly allocated bitImage. Shouldn't happen, since charts are resized if they are too big to fit in the atlas.
 				if (maxResolution == 0) {
@@ -8849,8 +8152,7 @@ struct Atlas
 				if (best_x + best_cw > atlasSizes[currentAtlas].x || best_y + best_ch > atlasSizes[currentAtlas].y) {
 					for (uint32_t j = 0; j < chartStartPositions.size(); j++)
 						chartStartPositions[j] = Vector2i(0, 0);
-				}
-				else {
+				} else {
 					chartStartPositions[currentAtlas] = Vector2i(best_x, best_y);
 				}
 			}
@@ -8897,7 +8199,7 @@ struct Atlas
 				Vector2 &texcoord = chart->uniqueVertexAt(v);
 				Vector2 t = texcoord;
 				if (best_r) {
-					XA_DEBUG_ASSERT(chart->allowRotate);
+					XA_DEBUG_ASSERT(options.rotateCharts);
 					swap(t.x, t.y);
 				}
 				texcoord.x = best_x + t.x;
@@ -8938,8 +8240,7 @@ struct Atlas
 			}
 			if (m_utilization.size() > 1) {
 				XA_PRINT("   %u: %f%% utilization\n", i, m_utilization[i] * 100.0f);
-			}
-			else {
+			} else {
 				XA_PRINT("   %f%% utilization\n", m_utilization[i] * 100.0f);
 			}
 		}
@@ -8958,28 +8259,22 @@ struct Atlas
 	}
 
 private:
-	// IC: Brute force is slow, and random may take too much time to converge. We start inserting large charts in a small atlas. Using brute force is lame, because most of the space
-	// is occupied at this point. At the end we have many small charts and a large atlas with sparse holes. Finding those holes randomly is slow. A better approach would be to
-	// start stacking large charts as if they were tetris pieces. Once charts get small try to place them randomly. It may be interesting to try a intermediate strategy, first try
-	// along one axis and then try exhaustively along that axis.
-	bool findChartLocation(const Vector2i &startPosition, bool bruteForce, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, bool blockAligned, uint32_t maxResolution, bool allowRotate)
-	{
+	bool findChartLocation(const PackOptions &options, const Vector2i &startPosition, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, uint32_t maxResolution) {
 		const int attempts = 4096;
-		if (bruteForce || attempts >= w * h)
-			return findChartLocation_bruteForce(startPosition, atlasBitImage, chartBitImage, chartBitImageRotated, w, h, best_x, best_y, best_w, best_h, best_r, blockAligned, maxResolution, allowRotate);
-		return findChartLocation_random(atlasBitImage, chartBitImage, chartBitImageRotated, w, h, best_x, best_y, best_w, best_h, best_r, attempts, blockAligned, maxResolution, allowRotate);
+		if (options.bruteForce || attempts >= w * h)
+			return findChartLocation_bruteForce(options, startPosition, atlasBitImage, chartBitImage, chartBitImageRotated, w, h, best_x, best_y, best_w, best_h, best_r, maxResolution);
+		return findChartLocation_random(options, atlasBitImage, chartBitImage, chartBitImageRotated, w, h, best_x, best_y, best_w, best_h, best_r, attempts, maxResolution);
 	}
 
-	bool findChartLocation_bruteForce(const Vector2i &startPosition, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, bool blockAligned, uint32_t maxResolution, bool allowRotate)
-	{
-		const int stepSize = blockAligned ? 4 : 1;
+	bool findChartLocation_bruteForce(const PackOptions &options, const Vector2i &startPosition, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, uint32_t maxResolution) {
+		const int stepSize = options.blockAlign ? 4 : 1;
 		int best_metric = INT_MAX;
 		// Try two different orientations.
 		for (int r = 0; r < 2; r++) {
 			int cw = chartBitImage->width();
 			int ch = chartBitImage->height();
 			if (r == 1) {
-				if (allowRotate)
+				if (options.rotateCharts)
 					swap(cw, ch);
 				else
 					break;
@@ -9016,15 +8311,14 @@ private:
 		return best_metric != INT_MAX;
 	}
 
-	bool findChartLocation_random(const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, int minTrialCount, bool blockAligned, uint32_t maxResolution, bool allowRotate)
-	{
+	bool findChartLocation_random(const PackOptions &options, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, int attempts, uint32_t maxResolution) {
 		bool result = false;
 		const int BLOCK_SIZE = 4;
 		int best_metric = INT_MAX;
-		for (int i = 0; i < minTrialCount; i++) {
+		for (int i = 0; i < attempts; i++) {
 			int cw = chartBitImage->width();
 			int ch = chartBitImage->height();
-			int r = allowRotate ? m_rand.getRange(1) : 0;
+			int r = options.rotateCharts ? m_rand.getRange(1) : 0;
 			if (r == 1)
 				swap(cw, ch);
 			// + 1 to extend atlas in case atlas full. We may want to use a higher number to increase probability of extending atlas.
@@ -9037,7 +8331,7 @@ private:
 			}
 			int x = m_rand.getRange(xRange);
 			int y = m_rand.getRange(yRange);
-			if (blockAligned) {
+			if (options.blockAlign) {
 				x = align(x, BLOCK_SIZE);
 				y = align(y, BLOCK_SIZE);
 				if (maxResolution > 0 && (x > (int)maxResolution - cw || y > (int)maxResolution - ch))
@@ -9062,7 +8356,7 @@ private:
 				*best_y = y;
 				*best_w = cw;
 				*best_h = ch;
-				*best_r = allowRotate ? r : 0;
+				*best_r = options.rotateCharts ? r : 0;
 				if (area == w * h) {
 					// Chart is completely inside, do not look at any other location.
 					break;
@@ -9072,8 +8366,7 @@ private:
 		return result;
 	}
 
-	void addChart(BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int atlas_w, int atlas_h, int offset_x, int offset_y, int r)
-	{
+	void addChart(BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int atlas_w, int atlas_h, int offset_x, int offset_y, int r) {
 		XA_DEBUG_ASSERT(r == 0 || r == 1);
 		const BitImage *image = r == 0 ? chartBitImage : chartBitImageRotated;
 		const int w = image->width();
@@ -9096,15 +8389,14 @@ private:
 		}
 	}
 
-	void bilinearExpand(const Chart *chart, BitImage *source, BitImage *dest, BitImage *destRotated, UniformGrid2 &boundaryEdgeGrid) const
-	{
+	void bilinearExpand(const Chart *chart, BitImage *source, BitImage *dest, BitImage *destRotated, UniformGrid2 &boundaryEdgeGrid) const {
 		boundaryEdgeGrid.reset(chart->vertices, chart->indices);
 		if (chart->boundaryEdges) {
 			const uint32_t edgeCount = chart->boundaryEdges->size();
 			for (uint32_t i = 0; i < edgeCount; i++)
 				boundaryEdgeGrid.append((*chart->boundaryEdges)[i]);
 		} else {
-			for (uint32_t i = 0; i < chart->indexCount; i++)
+			for (uint32_t i = 0; i < chart->indices.length; i++)
 				boundaryEdgeGrid.append(i);
 		}
 		const int xOffsets[] = { -1, 0, 1, -1, 1, -1, 0, 1 };
@@ -9152,13 +8444,11 @@ private:
 		}
 	}
 
-	struct DrawTriangleCallbackArgs
-	{
+	struct DrawTriangleCallbackArgs {
 		BitImage *chartBitImage, *chartBitImageRotated;
 	};
 
-	static bool drawTriangleCallback(void *param, int x, int y)
-	{
+	static bool drawTriangleCallback(void *param, int x, int y) {
 		auto args = (DrawTriangleCallbackArgs *)param;
 		args->chartBitImage->set(x, y);
 		if (args->chartBitImageRotated)
@@ -9180,8 +8470,14 @@ private:
 } // namespace pack
 } // namespace internal
 
-struct Context
-{
+// Used to map triangulated polygons back to polygons.
+struct MeshPolygonMapping {
+	internal::Array<uint8_t> faceVertexCount; // Copied from MeshDecl::faceVertexCount.
+	internal::Array<uint32_t> triangleToPolygonMap; // Triangle index (mesh face index) to polygon index.
+	internal::Array<uint32_t> triangleToPolygonIndicesMap; // Triangle indices to polygon indices.
+};
+
+struct Context {
 	Atlas atlas;
 	internal::Progress *addMeshProgress = nullptr;
 	internal::TaskGroupHandle addMeshTaskGroup;
@@ -9190,20 +8486,20 @@ struct Context
 	void *progressUserData = nullptr;
 	internal::TaskScheduler *taskScheduler;
 	internal::Array<internal::Mesh *> meshes;
+	internal::Array<MeshPolygonMapping *> meshPolygonMappings;
 	internal::Array<internal::UvMesh *> uvMeshes;
 	internal::Array<internal::UvMeshInstance *> uvMeshInstances;
+	bool uvMeshChartsComputed = false;
 };
 
-Atlas *Create()
-{
+Atlas *Create() {
 	Context *ctx = XA_NEW(internal::MemTag::Default, Context);
 	memset(&ctx->atlas, 0, sizeof(Atlas));
 	ctx->taskScheduler = XA_NEW(internal::MemTag::Default, internal::TaskScheduler);
 	return &ctx->atlas;
 }
 
-static void DestroyOutputMeshes(Context *ctx)
-{
+static void DestroyOutputMeshes(Context *ctx) {
 	if (!ctx->atlas.meshes)
 		return;
 	for (int i = 0; i < (int)ctx->atlas.meshCount; i++) {
@@ -9224,8 +8520,7 @@ static void DestroyOutputMeshes(Context *ctx)
 	ctx->atlas.meshes = nullptr;
 }
 
-void Destroy(Atlas *atlas)
-{
+void Destroy(Atlas *atlas) {
 	XA_DEBUG_ASSERT(atlas);
 	Context *ctx = (Context *)atlas;
 	if (atlas->utilization)
@@ -9244,6 +8539,13 @@ void Destroy(Atlas *atlas)
 		mesh->~Mesh();
 		XA_FREE(mesh);
 	}
+	for (uint32_t i = 0; i < ctx->meshPolygonMappings.size(); i++) {
+		MeshPolygonMapping *mapping = ctx->meshPolygonMappings[i];
+		if (mapping) {
+			mapping->~MeshPolygonMapping();
+			XA_FREE(mapping);
+		}
+	}
 	for (uint32_t i = 0; i < ctx->uvMeshes.size(); i++) {
 		internal::UvMesh *mesh = ctx->uvMeshes[i];
 		for (uint32_t j = 0; j < mesh->charts.size(); j++) {
@@ -9265,66 +8567,52 @@ void Destroy(Atlas *atlas)
 #endif
 }
 
-struct AddMeshTaskArgs
-{
-	Context *ctx;
-	internal::Mesh *mesh;
-};
-
-static void runAddMeshTask(void *userData)
-{
+static void runAddMeshTask(void *groupUserData, void *taskUserData) {
 	XA_PROFILE_START(addMeshThread)
-	auto args = (AddMeshTaskArgs *)userData; // Responsible for freeing this.
-	internal::Mesh *mesh = args->mesh;
-	internal::Progress *progress = args->ctx->addMeshProgress;
-	if (progress->cancel)
-		goto cleanup;
-	{
-		XA_PROFILE_START(addMeshCreateColocals)
-		mesh->createColocals();
-		XA_PROFILE_END(addMeshCreateColocals)
+	auto ctx = (Context *)groupUserData;
+	auto mesh = (internal::Mesh *)taskUserData;
+	internal::Progress *progress = ctx->addMeshProgress;
+	if (progress->cancel) {
+		XA_PROFILE_END(addMeshThread)
+		return;
 	}
-	if (progress->cancel)
-		goto cleanup;
-	progress->value++;
-	progress->update();
-cleanup:
-	args->~AddMeshTaskArgs();
-	XA_FREE(args);
+	XA_PROFILE_START(addMeshCreateColocals)
+	mesh->createColocals();
+	XA_PROFILE_END(addMeshCreateColocals)
+	if (progress->cancel) {
+		XA_PROFILE_END(addMeshThread)
+		return;
+	}
+	progress->increment(1);
 	XA_PROFILE_END(addMeshThread)
 }
 
-static internal::Vector3 DecodePosition(const MeshDecl &meshDecl, uint32_t index)
-{
+static internal::Vector3 DecodePosition(const MeshDecl &meshDecl, uint32_t index) {
 	XA_DEBUG_ASSERT(meshDecl.vertexPositionData);
 	XA_DEBUG_ASSERT(meshDecl.vertexPositionStride > 0);
 	return *((const internal::Vector3 *)&((const uint8_t *)meshDecl.vertexPositionData)[meshDecl.vertexPositionStride * index]);
 }
 
-static internal::Vector3 DecodeNormal(const MeshDecl &meshDecl, uint32_t index)
-{
+static internal::Vector3 DecodeNormal(const MeshDecl &meshDecl, uint32_t index) {
 	XA_DEBUG_ASSERT(meshDecl.vertexNormalData);
 	XA_DEBUG_ASSERT(meshDecl.vertexNormalStride > 0);
 	return *((const internal::Vector3 *)&((const uint8_t *)meshDecl.vertexNormalData)[meshDecl.vertexNormalStride * index]);
 }
 
-static internal::Vector2 DecodeUv(const MeshDecl &meshDecl, uint32_t index)
-{
+static internal::Vector2 DecodeUv(const MeshDecl &meshDecl, uint32_t index) {
 	XA_DEBUG_ASSERT(meshDecl.vertexUvData);
 	XA_DEBUG_ASSERT(meshDecl.vertexUvStride > 0);
 	return *((const internal::Vector2 *)&((const uint8_t *)meshDecl.vertexUvData)[meshDecl.vertexUvStride * index]);
 }
 
-static uint32_t DecodeIndex(IndexFormat::Enum format, const void *indexData, int32_t offset, uint32_t i)
-{
+static uint32_t DecodeIndex(IndexFormat format, const void *indexData, int32_t offset, uint32_t i) {
 	XA_DEBUG_ASSERT(indexData);
 	if (format == IndexFormat::UInt16)
 		return uint16_t((int32_t)((const uint16_t *)indexData)[i] + offset);
 	return uint32_t((int32_t)((const uint32_t *)indexData)[i] + offset);
 }
 
-AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t meshCountHint)
-{
+AddMeshError AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t meshCountHint) {
 	XA_DEBUG_ASSERT(atlas);
 	if (!atlas) {
 		XA_PRINT_WARNING("AddMesh: atlas is null.\n");
@@ -9337,33 +8625,36 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 	}
 #if XA_PROFILE
 	if (ctx->meshes.isEmpty())
-		internal::s_profile.addMeshReal = clock();
+		internal::s_profile.addMeshRealStart = std::chrono::high_resolution_clock::now();
 #endif
 	// Don't know how many times AddMesh will be called, so progress needs to adjusted each time.
 	if (!ctx->addMeshProgress) {
 		ctx->addMeshProgress = XA_NEW_ARGS(internal::MemTag::Default, internal::Progress, ProgressCategory::AddMesh, ctx->progressFunc, ctx->progressUserData, 1);
-	}
-	else {
+	} else {
 		ctx->addMeshProgress->setMaxValue(internal::max(ctx->meshes.size() + 1, meshCountHint));
 	}
 	XA_PROFILE_START(addMeshCopyData)
 	const bool hasIndices = meshDecl.indexCount > 0;
 	const uint32_t indexCount = hasIndices ? meshDecl.indexCount : meshDecl.vertexCount;
-	XA_PRINT("Adding mesh %d: %u vertices, %u triangles\n", ctx->meshes.size(), meshDecl.vertexCount, indexCount / 3);
-	// Expecting triangle faces.
-	if ((indexCount % 3) != 0)
-		return AddMeshError::InvalidIndexCount;
-	if (hasIndices) {
-		// Check if any index is out of range.
-		for (uint32_t i = 0; i < indexCount; i++) {
-			const uint32_t index = DecodeIndex(meshDecl.indexFormat, meshDecl.indexData, meshDecl.indexOffset, i);
-			if (index >= meshDecl.vertexCount)
-				return AddMeshError::IndexOutOfRange;
+	uint32_t faceCount = indexCount / 3;
+	if (meshDecl.faceVertexCount) {
+		faceCount = meshDecl.faceCount;
+		XA_PRINT("Adding mesh %d: %u vertices, %u polygons\n", ctx->meshes.size(), meshDecl.vertexCount, faceCount);
+		for (uint32_t f = 0; f < faceCount; f++) {
+			if (meshDecl.faceVertexCount[f] < 3)
+				return AddMeshError::InvalidFaceVertexCount;
 		}
+	} else {
+		XA_PRINT("Adding mesh %d: %u vertices, %u triangles\n", ctx->meshes.size(), meshDecl.vertexCount, faceCount);
+		// Expecting triangle faces unless otherwise specified.
+		if ((indexCount % 3) != 0)
+			return AddMeshError::InvalidIndexCount;
 	}
 	uint32_t meshFlags = internal::MeshFlags::HasIgnoredFaces;
 	if (meshDecl.vertexNormalData)
 		meshFlags |= internal::MeshFlags::HasNormals;
+	if (meshDecl.faceMaterialData)
+		meshFlags |= internal::MeshFlags::HasMaterials;
 	internal::Mesh *mesh = XA_NEW_ARGS(internal::MemTag::Mesh, internal::Mesh, meshDecl.epsilon, meshDecl.vertexCount, indexCount / 3, meshFlags, ctx->meshes.size());
 	for (uint32_t i = 0; i < meshDecl.vertexCount; i++) {
 		internal::Vector3 normal(0.0f);
@@ -9374,17 +8665,42 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 			texcoord = DecodeUv(meshDecl, i);
 		mesh->addVertex(DecodePosition(meshDecl, i), normal, texcoord);
 	}
+	MeshPolygonMapping *meshPolygonMapping = nullptr;
+	if (meshDecl.faceVertexCount) {
+		meshPolygonMapping = XA_NEW(internal::MemTag::Default, MeshPolygonMapping);
+		// Copy MeshDecl::faceVertexCount so it can be used later when building output meshes.
+		meshPolygonMapping->faceVertexCount.copyFrom(meshDecl.faceVertexCount, meshDecl.faceCount);
+		// There should be at least as many triangles as polygons.
+		meshPolygonMapping->triangleToPolygonMap.reserve(meshDecl.faceCount);
+		meshPolygonMapping->triangleToPolygonIndicesMap.reserve(meshDecl.indexCount);
+	}
 	const uint32_t kMaxWarnings = 50;
 	uint32_t warningCount = 0;
-	for (uint32_t i = 0; i < indexCount / 3; i++) {
-		uint32_t tri[3];
-		for (int j = 0; j < 3; j++)
-			tri[j] = hasIndices ? DecodeIndex(meshDecl.indexFormat, meshDecl.indexData, meshDecl.indexOffset, i * 3 + j) : i * 3 + j;
+	internal::Array<uint32_t> triIndices;
+	uint32_t firstFaceIndex = 0;
+	internal::Triangulator triangulator;
+	for (uint32_t face = 0; face < faceCount; face++) {
+		// Decode face indices. firstFaceIndex is the running offset of this face's first index, since faces may differ in vertex count.
+		const uint32_t faceVertexCount = meshDecl.faceVertexCount ? (uint32_t)meshDecl.faceVertexCount[face] : 3;
+		uint32_t polygon[UINT8_MAX];
+		for (uint32_t i = 0; i < faceVertexCount; i++) {
+			if (hasIndices) {
+				polygon[i] = DecodeIndex(meshDecl.indexFormat, meshDecl.indexData, meshDecl.indexOffset, firstFaceIndex + i);
+				// Check if any index is out of range.
+				if (polygon[i] >= meshDecl.vertexCount) {
+					mesh->~Mesh();
+					XA_FREE(mesh);
+					return AddMeshError::IndexOutOfRange;
+				}
+			} else {
+				polygon[i] = firstFaceIndex + i;
+			}
+		}
+		// Ignore faces with degenerate or zero length edges. Each vertex is paired with its successor, wrapping around at the last vertex.
 		bool ignore = false;
-		// Check for degenerate or zero length edges.
-		for (int j = 0; j < 3; j++) {
-			const uint32_t index1 = tri[j];
-			const uint32_t index2 = tri[(j + 1) % 3];
+		for (uint32_t i = 0; i < faceVertexCount; i++) {
+			const uint32_t index1 = polygon[i];
+			const uint32_t index2 = polygon[(i + 1) % faceVertexCount];
 			if (index1 == index2) {
 				ignore = true;
 				if (++warningCount <= kMaxWarnings)
@@ -9402,119 +8718,136 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 		}
 		// Ignore faces with any nan vertex attributes.
 		if (!ignore) {
-			for (int j = 0; j < 3; j++) {
-				const internal::Vector3 &pos = mesh->position(tri[j]);
+			for (uint32_t i = 0; i < faceVertexCount; i++) {
+				const internal::Vector3 &pos = mesh->position(polygon[i]);
 				if (internal::isNan(pos.x) || internal::isNan(pos.y) || internal::isNan(pos.z)) {
 					if (++warningCount <= kMaxWarnings)
-						XA_PRINT("   NAN position in face: %d\n", i);
+						XA_PRINT("   NAN position in face: %d\n", face);
 					ignore = true;
 					break;
 				}
 				if (meshDecl.vertexNormalData) {
-					const internal::Vector3 &normal = mesh->normal(tri[j]);
+					const internal::Vector3 &normal = mesh->normal(polygon[i]);
 					if (internal::isNan(normal.x) || internal::isNan(normal.y) || internal::isNan(normal.z)) {
 						if (++warningCount <= kMaxWarnings)
-							XA_PRINT("   NAN normal in face: %d\n", i);
+							XA_PRINT("   NAN normal in face: %d\n", face);
 						ignore = true;
 						break;
 					}
 				}
 				if (meshDecl.vertexUvData) {
-					const internal::Vector2 &uv = mesh->texcoord(tri[j]);
+					const internal::Vector2 &uv = mesh->texcoord(polygon[i]);
 					if (internal::isNan(uv.x) || internal::isNan(uv.y)) {
 						if (++warningCount <= kMaxWarnings)
-							XA_PRINT("   NAN texture coordinate in face: %d\n", i);
+							XA_PRINT("   NAN texture coordinate in face: %d\n", face);
 						ignore = true;
 						break;
 					}
 				}
 			}
 		}
-		const internal::Vector3 &a = mesh->position(tri[0]);
-		const internal::Vector3 &b = mesh->position(tri[1]);
-		const internal::Vector3 &c = mesh->position(tri[2]);
-		// Check for zero area faces.
-		float area = 0.0f;
-		if (!ignore) {
-			area = internal::length(internal::cross(b - a, c - a)) * 0.5f;
-			if (area <= internal::kAreaEpsilon) {
-				ignore = true;
-				if (++warningCount <= kMaxWarnings)
-					XA_PRINT("   Zero area face: %d, indices (%d %d %d), area is %f\n", i, tri[0], tri[1], tri[2], area);
-			}
+		// Triangulate if necessary.
+		triIndices.clear();
+		if (faceVertexCount == 3) {
+			triIndices.push_back(polygon[0]);
+			triIndices.push_back(polygon[1]);
+			triIndices.push_back(polygon[2]);
+		} else {
+			triangulator.triangulatePolygon(mesh->positions(), internal::ConstArrayView<uint32_t>(polygon, faceVertexCount), triIndices);
 		}
+		// Check for zero area faces.
 		if (!ignore) {
-			if (internal::equal(a, b, meshDecl.epsilon) || internal::equal(a, c, meshDecl.epsilon) || internal::equal(b, c, meshDecl.epsilon)) {
-				ignore = true;
-				if (++warningCount <= kMaxWarnings)
-					XA_PRINT("   Degenerate face: %d, area is %f\n", i, area);
+			for (uint32_t i = 0; i < triIndices.size(); i += 3) {
+				const internal::Vector3 &a = mesh->position(triIndices[i + 0]);
+				const internal::Vector3 &b = mesh->position(triIndices[i + 1]);
+				const internal::Vector3 &c = mesh->position(triIndices[i + 2]);
+				const float area = internal::length(internal::cross(b - a, c - a)) * 0.5f;
+				if (area <= internal::kAreaEpsilon) {
+					ignore = true;
+					if (++warningCount <= kMaxWarnings)
+						XA_PRINT("   Zero area face: %d, area is %f\n", face, area);
+					break;
+				}
 			}
 		}
-		if (meshDecl.faceIgnoreData && meshDecl.faceIgnoreData[i])
+		// User face ignore.
+		if (meshDecl.faceIgnoreData && meshDecl.faceIgnoreData[face])
 			ignore = true;
-		mesh->addFace(tri[0], tri[1], tri[2], ignore);
+		// User material.
+		uint32_t material = UINT32_MAX;
+		if (meshDecl.faceMaterialData)
+			material = meshDecl.faceMaterialData[face];
+		// Add the face(s).
+		for (uint32_t i = 0; i < triIndices.size(); i += 3) {
+			mesh->addFace(&triIndices[i], ignore, material);
+			if (meshPolygonMapping)
+				meshPolygonMapping->triangleToPolygonMap.push_back(face);
+		}
+		if (meshPolygonMapping) {
+			for (uint32_t i = 0; i < triIndices.size(); i++)
+				meshPolygonMapping->triangleToPolygonIndicesMap.push_back(triIndices[i]);
+		}
+		firstFaceIndex += faceVertexCount;
 	}
 	if (warningCount > kMaxWarnings)
 		XA_PRINT("   %u additional warnings truncated\n", warningCount - kMaxWarnings);
 	XA_PROFILE_END(addMeshCopyData)
 	ctx->meshes.push_back(mesh);
+	ctx->meshPolygonMappings.push_back(meshPolygonMapping);
 	ctx->paramAtlas.addMesh(mesh);
 	if (ctx->addMeshTaskGroup.value == UINT32_MAX)
-		ctx->addMeshTaskGroup = ctx->taskScheduler->createTaskGroup();
-	AddMeshTaskArgs *taskArgs = XA_NEW(internal::MemTag::Default, AddMeshTaskArgs); // The task frees this.
-	taskArgs->ctx = ctx;
-	taskArgs->mesh = mesh;
+		ctx->addMeshTaskGroup = ctx->taskScheduler->createTaskGroup(ctx);
 	internal::Task task;
-	task.userData = taskArgs;
+	task.userData = mesh;
 	task.func = runAddMeshTask;
 	ctx->taskScheduler->run(ctx->addMeshTaskGroup, task);
 	return AddMeshError::Success;
 }
 
-void AddMeshJoin(Atlas *atlas)
-{
+void AddMeshJoin(Atlas *atlas) {
 	XA_DEBUG_ASSERT(atlas);
 	if (!atlas) {
 		XA_PRINT_WARNING("AddMeshJoin: atlas is null.\n");
 		return;
 	}
 	Context *ctx = (Context *)atlas;
-	if (!ctx->addMeshProgress)
-		return;
-	ctx->taskScheduler->wait(&ctx->addMeshTaskGroup);
-	ctx->addMeshProgress->~Progress();
-	XA_FREE(ctx->addMeshProgress);
-	ctx->addMeshProgress = nullptr;
+	if (!ctx->uvMeshes.isEmpty()) {
 #if XA_PROFILE
-	XA_PRINT("Added %u meshes\n", ctx->meshes.size());
-	internal::s_profile.addMeshReal = clock() - internal::s_profile.addMeshReal;
+		XA_PRINT("Added %u UV meshes\n", ctx->uvMeshes.size());
+		internal::s_profile.addMeshReal = uint64_t(std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - internal::s_profile.addMeshRealStart).count());
 #endif
-	XA_PROFILE_PRINT_AND_RESET("   Total (real): ", addMeshReal)
-	XA_PROFILE_PRINT_AND_RESET("      Copy data: ", addMeshCopyData)
-	XA_PROFILE_PRINT_AND_RESET("   Total (thread): ", addMeshThread)
-	XA_PROFILE_PRINT_AND_RESET("      Create colocals: ", addMeshCreateColocals)
+		XA_PROFILE_PRINT_AND_RESET("   Total: ", addMeshReal)
+		XA_PROFILE_PRINT_AND_RESET("      Copy data: ", addMeshCopyData)
 #if XA_PROFILE_ALLOC
-	XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
+		XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
 #endif
-	XA_PRINT_MEM_USAGE
+		XA_PRINT_MEM_USAGE
+	} else {
+		if (!ctx->addMeshProgress)
+			return;
+		ctx->taskScheduler->wait(&ctx->addMeshTaskGroup);
+		ctx->addMeshProgress->~Progress();
+		XA_FREE(ctx->addMeshProgress);
+		ctx->addMeshProgress = nullptr;
+#if XA_PROFILE
+		XA_PRINT("Added %u meshes\n", ctx->meshes.size());
+		internal::s_profile.addMeshReal = uint64_t(std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - internal::s_profile.addMeshRealStart).count());
+#endif
+		XA_PROFILE_PRINT_AND_RESET("   Total (real): ", addMeshReal)
+		XA_PROFILE_PRINT_AND_RESET("      Copy data: ", addMeshCopyData)
+		XA_PROFILE_PRINT_AND_RESET("   Total (thread): ", addMeshThread)
+		XA_PROFILE_PRINT_AND_RESET("      Create colocals: ", addMeshCreateColocals)
+#if XA_PROFILE_ALLOC
+		XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
+#endif
+		XA_PRINT_MEM_USAGE
 #if XA_DEBUG_EXPORT_OBJ_FACE_GROUPS
-	internal::param::s_faceGroupsCurrentVertex = 0;
+		internal::param::s_faceGroupsCurrentVertex = 0;
 #endif
+	}
 }
 
-struct EdgeKey
-{
-	EdgeKey() {}
-	EdgeKey(const EdgeKey &k) : v0(k.v0), v1(k.v1) {}
-	EdgeKey(uint32_t v0, uint32_t v1) : v0(v0), v1(v1) {}
-	bool operator==(const EdgeKey &k) const { return v0 == k.v0 && v1 == k.v1; }
-
-	uint32_t v0;
-	uint32_t v1;
-};
-
-AddMeshError::Enum AddUvMesh(Atlas *atlas, const UvMeshDecl &decl)
-{
+AddMeshError AddUvMesh(Atlas *atlas, const UvMeshDecl &decl) {
 	XA_DEBUG_ASSERT(atlas);
 	if (!atlas) {
 		XA_PRINT_WARNING("AddUvMesh: atlas is null.\n");
@@ -9525,13 +8858,18 @@ AddMeshError::Enum AddUvMesh(Atlas *atlas, const UvMeshDecl &decl)
 		XA_PRINT_WARNING("AddUvMesh: Meshes and UV meshes cannot be added to the same atlas.\n");
 		return AddMeshError::Error;
 	}
-	const bool decoded = (decl.indexCount <= 0);
-	const uint32_t indexCount = decoded ? decl.vertexCount : decl.indexCount;
+#if XA_PROFILE
+	if (ctx->uvMeshInstances.isEmpty())
+		internal::s_profile.addMeshRealStart = std::chrono::high_resolution_clock::now();
+#endif
+	XA_PROFILE_START(addMeshCopyData)
+	const bool hasIndices = decl.indexCount > 0;
+	const uint32_t indexCount = hasIndices ? decl.indexCount : decl.vertexCount;
 	XA_PRINT("Adding UV mesh %d: %u vertices, %u triangles\n", ctx->uvMeshes.size(), decl.vertexCount, indexCount / 3);
 	// Expecting triangle faces.
 	if ((indexCount % 3) != 0)
 		return AddMeshError::InvalidIndexCount;
-	if (!decoded) {
+	if (hasIndices) {
 		// Check if any index is out of range.
 		for (uint32_t i = 0; i < indexCount; i++) {
 			const uint32_t index = DecodeIndex(decl.indexFormat, decl.indexData, decl.indexOffset, i);
@@ -9539,319 +8877,266 @@ AddMeshError::Enum AddUvMesh(Atlas *atlas, const UvMeshDecl &decl)
 				return AddMeshError::IndexOutOfRange;
 		}
 	}
+	// Create a mesh instance.
 	internal::UvMeshInstance *meshInstance = XA_NEW(internal::MemTag::Default, internal::UvMeshInstance);
-	meshInstance->texcoords.resize(decl.vertexCount);
-	for (uint32_t i = 0; i < decl.vertexCount; i++) {
-		internal::Vector2 texcoord = *((const internal::Vector2 *)&((const uint8_t *)decl.vertexUvData)[decl.vertexStride * i]);
-		// Set nan values to 0.
-		if (internal::isNan(texcoord.x) || internal::isNan(texcoord.y))
-			texcoord.x = texcoord.y = 0.0f;
-		meshInstance->texcoords[i] = texcoord;
-	}
-	meshInstance->rotateCharts = decl.rotateCharts;
+	meshInstance->mesh = nullptr;
+	ctx->uvMeshInstances.push_back(meshInstance);
 	// See if this is an instance of an already existing mesh.
 	internal::UvMesh *mesh = nullptr;
 	for (uint32_t m = 0; m < ctx->uvMeshes.size(); m++) {
 		if (memcmp(&ctx->uvMeshes[m]->decl, &decl, sizeof(UvMeshDecl)) == 0) {
-			meshInstance->mesh = mesh = ctx->uvMeshes[m];
+			mesh = ctx->uvMeshes[m];
+			XA_PRINT("   instance of a previous UV mesh\n");
 			break;
 		}
 	}
 	if (!mesh) {
 		// Copy geometry to mesh.
-		meshInstance->mesh = mesh = XA_NEW(internal::MemTag::Default, internal::UvMesh);
+		mesh = XA_NEW(internal::MemTag::Default, internal::UvMesh);
+		ctx->uvMeshes.push_back(mesh);
 		mesh->decl = decl;
+		if (decl.faceMaterialData) {
+			mesh->faceMaterials.resize(indexCount / 3); // One entry per face; decl.indexCount is 0 for non-indexed meshes.
+			memcpy(mesh->faceMaterials.data(), decl.faceMaterialData, mesh->faceMaterials.size() * sizeof(uint32_t));
+		}
-		mesh->indices.resize(decl.indexCount);
+		mesh->indices.resize(indexCount); // Effective index count: equals decl.vertexCount when no index buffer is supplied.
 		for (uint32_t i = 0; i < indexCount; i++)
-			mesh->indices[i] = decoded ? i : DecodeIndex(decl.indexFormat, decl.indexData, decl.indexOffset, i);
-		mesh->vertexToChartMap.resize(decl.vertexCount);
-		for (uint32_t i = 0; i < mesh->vertexToChartMap.size(); i++)
-			mesh->vertexToChartMap[i] = UINT32_MAX;
-		// Calculate charts (incident faces).
-		internal::HashMap<internal::Vector2> vertexToFaceMap(internal::MemTag::Default, indexCount); // Face is index / 3
-		const uint32_t faceCount = indexCount / 3;
-		for (uint32_t i = 0; i < indexCount; i++)
-			vertexToFaceMap.add(meshInstance->texcoords[mesh->indices[i]]);
-		internal::BitArray faceAssigned(faceCount);
-		faceAssigned.zeroOutMemory();
-		for (uint32_t f = 0; f < faceCount; f++) {
-			if (faceAssigned.get(f))
-				continue;
-			// Found an unassigned face, create a new chart.
-			internal::UvMeshChart *chart = XA_NEW(internal::MemTag::Default, internal::UvMeshChart);
-			chart->material = decl.faceMaterialData ? decl.faceMaterialData[f] : 0;
-			// Walk incident faces and assign them to the chart.
-			faceAssigned.set(f);
-			chart->faces.push_back(f);
-			for (;;) {
-				bool newFaceAssigned = false;
-				const uint32_t faceCount2 = chart->faces.size();
-				for (uint32_t f2 = 0; f2 < faceCount2; f2++) {
-					const uint32_t face = chart->faces[f2];
-					for (uint32_t i = 0; i < 3; i++) {
-						const internal::Vector2 &texcoord = meshInstance->texcoords[meshInstance->mesh->indices[face * 3 + i]];
-						uint32_t mapIndex = vertexToFaceMap.get(texcoord);
-						while (mapIndex != UINT32_MAX) {
-							const uint32_t face2 = mapIndex / 3; // 3 vertices added per face.
-							// Materials must match.
-							if (!faceAssigned.get(face2) && (!decl.faceMaterialData || decl.faceMaterialData[face] == decl.faceMaterialData[face2])) {
-								faceAssigned.set(face2);
-								chart->faces.push_back(face2);
-								newFaceAssigned = true;
-							}
-							mapIndex = vertexToFaceMap.getNext(mapIndex);
-						}
-					}
-				}
-				if (!newFaceAssigned)
+			mesh->indices[i] = hasIndices ? DecodeIndex(decl.indexFormat, decl.indexData, decl.indexOffset, i) : i;
+		mesh->texcoords.resize(decl.vertexCount);
+		for (uint32_t i = 0; i < decl.vertexCount; i++)
+			mesh->texcoords[i] = *((const internal::Vector2 *)&((const uint8_t *)decl.vertexUvData)[decl.vertexStride * i]);
+		// Validate. One ignore bit per face, sized from the effective index count (decl.indexCount is 0 for non-indexed meshes).
+		mesh->faceIgnore.resize(indexCount / 3);
+		mesh->faceIgnore.zeroOutMemory();
+		const uint32_t kMaxWarnings = 50;
+		uint32_t warningCount = 0;
+		for (uint32_t f = 0; f < indexCount / 3; f++) {
+			bool ignore = false;
+			uint32_t tri[3];
+			for (uint32_t i = 0; i < 3; i++)
+				tri[i] = mesh->indices[f * 3 + i];
+			// Check for nan UVs.
+			for (uint32_t i = 0; i < 3; i++) {
+				const uint32_t vertex = tri[i];
+				if (internal::isNan(mesh->texcoords[vertex].x) || internal::isNan(mesh->texcoords[vertex].y)) {
+					ignore = true;
+					if (++warningCount <= kMaxWarnings)
+						XA_PRINT("   NAN texture coordinate in vertex %u\n", vertex);
 					break;
+				}
 			}
-			for (uint32_t i = 0; i < chart->faces.size(); i++) {
-				for (uint32_t j = 0; j < 3; j++) {
-					const uint32_t vertex = meshInstance->mesh->indices[chart->faces[i] * 3 + j];
-					chart->indices.push_back(vertex);
-					mesh->vertexToChartMap[vertex] = mesh->charts.size();
+			// Check for zero area faces.
+			if (!ignore) {
+				const internal::Vector2 &v1 = mesh->texcoords[tri[0]];
+				const internal::Vector2 &v2 = mesh->texcoords[tri[1]];
+				const internal::Vector2 &v3 = mesh->texcoords[tri[2]];
+				const float area = fabsf(((v2.x - v1.x) * (v3.y - v1.y) - (v3.x - v1.x) * (v2.y - v1.y)) * 0.5f);
+				if (area <= internal::kAreaEpsilon) {
+					ignore = true;
+					if (++warningCount <= kMaxWarnings)
+						XA_PRINT("   Zero area face: %d, indices (%d %d %d), area is %f\n", f, tri[0], tri[1], tri[2], area);
 				}
 			}
-			mesh->charts.push_back(chart);
+			if (ignore)
+				mesh->faceIgnore.set(f);
 		}
-		ctx->uvMeshes.push_back(mesh);
-	} else {
-		XA_PRINT("   instance of a previous UV mesh\n");
+		if (warningCount > kMaxWarnings)
+			XA_PRINT("   %u additional warnings truncated\n", warningCount - kMaxWarnings);
 	}
-	XA_PRINT("   %u charts\n", meshInstance->mesh->charts.size());
-	ctx->uvMeshInstances.push_back(meshInstance);
+	meshInstance->mesh = mesh;
+	XA_PROFILE_END(addMeshCopyData)
 	return AddMeshError::Success;
 }
 
-void ComputeCharts(Atlas *atlas, ChartOptions options)
-{
+void ComputeCharts(Atlas *atlas, ChartOptions options) {
 	if (!atlas) {
 		XA_PRINT_WARNING("ComputeCharts: atlas is null.\n");
 		return;
 	}
 	Context *ctx = (Context *)atlas;
-	if (!ctx->uvMeshInstances.isEmpty()) {
-		XA_PRINT_WARNING("ComputeCharts: This function should not be called with UV meshes.\n");
-		return;
-	}
 	AddMeshJoin(atlas);
-	if (ctx->meshes.isEmpty()) {
-		XA_PRINT_WARNING("ComputeCharts: No meshes. Call AddMesh first.\n");
-		return;
-	}
-	XA_PRINT("Computing charts\n");
-	XA_PROFILE_START(computeChartsReal)
-	if (!ctx->paramAtlas.computeCharts(ctx->taskScheduler, options, ctx->progressFunc, ctx->progressUserData)) {
-		XA_PRINT("   Cancelled by user\n");
-		return;
-	}
-	XA_PROFILE_END(computeChartsReal)
-	// Count charts.
-	uint32_t chartCount = 0;
-	const uint32_t meshCount = ctx->meshes.size();
-	for (uint32_t i = 0; i < meshCount; i++) {
-		for (uint32_t j = 0; j < ctx->paramAtlas.chartGroupCount(i); j++) {
-			const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, j);
-			chartCount += chartGroup->segmentChartCount();
-		}
-	}
-	XA_PRINT("   %u charts\n", chartCount);
-#if XA_PROFILE
-	XA_PRINT("   Chart groups\n");
-	uint32_t chartGroupCount = 0;
-	for (uint32_t i = 0; i < meshCount; i++) {
-		XA_PRINT("      Mesh %u: %u chart groups\n", i, ctx->paramAtlas.chartGroupCount(i));
-		chartGroupCount += ctx->paramAtlas.chartGroupCount(i);
-	}
-	XA_PRINT("      %u total\n", chartGroupCount);
-#endif
-	XA_PROFILE_PRINT_AND_RESET("   Total (real): ", computeChartsReal)
-	XA_PROFILE_PRINT_AND_RESET("   Total (thread): ", computeChartsThread)
-	XA_PROFILE_PRINT_AND_RESET("      Create face groups: ", createFaceGroups)
-	XA_PROFILE_PRINT_AND_RESET("      Extract invalid mesh geometry: ", extractInvalidMeshGeometry)
-	XA_PROFILE_PRINT_AND_RESET("      Chart group compute charts (real): ", chartGroupComputeChartsReal)
-	XA_PROFILE_PRINT_AND_RESET("      Chart group compute charts (thread): ", chartGroupComputeChartsThread)
-	XA_PROFILE_PRINT_AND_RESET("         Create chart group mesh: ", createChartGroupMesh)
-	XA_PROFILE_PRINT_AND_RESET("            Create colocals: ", createChartGroupMeshColocals)
-	XA_PROFILE_PRINT_AND_RESET("            Create boundaries: ", createChartGroupMeshBoundaries)
-	XA_PROFILE_PRINT_AND_RESET("         Build atlas: ", buildAtlas)
-	XA_PROFILE_PRINT_AND_RESET("            Init: ", buildAtlasInit)
-	XA_PROFILE_PRINT_AND_RESET("            Planar charts: ", planarCharts)
-	XA_PROFILE_PRINT_AND_RESET("            Clustered charts: ", clusteredCharts)
-	XA_PROFILE_PRINT_AND_RESET("               Place seeds: ", clusteredChartsPlaceSeeds)
-	XA_PROFILE_PRINT_AND_RESET("                  Boundary intersection: ", clusteredChartsPlaceSeedsBoundaryIntersection)
-	XA_PROFILE_PRINT_AND_RESET("               Relocate seeds: ", clusteredChartsRelocateSeeds)
-	XA_PROFILE_PRINT_AND_RESET("               Reset: ", clusteredChartsReset)
-	XA_PROFILE_PRINT_AND_RESET("               Grow: ", clusteredChartsGrow)
-	XA_PROFILE_PRINT_AND_RESET("                  Boundary intersection: ", clusteredChartsGrowBoundaryIntersection)
-	XA_PROFILE_PRINT_AND_RESET("               Merge: ", clusteredChartsMerge)
-	XA_PROFILE_PRINT_AND_RESET("               Fill holes: ", clusteredChartsFillHoles)
-	XA_PROFILE_PRINT_AND_RESET("         Copy chart faces: ", copyChartFaces)
-#if XA_PROFILE_ALLOC
-	XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
-#endif
-	XA_PRINT_MEM_USAGE
-}
-
-void ParameterizeCharts(Atlas *atlas, ParameterizeOptions options)
-{
-	if (!atlas) {
-		XA_PRINT_WARNING("ParameterizeCharts: atlas is null.\n");
-		return;
-	}
-	Context *ctx = (Context *)atlas;
-	if (!ctx->uvMeshInstances.isEmpty()) {
-		XA_PRINT_WARNING("ParameterizeCharts: This function should not be called with UV meshes.\n");
-		return;
-	}
-	if (!ctx->paramAtlas.chartsComputed()) {
-		XA_PRINT_WARNING("ParameterizeCharts: ComputeCharts must be called first.\n");
+	if (ctx->meshes.isEmpty() && ctx->uvMeshInstances.isEmpty()) {
+		XA_PRINT_WARNING("ComputeCharts: No meshes. Call AddMesh or AddUvMesh first.\n");
 		return;
 	}
-	atlas->atlasCount = 0;
-	atlas->height = 0;
-	atlas->texelsPerUnit = 0;
-	atlas->width = 0;
-	if (atlas->utilization) {
+	// Reset atlas state. This function may be called multiple times, or again after PackCharts.
+	if (atlas->utilization)
 		XA_FREE(atlas->utilization);
-		atlas->utilization = nullptr;
-	}
-	if (atlas->image) {
+	if (atlas->image)
 		XA_FREE(atlas->image);
-		atlas->image = nullptr;
-	}
 	DestroyOutputMeshes(ctx);
-	XA_PRINT("Parameterizing charts\n");
-	XA_PROFILE_START(parameterizeChartsReal)
-	if (!ctx->paramAtlas.parameterizeCharts(ctx->taskScheduler, options, ctx->progressFunc, ctx->progressUserData)) {
-		XA_PRINT("   Cancelled by user\n");
+	memset(&ctx->atlas, 0, sizeof(Atlas));
+	XA_PRINT("Computing charts\n");
+	if (!ctx->meshes.isEmpty()) {
+		if (!ctx->paramAtlas.computeCharts(ctx->taskScheduler, options, ctx->progressFunc, ctx->progressUserData)) {
+			XA_PRINT("   Cancelled by user\n");
 			return;
-	}
-	XA_PROFILE_END(parameterizeChartsReal)
-	const uint32_t meshCount = ctx->meshes.size();
-	uint32_t chartCount = 0, chartsWithHolesCount = 0, holesCount = 0, chartsWithTJunctionsCount = 0, tJunctionsCount = 0, orthoChartsCount = 0, planarChartsCount = 0, lscmChartsCount = 0, piecewiseChartsCount = 0, chartsAddedCount = 0, chartsDeletedCount = 0;
-	for (uint32_t i = 0; i < meshCount; i++) {
-		for (uint32_t j = 0; j < ctx->paramAtlas.chartGroupCount(i); j++) {
-			const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, j);
-			for (uint32_t k = 0; k < chartGroup->chartCount(); k++) {
-				const internal::param::Chart *chart = chartGroup->chartAt(k);
-#if XA_PRINT_CHART_WARNINGS
-				if (chart->warningFlags() & internal::param::ChartWarningFlags::CloseHolesFailed)
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): failed to close holes\n", chartCount, i, j, k);
-				if (chart->warningFlags() & internal::param::ChartWarningFlags::FixTJunctionsDuplicatedEdge)
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): fixing t-junctions created non-manifold geometry\n", chartCount, i, j, k);
-				if (chart->warningFlags() & internal::param::ChartWarningFlags::FixTJunctionsFailed)
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): fixing t-junctions failed\n", chartCount, i, j, k);
-				if (chart->warningFlags() & internal::param::ChartWarningFlags::TriangulateDuplicatedEdge)
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): triangulation created non-manifold geometry\n", chartCount, i, j, k);
-#endif
-				holesCount += chart->closedHolesCount();
-				if (chart->closedHolesCount() > 0)
-					chartsWithHolesCount++;
-				tJunctionsCount += chart->fixedTJunctionsCount();
-				if (chart->fixedTJunctionsCount() > 0)
-					chartsWithTJunctionsCount++;
-				if (chart->type() == ChartType::Planar)
-					planarChartsCount++;
-				else if (chart->type() == ChartType::Ortho)
-					orthoChartsCount++;
-				else if (chart->type() == ChartType::LSCM)
-					lscmChartsCount++;
-				else if (chart->type() == ChartType::Piecewise)
-					piecewiseChartsCount++;
-			}
-			chartCount += chartGroup->chartCount();
-			chartsAddedCount += chartGroup->paramAddedChartsCount();
-			chartsDeletedCount += chartGroup->paramDeletedChartsCount();
-		}
-	}
-	if (holesCount > 0)
-		XA_PRINT("   %u holes closed in %u charts\n", holesCount, chartsWithHolesCount);
-	if (tJunctionsCount > 0)
-		XA_PRINT("   %u t-junctions fixed in %u charts\n", tJunctionsCount, chartsWithTJunctionsCount);
-	XA_PRINT("   %u planar charts, %u ortho charts, %u LSCM charts, %u piecewise charts\n", planarChartsCount, orthoChartsCount, lscmChartsCount, piecewiseChartsCount);
-	if (chartsDeletedCount > 0) {
-		XA_PRINT("   %u charts with invalid parameterizations replaced with %u new charts\n", chartsDeletedCount, chartsAddedCount);
+		}
+		uint32_t chartsWithTJunctionsCount = 0, tJunctionCount = 0, orthoChartsCount = 0, planarChartsCount = 0, lscmChartsCount = 0, piecewiseChartsCount = 0, originalUvChartsCount = 0;
+		uint32_t chartCount = 0;
+		const uint32_t meshCount = ctx->meshes.size();
+		for (uint32_t i = 0; i < meshCount; i++) {
+			for (uint32_t j = 0; j < ctx->paramAtlas.chartGroupCount(i); j++) {
+				const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, j);
+				for (uint32_t k = 0; k < chartGroup->chartCount(); k++) {
+					const internal::param::Chart *chart = chartGroup->chartAt(k);
+					tJunctionCount += chart->tjunctionCount();
+					if (chart->tjunctionCount() > 0)
+						chartsWithTJunctionsCount++;
+					if (chart->type() == ChartType::Planar)
+						planarChartsCount++;
+					else if (chart->type() == ChartType::Ortho)
+						orthoChartsCount++;
+					else if (chart->type() == ChartType::LSCM)
+						lscmChartsCount++;
+					else if (chart->type() == ChartType::Piecewise)
+						piecewiseChartsCount++;
+					if (chart->generatorType() == internal::segment::ChartGeneratorType::OriginalUv)
+						originalUvChartsCount++;
+				}
+				chartCount += chartGroup->chartCount();
+			}
+		}
+		if (tJunctionCount > 0)
+			XA_PRINT("   %u t-junctions found in %u charts\n", tJunctionCount, chartsWithTJunctionsCount);
 		XA_PRINT("   %u charts\n", chartCount);
-	}
-	uint32_t chartIndex = 0, invalidParamCount = 0;
-	for (uint32_t i = 0; i < meshCount; i++) {
-		for (uint32_t j = 0; j < ctx->paramAtlas.chartGroupCount(i); j++) {
-			const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, j);
-			for (uint32_t k = 0; k < chartGroup->chartCount(); k++) {
-				internal::param::Chart *chart = chartGroup->chartAt(k);
-				const internal::param::Quality &quality = chart->quality();
+		XA_PRINT("      %u planar, %u ortho, %u LSCM, %u piecewise\n", planarChartsCount, orthoChartsCount, lscmChartsCount, piecewiseChartsCount);
+		if (originalUvChartsCount > 0)
+			XA_PRINT("      %u with original UVs\n", originalUvChartsCount);
+		uint32_t chartIndex = 0, invalidParamCount = 0;
+		for (uint32_t i = 0; i < meshCount; i++) {
+			for (uint32_t j = 0; j < ctx->paramAtlas.chartGroupCount(i); j++) {
+				const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, j);
+				for (uint32_t k = 0; k < chartGroup->chartCount(); k++) {
+					internal::param::Chart *chart = chartGroup->chartAt(k);
+					const internal::param::Quality &quality = chart->quality();
 #if XA_DEBUG_EXPORT_OBJ_CHARTS_AFTER_PARAMETERIZATION
-				{
-					char filename[256];
-					XA_SPRINTF(filename, sizeof(filename), "debug_chart_%03u_after_parameterization.obj", chartIndex);
-					chart->unifiedMesh()->writeObjFile(filename);
-				}
-#endif
-				const char *type = "LSCM";
-				if (chart->type() == ChartType::Planar)
-					type = "planar";
-				else if (chart->type() == ChartType::Ortho)
-					type = "ortho";
-				else if (chart->type() == ChartType::Piecewise)
-					type = "piecewise";
-				if (chart->isInvalid()) {
-					if (quality.boundaryIntersection) {
-						XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u) (%s): invalid parameterization, self-intersecting boundary.\n", chartIndex, i, j, k, type);
-					}
-					if (quality.flippedTriangleCount > 0) {
-						XA_PRINT_WARNING("   Chart %u  (mesh %u, group %u, id %u) (%s): invalid parameterization, %u / %u flipped triangles.\n", chartIndex, i, j, k, type, quality.flippedTriangleCount, quality.totalTriangleCount);
+					{
+						char filename[256];
+						XA_SPRINTF(filename, sizeof(filename), "debug_chart_%03u_after_parameterization.obj", chartIndex);
+						chart->unifiedMesh()->writeObjFile(filename);
 					}
-					invalidParamCount++;
+#endif
+					const char *type = "LSCM";
+					if (chart->type() == ChartType::Planar)
+						type = "planar";
+					else if (chart->type() == ChartType::Ortho)
+						type = "ortho";
+					else if (chart->type() == ChartType::Piecewise)
+						type = "piecewise";
+					if (chart->isInvalid()) {
+						if (quality.boundaryIntersection) {
+							XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u) (%s): invalid parameterization, self-intersecting boundary.\n", chartIndex, i, j, k, type);
+						}
+						if (quality.flippedTriangleCount > 0) {
+							XA_PRINT_WARNING("   Chart %u  (mesh %u, group %u, id %u) (%s): invalid parameterization, %u / %u flipped triangles.\n", chartIndex, i, j, k, type, quality.flippedTriangleCount, quality.totalTriangleCount);
+						}
+						if (quality.zeroAreaTriangleCount > 0) {
+							XA_PRINT_WARNING("   Chart %u  (mesh %u, group %u, id %u) (%s): invalid parameterization, %u / %u zero area triangles.\n", chartIndex, i, j, k, type, quality.zeroAreaTriangleCount, quality.totalTriangleCount);
+						}
+						invalidParamCount++;
 #if XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION
-					char filename[256];
-					XA_SPRINTF(filename, sizeof(filename), "debug_chart_%03u_invalid_parameterization.obj", chartIndex);
-					const internal::Mesh *mesh = chart->unifiedMesh();
-					FILE *file;
-					XA_FOPEN(file, filename, "w");
-					if (file) {
-						mesh->writeObjVertices(file);
-						fprintf(file, "s off\n");
-						fprintf(file, "o object\n");
-						for (uint32_t f = 0; f < mesh->faceCount(); f++)
-							mesh->writeObjFace(file, f);
-						if (!chart->paramFlippedFaces().isEmpty()) {
-							fprintf(file, "o flipped_faces\n");
-							for (uint32_t f = 0; f < chart->paramFlippedFaces().size(); f++)
-								mesh->writeObjFace(file, chart->paramFlippedFaces()[f]);
+						char filename[256];
+						XA_SPRINTF(filename, sizeof(filename), "debug_chart_%03u_invalid_parameterization.obj", chartIndex);
+						const internal::Mesh *mesh = chart->unifiedMesh();
+						FILE *file;
+						XA_FOPEN(file, filename, "w");
+						if (file) {
+							mesh->writeObjVertices(file);
+							fprintf(file, "s off\n");
+							fprintf(file, "o object\n");
+							for (uint32_t f = 0; f < mesh->faceCount(); f++)
+								mesh->writeObjFace(file, f);
+							if (!chart->paramFlippedFaces().isEmpty()) {
+								fprintf(file, "o flipped_faces\n");
+								for (uint32_t f = 0; f < chart->paramFlippedFaces().size(); f++)
+									mesh->writeObjFace(file, chart->paramFlippedFaces()[f]);
+							}
+							mesh->writeObjBoundaryEges(file);
+							fclose(file);
 						}
-						mesh->writeObjBoundaryEges(file);
-						mesh->writeObjLinkedBoundaries(file);
-						fclose(file);
-					}
 #endif
+					}
+					chartIndex++;
 				}
-				chartIndex++;
 			}
 		}
+		if (invalidParamCount > 0)
+			XA_PRINT_WARNING("   %u charts with invalid parameterizations\n", invalidParamCount);
+#if XA_PROFILE
+		XA_PRINT("   Chart groups\n");
+		uint32_t chartGroupCount = 0;
+		for (uint32_t i = 0; i < meshCount; i++) {
+#if 0
+			XA_PRINT("      Mesh %u: %u chart groups\n", i, ctx->paramAtlas.chartGroupCount(i));
+#endif
+			chartGroupCount += ctx->paramAtlas.chartGroupCount(i);
+		}
+		XA_PRINT("      %u total\n", chartGroupCount);
+#endif
+		XA_PROFILE_PRINT_AND_RESET("   Compute charts total (real): ", computeChartsReal)
+		XA_PROFILE_PRINT_AND_RESET("   Compute charts total (thread): ", computeChartsThread)
+		XA_PROFILE_PRINT_AND_RESET("      Create face groups: ", createFaceGroups)
+		XA_PROFILE_PRINT_AND_RESET("      Extract invalid mesh geometry: ", extractInvalidMeshGeometry)
+		XA_PROFILE_PRINT_AND_RESET("      Chart group compute charts (real): ", chartGroupComputeChartsReal)
+		XA_PROFILE_PRINT_AND_RESET("      Chart group compute charts (thread): ", chartGroupComputeChartsThread)
+		XA_PROFILE_PRINT_AND_RESET("         Create chart group mesh: ", createChartGroupMesh)
+		XA_PROFILE_PRINT_AND_RESET("            Create colocals: ", createChartGroupMeshColocals)
+		XA_PROFILE_PRINT_AND_RESET("            Create boundaries: ", createChartGroupMeshBoundaries)
+		XA_PROFILE_PRINT_AND_RESET("         Build atlas: ", buildAtlas)
+		XA_PROFILE_PRINT_AND_RESET("            Init: ", buildAtlasInit)
+		XA_PROFILE_PRINT_AND_RESET("            Planar charts: ", planarCharts)
+		if (options.useInputMeshUvs) {
+			XA_PROFILE_PRINT_AND_RESET("            Original UV charts: ", originalUvCharts)
+		}
+		XA_PROFILE_PRINT_AND_RESET("            Clustered charts: ", clusteredCharts)
+		XA_PROFILE_PRINT_AND_RESET("               Place seeds: ", clusteredChartsPlaceSeeds)
+		XA_PROFILE_PRINT_AND_RESET("                  Boundary intersection: ", clusteredChartsPlaceSeedsBoundaryIntersection)
+		XA_PROFILE_PRINT_AND_RESET("               Relocate seeds: ", clusteredChartsRelocateSeeds)
+		XA_PROFILE_PRINT_AND_RESET("               Reset: ", clusteredChartsReset)
+		XA_PROFILE_PRINT_AND_RESET("               Grow: ", clusteredChartsGrow)
+		XA_PROFILE_PRINT_AND_RESET("                  Boundary intersection: ", clusteredChartsGrowBoundaryIntersection)
+		XA_PROFILE_PRINT_AND_RESET("               Merge: ", clusteredChartsMerge)
+		XA_PROFILE_PRINT_AND_RESET("               Fill holes: ", clusteredChartsFillHoles)
+		XA_PROFILE_PRINT_AND_RESET("         Copy chart faces: ", copyChartFaces)
+		XA_PROFILE_PRINT_AND_RESET("      Create chart mesh and parameterize (real): ", createChartMeshAndParameterizeReal)
+		XA_PROFILE_PRINT_AND_RESET("      Create chart mesh and parameterize (thread): ", createChartMeshAndParameterizeThread)
+		XA_PROFILE_PRINT_AND_RESET("         Create chart mesh: ", createChartMesh)
+		XA_PROFILE_PRINT_AND_RESET("         Parameterize charts: ", parameterizeCharts)
+		XA_PROFILE_PRINT_AND_RESET("            Orthogonal: ", parameterizeChartsOrthogonal)
+		XA_PROFILE_PRINT_AND_RESET("            LSCM: ", parameterizeChartsLSCM)
+		XA_PROFILE_PRINT_AND_RESET("            Recompute: ", parameterizeChartsRecompute)
+		XA_PROFILE_PRINT_AND_RESET("               Piecewise: ", parameterizeChartsPiecewise)
+		XA_PROFILE_PRINT_AND_RESET("                  Boundary intersection: ", parameterizeChartsPiecewiseBoundaryIntersection)
+		XA_PROFILE_PRINT_AND_RESET("            Evaluate quality: ", parameterizeChartsEvaluateQuality)
+#if XA_PROFILE_ALLOC
+		XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
+#endif
+		XA_PRINT_MEM_USAGE
+	} else {
+		XA_PROFILE_START(computeChartsReal)
+		if (!internal::segment::computeUvMeshCharts(ctx->taskScheduler, ctx->uvMeshes, ctx->progressFunc, ctx->progressUserData)) {
+			XA_PRINT("   Cancelled by user\n");
+			return;
+		}
+		XA_PROFILE_END(computeChartsReal)
+		ctx->uvMeshChartsComputed = true;
+		// Count charts.
+		uint32_t chartCount = 0;
+		const uint32_t meshCount = ctx->uvMeshes.size();
+		for (uint32_t i = 0; i < meshCount; i++)
+			chartCount += ctx->uvMeshes[i]->charts.size();
+		XA_PRINT("   %u charts\n", chartCount);
+		XA_PROFILE_PRINT_AND_RESET("   Total (real): ", computeChartsReal)
+		XA_PROFILE_PRINT_AND_RESET("   Total (thread): ", computeChartsThread)
 	}
-	if (invalidParamCount > 0)
-		XA_PRINT_WARNING("   %u charts with invalid parameterizations\n", invalidParamCount);
-	XA_PROFILE_PRINT_AND_RESET("   Total (real): ", parameterizeChartsReal)
-	XA_PROFILE_PRINT_AND_RESET("   Total (thread): ", parameterizeChartsThread)
-	XA_PROFILE_PRINT_AND_RESET("      Create chart mesh: ", createChartMesh)
-	XA_PROFILE_PRINT_AND_RESET("         Fix t-junctions: ", fixChartMeshTJunctions)
-	XA_PROFILE_PRINT_AND_RESET("         Close holes: ", closeChartMeshHoles)
-	XA_PROFILE_PRINT_AND_RESET("      Orthogonal: ", parameterizeChartsOrthogonal)
-	XA_PROFILE_PRINT_AND_RESET("      LSCM: ", parameterizeChartsLSCM)
-	XA_PROFILE_PRINT_AND_RESET("      Recompute: ", parameterizeChartsRecompute)
-	XA_PROFILE_PRINT_AND_RESET("         Piecewise: ", parameterizeChartsPiecewise)
-	XA_PROFILE_PRINT_AND_RESET("            Boundary intersection: ", parameterizeChartsPiecewiseBoundaryIntersection)
-	XA_PROFILE_PRINT_AND_RESET("      Evaluate quality: ", parameterizeChartsEvaluateQuality)
 #if XA_PROFILE_ALLOC
 	XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
 #endif
 	XA_PRINT_MEM_USAGE
 }
 
-void PackCharts(Atlas *atlas, PackOptions packOptions)
-{
+void PackCharts(Atlas *atlas, PackOptions packOptions) {
 	// Validate arguments and context state.
 	if (!atlas) {
 		XA_PRINT_WARNING("PackCharts: atlas is null.\n");
@@ -9867,10 +9152,9 @@ void PackCharts(Atlas *atlas, PackOptions packOptions)
 			XA_PRINT_WARNING("PackCharts: ComputeCharts must be called first.\n");
 			return;
 		}
-		if (!ctx->paramAtlas.chartsParameterized()) {
-			XA_PRINT_WARNING("PackCharts: ParameterizeCharts must be called first.\n");
-			return;
-		}
+	} else if (!ctx->uvMeshChartsComputed) {
+		XA_PRINT_WARNING("PackCharts: ComputeCharts must be called first.\n");
+		return;
 	}
 	if (packOptions.texelsPerUnit < 0.0f) {
 		XA_PRINT_WARNING("PackCharts: PackOptions::texelsPerUnit is negative.\n");
@@ -9893,8 +9177,7 @@ void PackCharts(Atlas *atlas, PackOptions packOptions)
 	if (!ctx->uvMeshInstances.isEmpty()) {
 		for (uint32_t i = 0; i < ctx->uvMeshInstances.size(); i++)
 			packAtlas.addUvMeshCharts(ctx->uvMeshInstances[i]);
-	}
-	else
+	} else
 		packAtlas.addCharts(ctx->taskScheduler, &ctx->paramAtlas);
 	XA_PROFILE_END(packChartsAddCharts)
 	XA_PROFILE_START(packCharts)
@@ -9946,16 +9229,35 @@ void PackCharts(Atlas *atlas, PackOptions packOptions)
 		uint32_t chartIndex = 0;
 		for (uint32_t i = 0; i < atlas->meshCount; i++) {
 			Mesh &outputMesh = atlas->meshes[i];
+			MeshPolygonMapping *meshPolygonMapping = ctx->meshPolygonMappings[i];
+			// One polygon can have many triangles. Don't want to process the same polygon more than once when counting indices, building chart faces etc.
+			internal::BitArray polygonTouched;
+			if (meshPolygonMapping) {
+				polygonTouched.resize(meshPolygonMapping->faceVertexCount.size());
+				polygonTouched.zeroOutMemory();
+			}
 			// Count and alloc arrays.
-			const internal::param::InvalidMeshGeometry &invalid = ctx->paramAtlas.invalidMeshGeometry(i);
+			const internal::InvalidMeshGeometry &invalid = ctx->paramAtlas.invalidMeshGeometry(i);
 			outputMesh.vertexCount += invalid.vertices().length;
 			outputMesh.indexCount += invalid.faces().length * 3;
 			for (uint32_t cg = 0; cg < ctx->paramAtlas.chartGroupCount(i); cg++) {
 				const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, cg);
 				for (uint32_t c = 0; c < chartGroup->chartCount(); c++) {
 					const internal::param::Chart *chart = chartGroup->chartAt(c);
-					outputMesh.vertexCount += chart->mesh()->vertexCount();
-					outputMesh.indexCount += chart->mesh()->faceCount() * 3;
+					outputMesh.vertexCount += chart->originalVertexCount();
+					const uint32_t faceCount = chart->unifiedMesh()->faceCount();
+					if (meshPolygonMapping) {
+						// Map triangles back to polygons and count the polygon vertices.
+						for (uint32_t f = 0; f < faceCount; f++) {
+							const uint32_t polygon = meshPolygonMapping->triangleToPolygonMap[chart->mapFaceToSourceFace(f)];
+							if (!polygonTouched.get(polygon)) {
+								polygonTouched.set(polygon);
+								outputMesh.indexCount += meshPolygonMapping->faceVertexCount[polygon];
+							}
+						}
+					} else {
+						outputMesh.indexCount += faceCount * 3;
+					}
 					outputMesh.chartCount++;
 				}
 			}
@@ -9966,7 +9268,7 @@ void PackCharts(Atlas *atlas, PackOptions packOptions)
 			// Copy mesh data.
 			uint32_t firstVertex = 0;
 			{
-				const internal::param::InvalidMeshGeometry &mesh = ctx->paramAtlas.invalidMeshGeometry(i);
+				const internal::InvalidMeshGeometry &mesh = ctx->paramAtlas.invalidMeshGeometry(i);
 				internal::ConstArrayView<uint32_t> faces = mesh.faces();
 				internal::ConstArrayView<uint32_t> indices = mesh.indices();
 				internal::ConstArrayView<uint32_t> vertices = mesh.vertices();
@@ -9991,23 +9293,50 @@ void PackCharts(Atlas *atlas, PackOptions packOptions)
 				const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, cg);
 				for (uint32_t c = 0; c < chartGroup->chartCount(); c++) {
 					const internal::param::Chart *chart = chartGroup->chartAt(c);
-					const internal::Mesh *mesh = chart->mesh();
+					const internal::Mesh *unifiedMesh = chart->unifiedMesh();
+					const uint32_t faceCount = unifiedMesh->faceCount();
+#if XA_CHECK_PARAM_WINDING // Debug check: warn about charts containing faces with negative parametric area.
+					uint32_t flippedCount = 0; // Number of faces in this chart whose parametric area is negative.
+					for (uint32_t f = 0; f < faceCount; f++) {
+						const float area = unifiedMesh->computeFaceParametricArea(f); // faceCount was taken from unifiedMesh, so query the same mesh (the old local `mesh` no longer exists).
+						if (area < 0.0f)
+							flippedCount++;
+					}
+					const char *type = "LSCM"; // Label for the warning; stays "LSCM" unless one of the chart types below matches.
+					if (chart->type() == ChartType::Planar)
+						type = "planar";
+					else if (chart->type() == ChartType::Ortho)
+						type = "ortho";
+					else if (chart->type() == ChartType::Piecewise)
+						type = "piecewise";
+					if (flippedCount > 0) {
+						if (flippedCount == faceCount) {
+							XA_PRINT_WARNING("chart %u (%s): all face flipped\n", chartIndex, type);
+						} else {
+							XA_PRINT_WARNING("chart %u (%s): %u / %u faces flipped\n", chartIndex, type, flippedCount, faceCount);
+						}
+					}
+#endif
 					// Vertices.
-					for (uint32_t v = 0; v < mesh->vertexCount(); v++) {
+					for (uint32_t v = 0; v < chart->originalVertexCount(); v++) {
 						Vertex &vertex = outputMesh.vertexArray[firstVertex + v];
 						vertex.atlasIndex = packAtlas.getChart(chartIndex)->atlasIndex;
 						XA_DEBUG_ASSERT(vertex.atlasIndex >= 0);
 						vertex.chartIndex = (int32_t)chartIndex;
-						const internal::Vector2 &uv = mesh->texcoord(v);
+						const internal::Vector2 &uv = unifiedMesh->texcoord(chart->originalVertexToUnifiedVertex(v));
 						vertex.uv[0] = internal::max(0.0f, uv.x);
 						vertex.uv[1] = internal::max(0.0f, uv.y);
 						vertex.xref = chart->mapChartVertexToSourceVertex(v);
 					}
 					// Indices.
-					for (uint32_t f = 0; f < mesh->faceCount(); f++) {
+					for (uint32_t f = 0; f < faceCount; f++) {
 						const uint32_t indexOffset = chart->mapFaceToSourceFace(f) * 3;
-						for (uint32_t j = 0; j < 3; j++)
-							outputMesh.indexArray[indexOffset + j] = firstVertex + mesh->vertexAt(f * 3 + j);
+						for (uint32_t j = 0; j < 3; j++) {
+							uint32_t outIndex = indexOffset + j;
+							if (meshPolygonMapping)
+								outIndex = meshPolygonMapping->triangleToPolygonIndicesMap[outIndex];
+							outputMesh.indexArray[outIndex] = firstVertex + chart->originalVertices()[f * 3 + j];
+						}
 					}
 					// Charts.
 					Chart *outputChart = &outputMesh.chartArray[meshChartIndex];
@@ -10015,14 +9344,38 @@ void PackCharts(Atlas *atlas, PackOptions packOptions)
 					XA_DEBUG_ASSERT(atlasIndex >= 0);
 					outputChart->atlasIndex = (uint32_t)atlasIndex;
 					outputChart->type = chart->isInvalid() ? ChartType::Invalid : chart->type();
-					outputChart->faceCount = mesh->faceCount();
-					outputChart->faceArray = XA_ALLOC_ARRAY(internal::MemTag::Default, uint32_t, outputChart->faceCount);
-					for (uint32_t f = 0; f < outputChart->faceCount; f++)
-						outputChart->faceArray[f] = chart->mapFaceToSourceFace(f);
+					if (meshPolygonMapping) {
+						// Count polygons.
+						polygonTouched.zeroOutMemory();
+						outputChart->faceCount = 0;
+						for (uint32_t f = 0; f < faceCount; f++) {
+							const uint32_t polygon = meshPolygonMapping->triangleToPolygonMap[chart->mapFaceToSourceFace(f)];
+							if (!polygonTouched.get(polygon)) {
+								polygonTouched.set(polygon);
+								outputChart->faceCount++;
+							}
+						}
+						// Write polygons.
+						outputChart->faceArray = XA_ALLOC_ARRAY(internal::MemTag::Default, uint32_t, outputChart->faceCount);
+						polygonTouched.zeroOutMemory();
+						uint32_t of = 0;
+						for (uint32_t f = 0; f < faceCount; f++) {
+							const uint32_t polygon = meshPolygonMapping->triangleToPolygonMap[chart->mapFaceToSourceFace(f)];
+							if (!polygonTouched.get(polygon)) {
+								polygonTouched.set(polygon);
+								outputChart->faceArray[of++] = polygon;
+							}
+						}
+					} else {
+						outputChart->faceCount = faceCount;
+						outputChart->faceArray = XA_ALLOC_ARRAY(internal::MemTag::Default, uint32_t, outputChart->faceCount);
+						for (uint32_t f = 0; f < outputChart->faceCount; f++)
+							outputChart->faceArray[f] = chart->mapFaceToSourceFace(f);
+					}
 					outputChart->material = 0;
 					meshChartIndex++;
 					chartIndex++;
-					firstVertex += mesh->vertexCount();
+					firstVertex += chart->originalVertexCount();
 				}
 			}
 			XA_DEBUG_ASSERT(outputMesh.vertexCount == firstVertex);
@@ -10102,28 +9455,21 @@ void PackCharts(Atlas *atlas, PackOptions packOptions)
 	XA_PRINT_MEM_USAGE
 }
 
-void Generate(Atlas *atlas, ChartOptions chartOptions, ParameterizeOptions parameterizeOptions, PackOptions packOptions)
-{
+void Generate(Atlas *atlas, ChartOptions chartOptions, PackOptions packOptions) { // Runs ComputeCharts and then PackCharts on the atlas; warns and returns early on a null atlas or when nothing has been added.
 	if (!atlas) {
 		XA_PRINT_WARNING("Generate: atlas is null.\n");
 		return;
 	}
 	Context *ctx = (Context *)atlas;
-	if (!ctx->uvMeshInstances.isEmpty()) {
-		XA_PRINT_WARNING("Generate: This function should not be called with UV meshes.\n");
-		return;
-	}
-	if (ctx->meshes.isEmpty()) {
-		XA_PRINT_WARNING("Generate: No meshes. Call AddMesh first.\n");
+	if (ctx->meshes.isEmpty() && ctx->uvMeshInstances.isEmpty()) { // Neither regular meshes nor UV mesh instances are present.
+		XA_PRINT_WARNING("Generate: No meshes. Call AddMesh or AddUvMesh first.\n");
 		return;
 	}
 	ComputeCharts(atlas, chartOptions);
-	ParameterizeCharts(atlas, parameterizeOptions);
 	PackCharts(atlas, packOptions);
 }
 
-void SetProgressCallback(Atlas *atlas, ProgressFunc progressFunc, void *progressUserData)
-{
+void SetProgressCallback(Atlas *atlas, ProgressFunc progressFunc, void *progressUserData) {
 	if (!atlas) {
 		XA_PRINT_WARNING("SetProgressCallback: atlas is null.\n");
 		return;
@@ -10133,37 +9479,33 @@ void SetProgressCallback(Atlas *atlas, ProgressFunc progressFunc, void *progress
 	ctx->progressUserData = progressUserData;
 }
 
-void SetAlloc(ReallocFunc reallocFunc, FreeFunc freeFunc)
-{
+void SetAlloc(ReallocFunc reallocFunc, FreeFunc freeFunc) { // Stores the given callbacks in internal::s_realloc and internal::s_free.
 	internal::s_realloc = reallocFunc;
 	internal::s_free = freeFunc;
 }
 
-void SetPrint(PrintFunc print, bool verbose)
-{
+void SetPrint(PrintFunc print, bool verbose) { // Stores the print callback and verbosity flag in internal::s_print and internal::s_printVerbose.
 	internal::s_print = print;
 	internal::s_printVerbose = verbose;
 }
 
-const char *StringForEnum(AddMeshError::Enum error)
-{
+const char *StringForEnum(AddMeshError error) { // Returns a string literal describing the error; values not matched below fall through to "Success".
 	if (error == AddMeshError::Error)
 		return "Unspecified error";
 	if (error == AddMeshError::IndexOutOfRange)
 		return "Index out of range";
+	if (error == AddMeshError::InvalidFaceVertexCount)
+		return "Invalid face vertex count";
 	if (error == AddMeshError::InvalidIndexCount)
 		return "Invalid index count";
 	return "Success";
 }
 
-const char *StringForEnum(ProgressCategory::Enum category)
-{
+const char *StringForEnum(ProgressCategory category) {
 	if (category == ProgressCategory::AddMesh)
 		return "Adding mesh(es)";
 	if (category == ProgressCategory::ComputeCharts)
 		return "Computing charts";
-	if (category == ProgressCategory::ParameterizeCharts)
-		return "Parameterizing charts";
 	if (category == ProgressCategory::PackCharts)
 		return "Packing charts";
 	if (category == ProgressCategory::BuildOutputMeshes)
@@ -10172,3 +9514,96 @@ const char *StringForEnum(ProgressCategory::Enum category)
 }
 
 } // namespace xatlas
+
+#if XATLAS_C_API
+static_assert(sizeof(xatlas::Chart) == sizeof(xatlasChart), "xatlasChart size mismatch"); // Size checks for the C mirror structs; the wrappers below reinterpret pointers between the C and C++ types.
+static_assert(sizeof(xatlas::Vertex) == sizeof(xatlasVertex), "xatlasVertex size mismatch");
+static_assert(sizeof(xatlas::Mesh) == sizeof(xatlasMesh), "xatlasMesh size mismatch");
+static_assert(sizeof(xatlas::Atlas) == sizeof(xatlasAtlas), "xatlasAtlas size mismatch");
+static_assert(sizeof(xatlas::MeshDecl) == sizeof(xatlasMeshDecl), "xatlasMeshDecl size mismatch");
+static_assert(sizeof(xatlas::UvMeshDecl) == sizeof(xatlasUvMeshDecl), "xatlasUvMeshDecl size mismatch");
+static_assert(sizeof(xatlas::ChartOptions) == sizeof(xatlasChartOptions), "xatlasChartOptions size mismatch");
+static_assert(sizeof(xatlas::PackOptions) == sizeof(xatlasPackOptions), "xatlasPackOptions size mismatch");
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+xatlasAtlas *xatlasCreate() { // C wrapper: forwards to xatlas::Create and returns the result as a xatlasAtlas pointer.
+	return (xatlasAtlas *)xatlas::Create();
+}
+
+void xatlasDestroy(xatlasAtlas *atlas) { // C wrapper: forwards to xatlas::Destroy.
+	xatlas::Destroy((xatlas::Atlas *)atlas);
+}
+
+xatlasAddMeshError xatlasAddMesh(xatlasAtlas *atlas, const xatlasMeshDecl *meshDecl, uint32_t meshCountHint) { // C wrapper: forwards to xatlas::AddMesh. meshDecl is dereferenced without a null check.
+	return (xatlasAddMeshError)xatlas::AddMesh((xatlas::Atlas *)atlas, *(const xatlas::MeshDecl *)meshDecl, meshCountHint);
+}
+
+void xatlasAddMeshJoin(xatlasAtlas *atlas) { // C wrapper: forwards to xatlas::AddMeshJoin.
+	xatlas::AddMeshJoin((xatlas::Atlas *)atlas);
+}
+
+xatlasAddMeshError xatlasAddUvMesh(xatlasAtlas *atlas, const xatlasUvMeshDecl *decl) { // C wrapper: forwards to xatlas::AddUvMesh. decl is dereferenced without a null check.
+	return (xatlasAddMeshError)xatlas::AddUvMesh((xatlas::Atlas *)atlas, *(const xatlas::UvMeshDecl *)decl);
+}
+
+void xatlasComputeCharts(xatlasAtlas *atlas, const xatlasChartOptions *chartOptions) { // C wrapper: forwards to xatlas::ComputeCharts. A null chartOptions selects a default-constructed xatlas::ChartOptions.
+	xatlas::ComputeCharts((xatlas::Atlas *)atlas, chartOptions ? *(xatlas::ChartOptions *)chartOptions : xatlas::ChartOptions());
+}
+
+void xatlasPackCharts(xatlasAtlas *atlas, const xatlasPackOptions *packOptions) { // C wrapper: forwards to xatlas::PackCharts. A null packOptions selects a default-constructed xatlas::PackOptions.
+	xatlas::PackCharts((xatlas::Atlas *)atlas, packOptions ? *(xatlas::PackOptions *)packOptions : xatlas::PackOptions());
+}
+
+void xatlasGenerate(xatlasAtlas *atlas, const xatlasChartOptions *chartOptions, const xatlasPackOptions *packOptions) { // C wrapper: forwards to xatlas::Generate. Either options pointer may be null to use default-constructed options.
+	xatlas::Generate((xatlas::Atlas *)atlas, chartOptions ? *(xatlas::ChartOptions *)chartOptions : xatlas::ChartOptions(), packOptions ? *(xatlas::PackOptions *)packOptions : xatlas::PackOptions());
+}
+
+void xatlasSetProgressCallback(xatlasAtlas *atlas, xatlasProgressFunc progressFunc, void *progressUserData) { // C wrapper: forwards to xatlas::SetProgressCallback.
+	xatlas::ProgressFunc pf;
+	*(void **)&pf = (void *)progressFunc; // Copies the callback's pointer value into pf through a void* instead of casting between the two function pointer types.
+	xatlas::SetProgressCallback((xatlas::Atlas *)atlas, pf, progressUserData);
+}
+
+void xatlasSetAlloc(xatlasReallocFunc reallocFunc, xatlasFreeFunc freeFunc) { // C wrapper: forwards to xatlas::SetAlloc.
+	xatlas::SetAlloc((xatlas::ReallocFunc)reallocFunc, (xatlas::FreeFunc)freeFunc);
+}
+
+void xatlasSetPrint(xatlasPrintFunc print, bool verbose) { // C wrapper: forwards to xatlas::SetPrint.
+	xatlas::SetPrint((xatlas::PrintFunc)print, verbose);
+}
+
+const char *xatlasAddMeshErrorString(xatlasAddMeshError error) { // C wrapper: forwards to xatlas::StringForEnum(AddMeshError).
+	return xatlas::StringForEnum((xatlas::AddMeshError)error);
+}
+
+const char *xatlasProgressCategoryString(xatlasProgressCategory category) { // C wrapper: forwards to xatlas::StringForEnum(ProgressCategory).
+	return xatlas::StringForEnum((xatlas::ProgressCategory)category);
+}
+
+void xatlasMeshDeclInit(xatlasMeshDecl *meshDecl) { // Overwrites *meshDecl with the bytes of a default-constructed xatlas::MeshDecl.
+	xatlas::MeshDecl init;
+	memcpy(meshDecl, &init, sizeof(init));
+}
+
+void xatlasUvMeshDeclInit(xatlasUvMeshDecl *uvMeshDecl) { // Overwrites *uvMeshDecl with the bytes of a default-constructed xatlas::UvMeshDecl.
+	xatlas::UvMeshDecl init;
+	memcpy(uvMeshDecl, &init, sizeof(init));
+}
+
+void xatlasChartOptionsInit(xatlasChartOptions *chartOptions) { // Overwrites *chartOptions with the bytes of a default-constructed xatlas::ChartOptions.
+	xatlas::ChartOptions init;
+	memcpy(chartOptions, &init, sizeof(init));
+}
+
+void xatlasPackOptionsInit(xatlasPackOptions *packOptions) { // Overwrites *packOptions with the bytes of a default-constructed xatlas::PackOptions.
+	xatlas::PackOptions init;
+	memcpy(packOptions, &init, sizeof(init));
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // XATLAS_C_API
diff --git a/thirdparty/xatlas/xatlas.h b/thirdparty/xatlas/xatlas.h
index cc47f4837e..fc40d9d49c 100644
--- a/thirdparty/xatlas/xatlas.h
+++ b/thirdparty/xatlas/xatlas.h
@@ -31,35 +31,30 @@ Copyright NVIDIA Corporation 2006 -- Ignacio Castano <icastano@nvidia.com>
 #pragma once
 #ifndef XATLAS_H
 #define XATLAS_H
+#include <stddef.h>
 #include <stdint.h>
 
 namespace xatlas {
 
-struct ChartType
-{
-	enum Enum
-	{
-		Planar,
-		Ortho,
-		LSCM,
-		Piecewise,
-		Invalid
-	};
+enum class ChartType { // Kind of chart, stored in Chart::type.
+	Planar,
+	Ortho,
+	LSCM,
+	Piecewise,
+	Invalid // Written to the output chart by PackCharts when the internal chart reports isInvalid().
 };
 
 // A group of connected faces, belonging to a single atlas.
-struct Chart
-{
+struct Chart {
 	uint32_t *faceArray;
 	uint32_t atlasIndex; // Sub-atlas index.
 	uint32_t faceCount;
-	ChartType::Enum type;
+	ChartType type; // ChartType::Invalid when the internal chart reports isInvalid(); otherwise the chart's own type.
 	uint32_t material;
 };
 
 // Output vertex.
-struct Vertex
-{
+struct Vertex {
 	int32_t atlasIndex; // Sub-atlas index. -1 if the vertex doesn't exist in any atlas.
 	int32_t chartIndex; // -1 if the vertex doesn't exist in any chart.
 	float uv[2]; // Not normalized - values are in Atlas width and height range.
@@ -67,8 +62,7 @@ struct Vertex
 };
 
 // Output mesh.
-struct Mesh
-{
+struct Mesh {
 	Chart *chartArray;
 	uint32_t *indexArray;
 	Vertex *vertexArray;
@@ -83,16 +77,15 @@ static const uint32_t kImageIsBilinearBit = 0x40000000;
 static const uint32_t kImageIsPaddingBit = 0x20000000;
 
 // Empty on creation. Populated after charts are packed.
-struct Atlas
-{
+struct Atlas {
 	uint32_t *image;
 	Mesh *meshes; // The output meshes, corresponding to each AddMesh call.
+	float *utilization; // Normalized atlas texel utilization array. E.g. a value of 0.8 means 20% empty space. atlasCount in length.
 	uint32_t width; // Atlas width in texels.
 	uint32_t height; // Atlas height in texels.
 	uint32_t atlasCount; // Number of sub-atlases. Equal to 0 unless PackOptions resolution is changed from default (0).
 	uint32_t chartCount; // Total number of charts in all meshes.
 	uint32_t meshCount; // Number of output meshes. Equal to the number of times AddMesh was called.
-	float *utilization; // Normalized atlas texel utilization array. E.g. a value of 0.8 means 20% empty space. atlasCount in length.
 	float texelsPerUnit; // Equal to PackOptions texelsPerUnit if texelsPerUnit > 0, otherwise an estimated value to match PackOptions resolution.
 };
 
@@ -101,73 +94,76 @@ Atlas *Create();
 
 void Destroy(Atlas *atlas);
 
-struct IndexFormat
-{
-	enum Enum
-	{
-		UInt16,
-		UInt32
-	};
+enum class IndexFormat {
+	UInt16,
+	UInt32
 };
 
 // Input mesh declaration.
-struct MeshDecl
-{
+struct MeshDecl {
 	const void *vertexPositionData = nullptr;
 	const void *vertexNormalData = nullptr; // optional
 	const void *vertexUvData = nullptr; // optional. The input UVs are provided as a hint to the chart generator.
 	const void *indexData = nullptr; // optional
-	
-	// Optional. indexCount / 3 (triangle count) in length.
+
+	// Optional. Must be faceCount in length.
 	// Don't atlas faces set to true. Ignored faces still exist in the output meshes, Vertex uv is set to (0, 0) and Vertex atlasIndex to -1.
 	const bool *faceIgnoreData = nullptr;
 
+	// Optional. Must be faceCount in length.
+	// Only faces with the same material will be assigned to the same chart.
+	const uint32_t *faceMaterialData = nullptr;
+
+	// Optional. Must be faceCount in length.
+	// Polygon / n-gon support. Faces are assumed to be triangles if this is null.
+	const uint8_t *faceVertexCount = nullptr;
+
 	uint32_t vertexCount = 0;
 	uint32_t vertexPositionStride = 0;
 	uint32_t vertexNormalStride = 0; // optional
 	uint32_t vertexUvStride = 0; // optional
 	uint32_t indexCount = 0;
 	int32_t indexOffset = 0; // optional. Add this offset to all indices.
-	IndexFormat::Enum indexFormat = IndexFormat::UInt16;
+	uint32_t faceCount = 0; // Optional if faceVertexCount is null, in which case it is assumed to be indexCount / 3.
+	IndexFormat indexFormat = IndexFormat::UInt16;
 
 	// Vertex positions within epsilon distance of each other are considered colocal.
 	float epsilon = 1.192092896e-07F;
 };
 
-struct AddMeshError
-{
-	enum Enum
-	{
-		Success, // No error.
-		Error, // Unspecified error.
-		IndexOutOfRange, // An index is >= MeshDecl vertexCount.
-		InvalidIndexCount // Not evenly divisible by 3 - expecting triangles.
-	};
+enum class AddMeshError {
+	Success, // No error.
+	Error, // Unspecified error.
+	IndexOutOfRange, // An index is >= MeshDecl vertexCount.
+	InvalidFaceVertexCount, // A MeshDecl faceVertexCount value is < 3 - must be >= 3.
+	InvalidIndexCount // Not evenly divisible by 3 - expecting triangles.
 };
 
 // Add a mesh to the atlas. MeshDecl data is copied, so it can be freed after AddMesh returns.
-AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t meshCountHint = 0);
+AddMeshError AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t meshCountHint = 0);
 
 // Wait for AddMesh async processing to finish. ComputeCharts / Generate call this internally.
 void AddMeshJoin(Atlas *atlas);
 
-struct UvMeshDecl
-{
+struct UvMeshDecl {
 	const void *vertexUvData = nullptr;
 	const void *indexData = nullptr; // optional
-	const uint32_t *faceMaterialData = nullptr; // Optional. Faces with different materials won't be assigned to the same chart. Must be indexCount / 3 in length.
+	const uint32_t *faceMaterialData = nullptr; // Optional. Overlapping UVs should be assigned a different material. Must be indexCount / 3 in length.
 	uint32_t vertexCount = 0;
 	uint32_t vertexStride = 0;
 	uint32_t indexCount = 0;
 	int32_t indexOffset = 0; // optional. Add this offset to all indices.
-	IndexFormat::Enum indexFormat = IndexFormat::UInt16;
-	bool rotateCharts = true;
+	IndexFormat indexFormat = IndexFormat::UInt16;
 };
 
-AddMeshError::Enum AddUvMesh(Atlas *atlas, const UvMeshDecl &decl);
+AddMeshError AddUvMesh(Atlas *atlas, const UvMeshDecl &decl);
+
+// Custom parameterization function. texcoords initial values are an orthogonal parameterization.
+typedef void (*ParameterizeFunc)(const float *positions, float *texcoords, uint32_t vertexCount, const uint32_t *indices, uint32_t indexCount);
+
+struct ChartOptions {
+	ParameterizeFunc paramFunc = nullptr;
 
-struct ChartOptions
-{
 	float maxChartArea = 0.0f; // Don't grow charts to be larger than this. 0 means no limit.
 	float maxBoundaryLength = 0.0f; // Don't grow charts to have a longer boundary than this. 0 means no limit.
 
@@ -180,26 +176,31 @@ struct ChartOptions
 
 	float maxCost = 2.0f; // If total of all metrics * weights > maxCost, don't grow chart. Lower values result in more charts.
 	uint32_t maxIterations = 1; // Number of iterations of the chart growing and seeding phases. Higher values result in better charts.
+
+	bool useInputMeshUvs = false; // Use MeshDecl::vertexUvData for charts.
+	bool fixWinding = false; // Enforce consistent texture coordinate winding.
 };
 
 // Call after all AddMesh calls. Can be called multiple times to recompute charts with different options.
 void ComputeCharts(Atlas *atlas, ChartOptions options = ChartOptions());
 
-// Custom parameterization function. texcoords initial values are an orthogonal parameterization.
-typedef void (*ParameterizeFunc)(const float *positions, float *texcoords, uint32_t vertexCount, const uint32_t *indices, uint32_t indexCount);
+struct PackOptions {
+	// Charts larger than this will be scaled down. 0 means no limit.
+	uint32_t maxChartSize = 0;
 
-struct ParameterizeOptions
-{
-	ParameterizeFunc func = nullptr;
-	bool closeHoles = true; // If the custom parameterization function works with multiple boundaries, this can be set to false to improve performance.
-	bool fixTJunctions = true; // If meshes don't have T-junctions, this can be set to false to improve performance.
-};
+	// Number of pixels to pad charts with.
+	uint32_t padding = 0;
+
+	// Unit to texel scale. e.g. a 1x1 quad with texelsPerUnit of 32 will take up approximately 32x32 texels in the atlas.
+	// If 0, an estimated value will be calculated to approximately match the given resolution.
+	// If resolution is also 0, the estimated value will approximately match a 1024x1024 atlas.
+	float texelsPerUnit = 0.0f;
 
-// Call after ComputeCharts. Can be called multiple times to re-parameterize charts with a different ParameterizeFunc.
-void ParameterizeCharts(Atlas *atlas, ParameterizeOptions options = ParameterizeOptions());
+	// If 0, generate a single atlas with texelsPerUnit determining the final resolution.
+	// If not 0, and texelsPerUnit is not 0, generate one or more atlases with that exact resolution.
+	// If not 0, and texelsPerUnit is 0, texelsPerUnit is estimated to approximately match the resolution.
+	uint32_t resolution = 0;
 
-struct PackOptions
-{
 	// Leave space around charts for texels that would be sampled by bilinear filtering.
 	bool bilinear = true;
 
@@ -212,44 +213,29 @@ struct PackOptions
 	// Create Atlas::image
 	bool createImage = false;
 
-	// Charts larger than this will be scaled down. 0 means no limit.
-	uint32_t maxChartSize = 0;
-
-	// Number of pixels to pad charts with.
-	uint32_t padding = 0;
+	// Rotate charts to the axis of their convex hull.
+	bool rotateChartsToAxis = true;
 
-	// Unit to texel scale. e.g. a 1x1 quad with texelsPerUnit of 32 will take up approximately 32x32 texels in the atlas.
-	// If 0, an estimated value will be calculated to approximately match the given resolution.
-	// If resolution is also 0, the estimated value will approximately match a 1024x1024 atlas.
-	float texelsPerUnit = 0.0f;
-
-	// If 0, generate a single atlas with texelsPerUnit determining the final resolution.
-	// If not 0, and texelsPerUnit is not 0, generate one or more atlases with that exact resolution.
-	// If not 0, and texelsPerUnit is 0, texelsPerUnit is estimated to approximately match the resolution.
-	uint32_t resolution = 0;
+	// Rotate charts to improve packing.
+	bool rotateCharts = true;
 };
 
-// Call after ParameterizeCharts. Can be called multiple times to re-pack charts with different options.
+// Call after ComputeCharts. Can be called multiple times to re-pack charts with different options.
 void PackCharts(Atlas *atlas, PackOptions packOptions = PackOptions());
 
-// Equivalent to calling ComputeCharts, ParameterizeCharts and PackCharts in sequence. Can be called multiple times to regenerate with different options.
-void Generate(Atlas *atlas, ChartOptions chartOptions = ChartOptions(), ParameterizeOptions parameterizeOptions = ParameterizeOptions(), PackOptions packOptions = PackOptions());
+// Equivalent to calling ComputeCharts and PackCharts in sequence. Can be called multiple times to regenerate with different options.
+void Generate(Atlas *atlas, ChartOptions chartOptions = ChartOptions(), PackOptions packOptions = PackOptions());
 
 // Progress tracking.
-struct ProgressCategory
-{
-	enum Enum
-	{
-		AddMesh,
-		ComputeCharts,
-		ParameterizeCharts,
-		PackCharts,
-		BuildOutputMeshes
-	};
+enum class ProgressCategory {
+	AddMesh,
+	ComputeCharts,
+	PackCharts,
+	BuildOutputMeshes
 };
 
 // May be called from any thread. Return false to cancel.
-typedef bool (*ProgressFunc)(ProgressCategory::Enum category, int progress, void *userData);
+typedef bool (*ProgressFunc)(ProgressCategory category, int progress, void *userData);
 
 void SetProgressCallback(Atlas *atlas, ProgressFunc progressFunc = nullptr, void *progressUserData = nullptr);
 
@@ -263,8 +249,8 @@ typedef int (*PrintFunc)(const char *, ...);
 void SetPrint(PrintFunc print, bool verbose);
 
 // Helper functions for error messages.
-const char *StringForEnum(AddMeshError::Enum error);
-const char *StringForEnum(ProgressCategory::Enum category);
+const char *StringForEnum(AddMeshError error);
+const char *StringForEnum(ProgressCategory category);
 
 } // namespace xatlas