1 files changed, 3753 insertions, 2726 deletions
diff --git a/thirdparty/xatlas/xatlas.cpp b/thirdparty/xatlas/xatlas.cpp
index b1cbeb980f..43aec33a9f 100644
--- a/thirdparty/xatlas/xatlas.cpp
+++ b/thirdparty/xatlas/xatlas.cpp
@@ -1,7 +1,7 @@
 /*
 MIT License
 
-Copyright (c) 2018-2019 Jonathan Young
+Copyright (c) 2018-2020 Jonathan Young
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -33,19 +33,19 @@ https://github.com/brandonpelfrey/Fast-BVH
 MIT License
 Copyright (c) 2012 Brandon Pelfrey
 */
-#include <assert.h>
-#include <float.h> // FLT_MAX
-#include <limits.h>
-#include <math.h>
 #include <atomic>
 #include <condition_variable>
 #include <mutex>
 #include <thread>
+#include <assert.h>
+#include <float.h> // FLT_MAX
+#include <limits.h>
+#include <math.h>
 #define __STDC_LIMIT_MACROS
-#include "xatlas.h"
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
+#include "xatlas.h"
 
 #ifndef XA_DEBUG
 #ifdef NDEBUG
@@ -70,10 +70,7 @@ Copyright (c) 2012 Brandon Pelfrey
 #define XA_XSTR(x) XA_STR(x)
 
 #ifndef XA_ASSERT
-#define XA_ASSERT(exp)                                                              \
-	if (!(exp)) {                                                                   \
-		XA_PRINT_WARNING("\rASSERT: %s %s %d\n", XA_XSTR(exp), __FILE__, __LINE__); \
-	}
+#define XA_ASSERT(exp) if (!(exp)) { XA_PRINT_WARNING("\rASSERT: %s %s %d\n", XA_XSTR(exp), __FILE__, __LINE__); }
 #endif
 
 #ifndef XA_DEBUG_ASSERT
@@ -81,20 +78,20 @@ Copyright (c) 2012 Brandon Pelfrey
 #endif
 
 #ifndef XA_PRINT
-#define XA_PRINT(...)                                                  \
+#define XA_PRINT(...) \
 	if (xatlas::internal::s_print && xatlas::internal::s_printVerbose) \
 		xatlas::internal::s_print(__VA_ARGS__);
 #endif
 
 #ifndef XA_PRINT_WARNING
-#define XA_PRINT_WARNING(...)      \
+#define XA_PRINT_WARNING(...) \
 	if (xatlas::internal::s_print) \
 		xatlas::internal::s_print(__VA_ARGS__);
 #endif
 
 #define XA_ALLOC(tag, type) (type *)internal::Realloc(nullptr, sizeof(type), tag, __FILE__, __LINE__)
-#define XA_ALLOC_ARRAY(tag, type, num) (type *)internal::Realloc(nullptr, sizeof(type) * num, tag, __FILE__, __LINE__)
-#define XA_REALLOC(tag, ptr, type, num) (type *)internal::Realloc(ptr, sizeof(type) * num, tag, __FILE__, __LINE__)
+#define XA_ALLOC_ARRAY(tag, type, num) (type *)internal::Realloc(nullptr, sizeof(type) * (num), tag, __FILE__, __LINE__)
+#define XA_REALLOC(tag, ptr, type, num) (type *)internal::Realloc(ptr, sizeof(type) * (num), tag, __FILE__, __LINE__)
 #define XA_REALLOC_SIZE(tag, ptr, size) (uint8_t *)internal::Realloc(ptr, size, tag, __FILE__, __LINE__)
 #define XA_FREE(ptr) internal::Realloc(ptr, 0, internal::MemTag::Default, __FILE__, __LINE__)
 #define XA_NEW(tag, type) new (XA_ALLOC(tag, type)) type()
@@ -125,11 +122,12 @@ Copyright (c) 2012 Brandon Pelfrey
 
 #define XA_DEBUG_HEAP 0
 #define XA_DEBUG_SINGLE_CHART 0
+#define XA_DEBUG_ALL_CHARTS_INVALID 0
 #define XA_DEBUG_EXPORT_ATLAS_IMAGES 0
 #define XA_DEBUG_EXPORT_ATLAS_IMAGES_PER_CHART 0 // Export an atlas image after each chart is added.
 #define XA_DEBUG_EXPORT_BOUNDARY_GRID 0
 #define XA_DEBUG_EXPORT_TGA (XA_DEBUG_EXPORT_ATLAS_IMAGES || XA_DEBUG_EXPORT_BOUNDARY_GRID)
-#define XA_DEBUG_EXPORT_OBJ_SOURCE_MESHES 0
+#define XA_DEBUG_EXPORT_OBJ_FACE_GROUPS 0
 #define XA_DEBUG_EXPORT_OBJ_CHART_GROUPS 0
 #define XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS 0
 #define XA_DEBUG_EXPORT_OBJ_CHARTS 0
@@ -139,13 +137,19 @@ Copyright (c) 2012 Brandon Pelfrey
 #define XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION 0
 #define XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS 0
 
-#define XA_DEBUG_EXPORT_OBJ (0 || XA_DEBUG_EXPORT_OBJ_SOURCE_MESHES || XA_DEBUG_EXPORT_OBJ_CHART_GROUPS || XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS || XA_DEBUG_EXPORT_OBJ_CHARTS || XA_DEBUG_EXPORT_OBJ_BEFORE_FIX_TJUNCTION || XA_DEBUG_EXPORT_OBJ_CLOSE_HOLES_ERROR || XA_DEBUG_EXPORT_OBJ_CHARTS_AFTER_PARAMETERIZATION || XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION || XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS)
+#define XA_DEBUG_EXPORT_OBJ (0 \
+	|| XA_DEBUG_EXPORT_OBJ_FACE_GROUPS \
+	|| XA_DEBUG_EXPORT_OBJ_CHART_GROUPS \
+	|| XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS \
+	|| XA_DEBUG_EXPORT_OBJ_CHARTS \
+	|| XA_DEBUG_EXPORT_OBJ_BEFORE_FIX_TJUNCTION \
+	|| XA_DEBUG_EXPORT_OBJ_CLOSE_HOLES_ERROR \
+	|| XA_DEBUG_EXPORT_OBJ_CHARTS_AFTER_PARAMETERIZATION \
+	|| XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION \
+	|| XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS)
 
 #ifdef _MSC_VER
-#define XA_FOPEN(_file, _filename, _mode)                         \
-	{                                                             \
-		if (fopen_s(&_file, _filename, _mode) != 0) _file = NULL; \
-	}
+#define XA_FOPEN(_file, _filename, _mode) { if (fopen_s(&_file, _filename, _mode) != 0) _file = NULL; }
 #define XA_SPRINTF(_buffer, _size, _format, ...) sprintf_s(_buffer, _size, _format, __VA_ARGS__)
 #else
 #define XA_FOPEN(_file, _filename, _mode) _file = fopen(_filename, _mode)
@@ -160,12 +164,90 @@ static FreeFunc s_free = free;
 static PrintFunc s_print = printf;
 static bool s_printVerbose = false;
 
-struct MemTag {
-	enum {
+#if XA_PROFILE
+#define XA_PROFILE_START(var) const clock_t var##Start = clock();
+#define XA_PROFILE_END(var) internal::s_profile.var += clock() - var##Start;
+#define XA_PROFILE_PRINT_AND_RESET(label, var) XA_PRINT("%s%.2f seconds (%g ms)\n", label, internal::clockToSeconds(internal::s_profile.var), internal::clockToMs(internal::s_profile.var)); internal::s_profile.var = 0;
+#define XA_PROFILE_ALLOC 0
+
+struct ProfileData // Per-section timing totals in clock() ticks; XA_PROFILE_END adds (clock() - start) to the field named by its argument.
+{
+#if XA_PROFILE_ALLOC
+	std::atomic<clock_t> alloc; // Only compiled in when XA_PROFILE_ALLOC is non-zero.
+#endif
+	clock_t addMeshReal; // NOTE(review): plain clock_t fields are presumably only written from one thread, std::atomic ones from worker threads - confirm against call sites.
+	clock_t addMeshCopyData;
+	std::atomic<clock_t> addMeshThread;
+	std::atomic<clock_t> addMeshCreateColocals;
+	clock_t computeChartsReal;
+	std::atomic<clock_t> computeChartsThread;
+	std::atomic<clock_t> createFaceGroups;
+	std::atomic<clock_t> extractInvalidMeshGeometry;
+	std::atomic<clock_t> chartGroupComputeChartsReal;
+	std::atomic<clock_t> chartGroupComputeChartsThread;
+	std::atomic<clock_t> createChartGroupMesh;
+	std::atomic<clock_t> createChartGroupMeshColocals;
+	std::atomic<clock_t> createChartGroupMeshBoundaries;
+	std::atomic<clock_t> buildAtlas;
+	std::atomic<clock_t> buildAtlasInit;
+	std::atomic<clock_t> planarCharts;
+	std::atomic<clock_t> clusteredCharts;
+	std::atomic<clock_t> clusteredChartsPlaceSeeds;
+	std::atomic<clock_t> clusteredChartsPlaceSeedsBoundaryIntersection;
+	std::atomic<clock_t> clusteredChartsRelocateSeeds;
+	std::atomic<clock_t> clusteredChartsReset;
+	std::atomic<clock_t> clusteredChartsGrow;
+	std::atomic<clock_t> clusteredChartsGrowBoundaryIntersection;
+	std::atomic<clock_t> clusteredChartsMerge;
+	std::atomic<clock_t> clusteredChartsFillHoles;
+	std::atomic<clock_t> copyChartFaces;
+	clock_t parameterizeChartsReal;
+	std::atomic<clock_t> parameterizeChartsThread;
+	std::atomic<clock_t> createChartMesh;
+	std::atomic<clock_t> fixChartMeshTJunctions;
+	std::atomic<clock_t> closeChartMeshHoles;
+	std::atomic<clock_t> parameterizeChartsOrthogonal;
+	std::atomic<clock_t> parameterizeChartsLSCM;
+	std::atomic<clock_t> parameterizeChartsRecompute;
+	std::atomic<clock_t> parameterizeChartsPiecewise;
+	std::atomic<clock_t> parameterizeChartsPiecewiseBoundaryIntersection;
+	std::atomic<clock_t> parameterizeChartsEvaluateQuality;
+	clock_t packCharts;
+	clock_t packChartsAddCharts;
+	std::atomic<clock_t> packChartsAddChartsThread;
+	std::atomic<clock_t> packChartsAddChartsRestoreTexcoords;
+	clock_t packChartsRasterize;
+	clock_t packChartsDilate;
+	clock_t packChartsFindLocation;
+	clock_t packChartsBlit;
+	clock_t buildOutputMeshes;
+};
+
+static ProfileData s_profile;
+
+static double clockToMs(clock_t c) // Converts a clock() tick count to milliseconds.
+{
+	return c * 1000.0 / CLOCKS_PER_SEC;
+}
+
+static double clockToSeconds(clock_t c) // Converts a clock() tick count to seconds.
+{
+	return c / (double)CLOCKS_PER_SEC;
+}
+#else
+#define XA_PROFILE_START(var)
+#define XA_PROFILE_END(var)
+#define XA_PROFILE_PRINT_AND_RESET(label, var)
+#define XA_PROFILE_ALLOC 0
+#endif
+
+struct MemTag
+{
+	enum
+	{
 		Default,
 		BitImage,
 		BVH,
-		FullVector,
 		Matrix,
 		Mesh,
 		MeshBoundaries,
@@ -175,6 +257,7 @@ struct MemTag {
 		MeshNormals,
 		MeshPositions,
 		MeshTexcoords,
+		OpenNL,
 		SegmentAtlasChartCandidates,
 		SegmentAtlasChartFaces,
 		SegmentAtlasMeshData,
@@ -184,7 +267,8 @@ struct MemTag {
 };
 
 #if XA_DEBUG_HEAP
-struct AllocHeader {
+struct AllocHeader
+{
 	size_t size;
 	const char *file;
 	int line;
@@ -197,10 +281,11 @@ struct AllocHeader {
 static std::mutex s_allocMutex;
 static AllocHeader *s_allocRoot = nullptr;
 static size_t s_allocTotalCount = 0, s_allocTotalSize = 0, s_allocPeakSize = 0, s_allocCount[MemTag::Count] = { 0 }, s_allocTotalTagSize[MemTag::Count] = { 0 }, s_allocPeakTagSize[MemTag::Count] = { 0 };
-static uint32_t s_allocId = 0;
+static uint32_t s_allocId =0 ;
 static constexpr uint32_t kAllocRedzone = 0x12345678;
 
-static void *Realloc(void *ptr, size_t size, int tag, const char *file, int line) {
+static void *Realloc(void *ptr, size_t size, int tag, const char *file, int line)
+{
 	std::unique_lock<std::mutex> lock(s_allocMutex);
 	if (!size && !ptr)
 		return nullptr;
@@ -261,7 +346,8 @@ static void *Realloc(void *ptr, size_t size, int tag, const char *file, int line
 	return newPtr + sizeof(AllocHeader);
 }
 
-static void ReportLeaks() {
+static void ReportLeaks()
+{
 	printf("Checking for memory leaks...\n");
 	bool anyLeaks = false;
 	AllocHeader *header = s_allocRoot;
@@ -289,14 +375,14 @@ static void ReportLeaks() {
 		s_allocTotalTagSize[i] = s_allocPeakTagSize[i] = 0;
 }
 
-static void PrintMemoryUsage() {
+static void PrintMemoryUsage()
+{
 	XA_PRINT("Total allocations: %zu\n", s_allocTotalCount);
 	XA_PRINT("Memory usage: %0.2fMB current, %0.2fMB peak\n", internal::s_allocTotalSize / 1024.0f / 1024.0f, internal::s_allocPeakSize / 1024.0f / 1024.0f);
 	static const char *labels[] = { // Sync with MemTag
 		"Default",
 		"BitImage",
 		"BVH",
-		"FullVector",
 		"Matrix",
 		"Mesh",
 		"MeshBoundaries",
@@ -306,6 +392,7 @@ static void PrintMemoryUsage() {
 		"MeshNormals",
 		"MeshPositions",
 		"MeshTexcoords",
+		"OpenNL",
 		"SegmentAtlasChartCandidates",
 		"SegmentAtlasChartFaces",
 		"SegmentAtlasMeshData",
@@ -318,158 +405,116 @@ static void PrintMemoryUsage() {
 
 #define XA_PRINT_MEM_USAGE internal::PrintMemoryUsage();
 #else
-static void *Realloc(void *ptr, size_t size, int /*tag*/, const char * /*file*/, int /*line*/) {
+static void *Realloc(void *ptr, size_t size, int /*tag*/, const char * /*file*/, int /*line*/)
+{
 	if (size == 0 && !ptr)
 		return nullptr;
 	if (size == 0 && s_free) {
 		s_free(ptr);
 		return nullptr;
 	}
+#if XA_PROFILE_ALLOC
+	XA_PROFILE_START(alloc)
+#endif
 	void *mem = s_realloc(ptr, size);
-	if (size > 0) {
-		XA_DEBUG_ASSERT(mem);
-	}
+#if XA_PROFILE_ALLOC
+	XA_PROFILE_END(alloc)
+#endif
+	XA_DEBUG_ASSERT(size <= 0 || (size > 0 && mem));
 	return mem;
 }
 #define XA_PRINT_MEM_USAGE
 #endif
 
-#if XA_PROFILE
-#define XA_PROFILE_START(var) const clock_t var##Start = clock();
-#define XA_PROFILE_END(var) internal::s_profile.var += clock() - var##Start;
-#define XA_PROFILE_PRINT_AND_RESET(label, var)                                                                                                    \
-	XA_PRINT("%s%.2f seconds (%g ms)\n", label, internal::clockToSeconds(internal::s_profile.var), internal::clockToMs(internal::s_profile.var)); \
-	internal::s_profile.var = 0;
-
-struct ProfileData {
-	clock_t addMeshReal;
-	clock_t addMeshCopyData;
-	std::atomic<clock_t> addMeshThread;
-	std::atomic<clock_t> addMeshCreateColocals;
-	std::atomic<clock_t> addMeshCreateFaceGroups;
-	std::atomic<clock_t> addMeshCreateChartGroupsReal;
-	std::atomic<clock_t> addMeshCreateChartGroupsThread;
-	clock_t computeChartsReal;
-	std::atomic<clock_t> computeChartsThread;
-	std::atomic<clock_t> buildAtlas;
-	std::atomic<clock_t> buildAtlasInit;
-	std::atomic<clock_t> buildAtlasPlaceSeeds;
-	std::atomic<clock_t> buildAtlasRelocateSeeds;
-	std::atomic<clock_t> buildAtlasResetCharts;
-	std::atomic<clock_t> buildAtlasGrowCharts;
-	std::atomic<clock_t> buildAtlasMergeCharts;
-	std::atomic<clock_t> buildAtlasFillHoles;
-	std::atomic<clock_t> createChartMeshesReal;
-	std::atomic<clock_t> createChartMeshesThread;
-	std::atomic<clock_t> fixChartMeshTJunctions;
-	std::atomic<clock_t> closeChartMeshHoles;
-	clock_t parameterizeChartsReal;
-	std::atomic<clock_t> parameterizeChartsThread;
-	std::atomic<clock_t> parameterizeChartsOrthogonal;
-	std::atomic<clock_t> parameterizeChartsLSCM;
-	std::atomic<clock_t> parameterizeChartsEvaluateQuality;
-	clock_t packCharts;
-	clock_t packChartsAddCharts;
-	std::atomic<clock_t> packChartsAddChartsThread;
-	std::atomic<clock_t> packChartsAddChartsRestoreTexcoords;
-	clock_t packChartsRasterize;
-	clock_t packChartsDilate;
-	clock_t packChartsFindLocation;
-	clock_t packChartsBlit;
-	clock_t buildOutputMeshes;
-};
-
-static ProfileData s_profile;
-
-static double clockToMs(clock_t c) {
-	return c * 1000.0 / CLOCKS_PER_SEC;
-}
-
-static double clockToSeconds(clock_t c) {
-	return c / (double)CLOCKS_PER_SEC;
-}
-#else
-#define XA_PROFILE_START(var)
-#define XA_PROFILE_END(var)
-#define XA_PROFILE_PRINT_AND_RESET(label, var)
-#endif
-
 static constexpr float kPi = 3.14159265358979323846f;
 static constexpr float kPi2 = 6.28318530717958647692f;
-static constexpr float kPi4 = 12.56637061435917295384f;
 static constexpr float kEpsilon = 0.0001f;
 static constexpr float kAreaEpsilon = FLT_EPSILON;
 static constexpr float kNormalEpsilon = 0.001f;
 
-static int align(int x, int a) {
+static int align(int x, int a)
+{
 	return (x + a - 1) & ~(a - 1);
 }
 
 template <typename T>
-static T max(const T &a, const T &b) {
+static T max(const T &a, const T &b)
+{
 	return a > b ? a : b;
 }
 
 template <typename T>
-static T min(const T &a, const T &b) {
+static T min(const T &a, const T &b)
+{
 	return a < b ? a : b;
 }
 
 template <typename T>
-static T max3(const T &a, const T &b, const T &c) {
+static T max3(const T &a, const T &b, const T &c)
+{
 	return max(a, max(b, c));
 }
 
 /// Return the maximum of the three arguments.
 template <typename T>
-static T min3(const T &a, const T &b, const T &c) {
+static T min3(const T &a, const T &b, const T &c)
+{
 	return min(a, min(b, c));
 }
 
 /// Clamp between two values.
 template <typename T>
-static T clamp(const T &x, const T &a, const T &b) {
+static T clamp(const T &x, const T &a, const T &b)
+{
 	return min(max(x, a), b);
 }
 
 template <typename T>
-static void swap(T &a, T &b) {
+static void swap(T &a, T &b)
+{
 	T temp = a;
 	a = b;
 	b = temp;
 }
 
-union FloatUint32 {
+union FloatUint32
+{
 	float f;
 	uint32_t u;
 };
 
-static bool isFinite(float f) {
+static bool isFinite(float f)
+{
 	FloatUint32 fu;
 	fu.f = f;
 	return fu.u != 0x7F800000u && fu.u != 0x7F800001u;
 }
 
-static bool isNan(float f) {
+static bool isNan(float f)
+{
 	return f != f;
 }
 
 // Robust floating point comparisons:
 // http://realtimecollisiondetection.net/blog/?p=89
-static bool equal(const float f0, const float f1, const float epsilon) {
+static bool equal(const float f0, const float f1, const float epsilon)
+{
 	//return fabs(f0-f1) <= epsilon;
 	return fabs(f0 - f1) <= epsilon * max3(1.0f, fabsf(f0), fabsf(f1));
 }
 
-static int ftoi_ceil(float val) {
+static int ftoi_ceil(float val)
+{
 	return (int)ceilf(val);
 }
 
-static bool isZero(const float f, const float epsilon) {
+static bool isZero(const float f, const float epsilon)
+{
 	return fabs(f) <= epsilon;
 }
 
-static float square(float f) {
+static float square(float f)
+{
 	return f * f;
 }
 
@@ -479,8 +524,9 @@ static float square(float f) {
 * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
 * @note nextPowerOfTwo(x) = 2 << log2(x-1)
 */
-static uint32_t nextPowerOfTwo(uint32_t x) {
-	XA_DEBUG_ASSERT(x != 0);
+static uint32_t nextPowerOfTwo(uint32_t x)
+{
+	XA_DEBUG_ASSERT( x != 0 );
 	// On modern CPUs this is supposed to be as fast as using the bsr instruction.
 	x--;
 	x |= x >> 1;
@@ -491,59 +537,38 @@ static uint32_t nextPowerOfTwo(uint32_t x) {
 	return x + 1;
 }
 
-static uint32_t sdbmHash(const void *data_in, uint32_t size, uint32_t h = 5381) {
-	const uint8_t *data = (const uint8_t *)data_in;
-	uint32_t i = 0;
-	while (i < size) {
-		h = (h << 16) + (h << 6) - h + (uint32_t)data[i++];
-	}
-	return h;
-}
-
-template <typename T>
-static uint32_t hash(const T &t, uint32_t h = 5381) {
-	return sdbmHash(&t, sizeof(T), h);
-}
-
-// Functors for hash table:
-template <typename Key>
-struct Hash {
-	uint32_t operator()(const Key &k) const { return hash(k); }
-};
-
-template <typename Key>
-struct Equal {
-	bool operator()(const Key &k0, const Key &k1) const { return k0 == k1; }
-};
-
-class Vector2 {
+class Vector2
+{
 public:
 	Vector2() {}
-	explicit Vector2(float f) :
-			x(f), y(f) {}
-	Vector2(float x, float y) :
-			x(x), y(y) {}
+	explicit Vector2(float f) : x(f), y(f) {}
+	Vector2(float x, float y): x(x), y(y) {}
 
-	Vector2 operator-() const {
+	Vector2 operator-() const
+	{
 		return Vector2(-x, -y);
 	}
 
-	void operator+=(const Vector2 &v) {
+	void operator+=(const Vector2 &v)
+	{
 		x += v.x;
 		y += v.y;
 	}
 
-	void operator-=(const Vector2 &v) {
+	void operator-=(const Vector2 &v)
+	{
 		x -= v.x;
 		y -= v.y;
 	}
 
-	void operator*=(float s) {
+	void operator*=(float s)
+	{
 		x *= s;
 		y *= s;
 	}
 
-	void operator*=(const Vector2 &v) {
+	void operator*=(const Vector2 &v)
+	{
 		x *= v.x;
 		y *= v.y;
 	}
@@ -551,11 +576,13 @@ public:
 	float x, y;
 };
 
-static bool operator==(const Vector2 &a, const Vector2 &b) {
+static bool operator==(const Vector2 &a, const Vector2 &b)
+{
 	return a.x == b.x && a.y == b.y;
 }
 
-static bool operator!=(const Vector2 &a, const Vector2 &b) {
+static bool operator!=(const Vector2 &a, const Vector2 &b)
+{
 	return a.x != b.x || a.y != b.y;
 }
 
@@ -564,33 +591,40 @@ static bool operator!=(const Vector2 &a, const Vector2 &b) {
 	return Vector2(a.x + b.x, a.y + b.y);
 }*/
 
-static Vector2 operator-(const Vector2 &a, const Vector2 &b) {
+static Vector2 operator-(const Vector2 &a, const Vector2 &b)
+{
 	return Vector2(a.x - b.x, a.y - b.y);
 }
 
-static Vector2 operator*(const Vector2 &v, float s) {
+static Vector2 operator*(const Vector2 &v, float s)
+{
 	return Vector2(v.x * s, v.y * s);
 }
 
-static float dot(const Vector2 &a, const Vector2 &b) {
+static float dot(const Vector2 &a, const Vector2 &b)
+{
 	return a.x * b.x + a.y * b.y;
 }
 
-static float lengthSquared(const Vector2 &v) {
+static float lengthSquared(const Vector2 &v)
+{
 	return v.x * v.x + v.y * v.y;
 }
 
-static float length(const Vector2 &v) {
+static float length(const Vector2 &v)
+{
 	return sqrtf(lengthSquared(v));
 }
 
 #if XA_DEBUG
-static bool isNormalized(const Vector2 &v, float epsilon = kNormalEpsilon) {
+static bool isNormalized(const Vector2 &v, float epsilon = kNormalEpsilon)
+{
 	return equal(length(v), 1, epsilon);
 }
 #endif
 
-static Vector2 normalize(const Vector2 &v, float epsilon) {
+static Vector2 normalize(const Vector2 &v, float epsilon)
+{
 	float l = length(v);
 	XA_DEBUG_ASSERT(!isZero(l, epsilon));
 	XA_UNUSED(epsilon);
@@ -599,30 +633,36 @@ static Vector2 normalize(const Vector2 &v, float epsilon) {
 	return n;
 }
 
-static Vector2 normalizeSafe(const Vector2 &v, const Vector2 &fallback, float epsilon) {
+static Vector2 normalizeSafe(const Vector2 &v, const Vector2 &fallback, float epsilon)
+{
 	float l = length(v);
 	if (isZero(l, epsilon))
 		return fallback;
 	return v * (1.0f / l);
 }
 
-static bool equal(const Vector2 &v1, const Vector2 &v2, float epsilon) {
+static bool equal(const Vector2 &v1, const Vector2 &v2, float epsilon)
+{
 	return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
 }
 
-static Vector2 min(const Vector2 &a, const Vector2 &b) {
+static Vector2 min(const Vector2 &a, const Vector2 &b)
+{
 	return Vector2(min(a.x, b.x), min(a.y, b.y));
 }
 
-static Vector2 max(const Vector2 &a, const Vector2 &b) {
+static Vector2 max(const Vector2 &a, const Vector2 &b)
+{
 	return Vector2(max(a.x, b.x), max(a.y, b.y));
 }
 
-static bool isFinite(const Vector2 &v) {
+static bool isFinite(const Vector2 &v)
+{
 	return isFinite(v.x) && isFinite(v.y);
 }
 
-static float triangleArea(const Vector2 &a, const Vector2 &b, const Vector2 &c) {
+static float triangleArea(const Vector2 &a, const Vector2 &b, const Vector2 &c)
+{
 	// IC: While it may be appealing to use the following expression:
 	//return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y) * 0.5f;
 	// That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point
@@ -636,7 +676,8 @@ static float triangleArea(const Vector2 &a, const Vector2 &b, const Vector2 &c)
 	return (v0.x * v1.y - v0.y * v1.x) * 0.5f;
 }
 
-static bool linesIntersect(const Vector2 &a1, const Vector2 &a2, const Vector2 &b1, const Vector2 &b2, float epsilon) {
+static bool linesIntersect(const Vector2 &a1, const Vector2 &a2, const Vector2 &b1, const Vector2 &b2, float epsilon)
+{
 	const Vector2 v0 = a2 - a1;
 	const Vector2 v1 = b2 - b1;
 	const float denom = -v1.x * v0.y + v0.x * v1.y;
@@ -644,70 +685,76 @@ static bool linesIntersect(const Vector2 &a1, const Vector2 &a2, const Vector2 &
 		return false;
 	const float s = (-v0.y * (a1.x - b1.x) + v0.x * (a1.y - b1.y)) / denom;
 	if (s > epsilon && s < 1.0f - epsilon) {
-		const float t = (v1.x * (a1.y - b1.y) - v1.y * (a1.x - b1.x)) / denom;
+		const float t = ( v1.x * (a1.y - b1.y) - v1.y * (a1.x - b1.x)) / denom;
 		return t > epsilon && t < 1.0f - epsilon;
 	}
 	return false;
 }
 
-struct Vector2i {
+struct Vector2i
+{
 	Vector2i() {}
-	Vector2i(int32_t x, int32_t y) :
-			x(x), y(y) {}
+	Vector2i(int32_t x, int32_t y) : x(x), y(y) {}
 
 	int32_t x, y;
 };
 
-class Vector3 {
+class Vector3
+{
 public:
 	Vector3() {}
-	explicit Vector3(float f) :
-			x(f), y(f), z(f) {}
-	Vector3(float x, float y, float z) :
-			x(x), y(y), z(z) {}
-	Vector3(const Vector2 &v, float z) :
-			x(v.x), y(v.y), z(z) {}
-
-	Vector2 xy() const {
+	explicit Vector3(float f) : x(f), y(f), z(f) {}
+	Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
+	Vector3(const Vector2 &v, float z) : x(v.x), y(v.y), z(z) {}
+
+	Vector2 xy() const
+	{
 		return Vector2(x, y);
 	}
 
-	Vector3 operator-() const {
+	Vector3 operator-() const
+	{
 		return Vector3(-x, -y, -z);
 	}
 
-	void operator+=(const Vector3 &v) {
+	void operator+=(const Vector3 &v)
+	{
 		x += v.x;
 		y += v.y;
 		z += v.z;
 	}
 
-	void operator-=(const Vector3 &v) {
+	void operator-=(const Vector3 &v)
+	{
 		x -= v.x;
 		y -= v.y;
 		z -= v.z;
 	}
 
-	void operator*=(float s) {
+	void operator*=(float s)
+	{
 		x *= s;
 		y *= s;
 		z *= s;
 	}
 
-	void operator/=(float s) {
+	void operator/=(float s)
+	{
 		float is = 1.0f / s;
 		x *= is;
 		y *= is;
 		z *= is;
 	}
 
-	void operator*=(const Vector3 &v) {
+	void operator*=(const Vector3 &v)
+	{
 		x *= v.x;
 		y *= v.y;
 		z *= v.z;
 	}
 
-	void operator/=(const Vector3 &v) {
+	void operator/=(const Vector3 &v)
+	{
 		x /= v.x;
 		y /= v.y;
 		z /= v.z;
@@ -716,43 +763,53 @@ public:
 	float x, y, z;
 };
 
-static Vector3 operator+(const Vector3 &a, const Vector3 &b) {
+static Vector3 operator+(const Vector3 &a, const Vector3 &b)
+{
 	return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
 }
 
-static Vector3 operator-(const Vector3 &a, const Vector3 &b) {
+static Vector3 operator-(const Vector3 &a, const Vector3 &b)
+{
 	return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
 }
 
-static Vector3 cross(const Vector3 &a, const Vector3 &b) {
+static Vector3 cross(const Vector3 &a, const Vector3 &b)
+{
 	return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
 }
 
-static Vector3 operator*(const Vector3 &v, float s) {
+static Vector3 operator*(const Vector3 &v, float s)
+{
 	return Vector3(v.x * s, v.y * s, v.z * s);
 }
 
-static Vector3 operator/(const Vector3 &v, float s) {
+static Vector3 operator/(const Vector3 &v, float s)
+{
 	return v * (1.0f / s);
 }
 
-static float dot(const Vector3 &a, const Vector3 &b) {
+static float dot(const Vector3 &a, const Vector3 &b)
+{
 	return a.x * b.x + a.y * b.y + a.z * b.z;
 }
 
-static float lengthSquared(const Vector3 &v) {
+static float lengthSquared(const Vector3 &v)
+{
 	return v.x * v.x + v.y * v.y + v.z * v.z;
 }
 
-static float length(const Vector3 &v) {
+static float length(const Vector3 &v)
+{
 	return sqrtf(lengthSquared(v));
 }
 
-static bool isNormalized(const Vector3 &v, float epsilon = kNormalEpsilon) {
+static bool isNormalized(const Vector3 &v, float epsilon = kNormalEpsilon)
+{
 	return equal(length(v), 1, epsilon);
 }
 
-static Vector3 normalize(const Vector3 &v, float epsilon) {
+static Vector3 normalize(const Vector3 &v, float epsilon)
+{
 	float l = length(v);
 	XA_DEBUG_ASSERT(!isZero(l, epsilon));
 	XA_UNUSED(epsilon);
@@ -761,7 +818,8 @@ static Vector3 normalize(const Vector3 &v, float epsilon) {
 	return n;
 }
 
-static Vector3 normalizeSafe(const Vector3 &v, const Vector3 &fallback, float epsilon) {
+static Vector3 normalizeSafe(const Vector3 &v, const Vector3 &fallback, float epsilon)
+{
 	float l = length(v);
 	if (isZero(l, epsilon)) {
 		return fallback;
@@ -769,59 +827,80 @@ static Vector3 normalizeSafe(const Vector3 &v, const Vector3 &fallback, float ep
 	return v * (1.0f / l);
 }
 
-static bool equal(const Vector3 &v0, const Vector3 &v1, float epsilon) {
+static bool equal(const Vector3 &v0, const Vector3 &v1, float epsilon)
+{
 	return fabs(v0.x - v1.x) <= epsilon && fabs(v0.y - v1.y) <= epsilon && fabs(v0.z - v1.z) <= epsilon;
 }
 
-static Vector3 min(const Vector3 &a, const Vector3 &b) {
+static Vector3 min(const Vector3 &a, const Vector3 &b)
+{
 	return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
 }
 
-static Vector3 max(const Vector3 &a, const Vector3 &b) {
+static Vector3 max(const Vector3 &a, const Vector3 &b)
+{
 	return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
 }
 
 #if XA_DEBUG
-bool isFinite(const Vector3 &v) {
+bool isFinite(const Vector3 &v)
+{
 	return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
 }
 #endif
 
-struct Extents2 {
+struct Extents2
+{
 	Vector2 min, max;
 
-	void reset() {
+	Extents2() {}
+	
+	Extents2(Vector2 p1, Vector2 p2) // Builds the extents spanning two points; p1 and p2 may be given in either order.
+	{
+		min = xatlas::internal::min(p1, p2); // Component-wise minimum of the two points.
+		max = xatlas::internal::max(p1, p2); // Component-wise maximum of the two points.
+	}
+
+	void reset()
+	{
 		min.x = min.y = FLT_MAX;
 		max.x = max.y = -FLT_MAX;
 	}
 
-	void add(Vector2 p) {
+	void add(Vector2 p)
+	{
 		min = xatlas::internal::min(min, p);
 		max = xatlas::internal::max(max, p);
 	}
 
-	Vector2 midpoint() const {
+	Vector2 midpoint() const
+	{
 		return Vector2(min.x + (max.x - min.x) * 0.5f, min.y + (max.y - min.y) * 0.5f);
 	}
 
-	static bool intersect(Extents2 e1, Extents2 e2) {
+	static bool intersect(const Extents2 &e1, const Extents2 &e2)
+	{
 		return e1.min.x <= e2.max.x && e1.max.x >= e2.min.x && e1.min.y <= e2.max.y && e1.max.y >= e2.min.y;
 	}
 };
 
-struct Plane {
+struct Plane
+{
 	Plane() = default;
-
-	Plane(const Vector3 &p1, const Vector3 &p2, const Vector3 &p3) {
+	
+	Plane(const Vector3 &p1, const Vector3 &p2, const Vector3 &p3)
+	{
 		normal = cross(p2 - p1, p3 - p1);
 		dist = dot(normal, p1);
 	}
 
-	float distance(const Vector3 &p) const {
+	float distance(const Vector3 &p) const
+	{
 		return dot(normal, p) - dist;
 	}
 
-	void normalize() {
+	void normalize()
+	{
 		const float len = length(normal);
 		if (len > 0.0f) {
 			const float il = 1.0f / len;
@@ -834,7 +913,8 @@ struct Plane {
 	float dist;
 };
 
-static bool lineIntersectsPoint(const Vector3 &point, const Vector3 &lineStart, const Vector3 &lineEnd, float *t, float epsilon) {
+static bool lineIntersectsPoint(const Vector3 &point, const Vector3 &lineStart, const Vector3 &lineEnd, float *t, float epsilon)
+{
 	float tt;
 	if (!t)
 		t = &tt;
@@ -851,19 +931,22 @@ static bool lineIntersectsPoint(const Vector3 &point, const Vector3 &lineStart,
 	return *t > kEpsilon && *t < 1.0f - kEpsilon;
 }
 
-static bool sameSide(const Vector3 &p1, const Vector3 &p2, const Vector3 &a, const Vector3 &b) {
+static bool sameSide(const Vector3 &p1, const Vector3 &p2, const Vector3 &a, const Vector3 &b)
+{
 	const Vector3 &ab = b - a;
 	return dot(cross(ab, p1 - a), cross(ab, p2 - a)) >= 0.0f;
 }
 
 // http://blackpawn.com/texts/pointinpoly/default.html
-static bool pointInTriangle(const Vector3 &p, const Vector3 &a, const Vector3 &b, const Vector3 &c) {
+static bool pointInTriangle(const Vector3 &p, const Vector3 &a, const Vector3 &b, const Vector3 &c)
+{
 	return sameSide(p, a, b, c) && sameSide(p, b, a, c) && sameSide(p, c, a, b);
 }
 
 #if XA_CLOSE_HOLES_CHECK_EDGE_INTERSECTION
 // https://en.wikipedia.org/wiki/M%C3%B6ller%E2%80%93Trumbore_intersection_algorithm
-static bool rayIntersectsTriangle(const Vector3 &rayOrigin, const Vector3 &rayDir, const Vector3 *tri, float *t) {
+static bool rayIntersectsTriangle(const Vector3 &rayOrigin, const Vector3 &rayDir, const Vector3 *tri, float *t)
+{
 	*t = 0.0f;
 	const Vector3 &edge1 = tri[1] - tri[0];
 	const Vector3 &edge2 = tri[2] - tri[0];
@@ -890,47 +973,50 @@ static bool rayIntersectsTriangle(const Vector3 &rayOrigin, const Vector3 &rayDi
 #endif
 
 // From Fast-BVH
-struct AABB {
-	AABB() :
-			min(FLT_MAX, FLT_MAX, FLT_MAX), max(-FLT_MAX, -FLT_MAX, -FLT_MAX) {}
-	AABB(const Vector3 &min, const Vector3 &max) :
-			min(min), max(max) {}
-	AABB(const Vector3 &p, float radius = 0.0f) :
-			min(p), max(p) {
-		if (radius > 0.0f) expand(radius);
-	}
+struct AABB
+{
+	AABB() : min(FLT_MAX, FLT_MAX, FLT_MAX), max(-FLT_MAX, -FLT_MAX, -FLT_MAX) {}
+	AABB(const Vector3 &min, const Vector3 &max) : min(min), max(max) { }
+	AABB(const Vector3 &p, float radius = 0.0f) : min(p), max(p) { if (radius > 0.0f) expand(radius); }
 
-	bool intersect(const AABB &other) const {
+	bool intersect(const AABB &other) const
+	{
 		return min.x <= other.max.x && max.x >= other.min.x && min.y <= other.max.y && max.y >= other.min.y && min.z <= other.max.z && max.z >= other.min.z;
 	}
 
-	void expandToInclude(const Vector3 &p) {
+	void expandToInclude(const Vector3 &p)
+	{
 		min = internal::min(min, p);
 		max = internal::max(max, p);
 	}
 
-	void expandToInclude(const AABB &aabb) {
+	void expandToInclude(const AABB &aabb)
+	{
 		min = internal::min(min, aabb.min);
 		max = internal::max(max, aabb.max);
 	}
 
-	void expand(float amount) {
+	void expand(float amount)
+	{
 		min -= Vector3(amount);
 		max += Vector3(amount);
 	}
 
-	Vector3 centroid() const {
+	Vector3 centroid() const
+	{
 		return min + (max - min) * 0.5f;
 	}
 
-	uint32_t maxDimension() const {
+	uint32_t maxDimension() const
+	{
 		const Vector3 extent = max - min;
 		uint32_t result = 0;
 		if (extent.y > extent.x) {
 			result = 1;
 			if (extent.z > extent.y)
 				result = 2;
-		} else if (extent.z > extent.x)
+		}
+		else if(extent.z > extent.x)
 			result = 2;
 		return result;
 	}
@@ -938,9 +1024,10 @@ struct AABB {
 	Vector3 min, max;
 };
 
-struct ArrayBase {
-	ArrayBase(uint32_t elementSize, int memTag = MemTag::Default) :
-			buffer(nullptr), elementSize(elementSize), size(0), capacity(0) {
+struct ArrayBase
+{
+	ArrayBase(uint32_t elementSize, int memTag = MemTag::Default) : buffer(nullptr), elementSize(elementSize), size(0), capacity(0)
+	{
 #if XA_DEBUG_HEAP
 		this->memTag = memTag;
 #else
@@ -948,26 +1035,31 @@ struct ArrayBase {
 #endif
 	}
 
-	~ArrayBase() {
+	~ArrayBase()
+	{
 		XA_FREE(buffer);
 	}
 
-	XA_INLINE void clear() {
+	XA_INLINE void clear()
+	{
 		size = 0;
 	}
 
-	void copyFrom(const uint8_t *data, uint32_t length) {
+	void copyFrom(const uint8_t *data, uint32_t length)
+	{
 		resize(length, true);
 		memcpy(buffer, data, length * elementSize);
 	}
 
-	void copyTo(ArrayBase &other) const {
+	void copyTo(ArrayBase &other) const
+	{
 		XA_DEBUG_ASSERT(elementSize == other.elementSize);
 		other.resize(size, true);
 		memcpy(other.buffer, buffer, size * elementSize);
 	}
 
-	void destroy() {
+	void destroy()
+	{
 		size = 0;
 		XA_FREE(buffer);
 		buffer = nullptr;
@@ -976,7 +1068,8 @@ struct ArrayBase {
 	}
 
 	// Insert the given element at the given index shifting all the elements up.
-	void insertAt(uint32_t index, const uint8_t *value) {
+	void insertAt(uint32_t index, const uint8_t *value)
+	{
 		XA_DEBUG_ASSERT(index >= 0 && index <= size);
 		resize(size + 1, false);
 		if (index < size - 1)
@@ -984,7 +1077,8 @@ struct ArrayBase {
 		memcpy(&buffer[index * elementSize], value, elementSize);
 	}
 
-	void moveTo(ArrayBase &other) {
+	void moveTo(ArrayBase &other)
+	{
 		XA_DEBUG_ASSERT(elementSize == other.elementSize);
 		other.destroy();
 		other.buffer = buffer;
@@ -998,18 +1092,21 @@ struct ArrayBase {
 		elementSize = size = capacity = 0;
 	}
 
-	void pop_back() {
+	void pop_back()
+	{
 		XA_DEBUG_ASSERT(size > 0);
 		resize(size - 1, false);
 	}
 
-	void push_back(const uint8_t *value) {
+	void push_back(const uint8_t *value)
+	{
 		XA_DEBUG_ASSERT(value < buffer || value >= buffer + size);
 		resize(size + 1, false);
 		memcpy(&buffer[(size - 1) * elementSize], value, elementSize);
 	}
 
-	void push_back(const ArrayBase &other) {
+	void push_back(const ArrayBase &other)
+	{
 		XA_DEBUG_ASSERT(elementSize == other.elementSize);
 		if (other.size == 0)
 			return;
@@ -1019,19 +1116,31 @@ struct ArrayBase {
 	}
 
 	// Remove the element at the given index. This is an expensive operation!
-	void removeAt(uint32_t index) {
+	void removeAt(uint32_t index)
+	{
 		XA_DEBUG_ASSERT(index >= 0 && index < size);
 		if (size != 1)
 			memmove(buffer + elementSize * index, buffer + elementSize * (index + 1), elementSize * (size - 1 - index));
 		size--;
 	}
 
-	void reserve(uint32_t desiredSize) {
+	// The element at index is overwritten by the last element, then the array length is decremented. Element order is not preserved.
+	void removeAtFast(uint32_t index)
+	{
+		XA_DEBUG_ASSERT(index >= 0 && index < size);
+		if (size != 1 && index != size - 1) // Removing the last element needs no copy.
+			memcpy(buffer + elementSize * index, buffer + elementSize * (size - 1), elementSize);
+		size--;
+	}
+
+	void reserve(uint32_t desiredSize)
+	{
 		if (desiredSize > capacity)
 			setArrayCapacity(desiredSize);
 	}
 
-	void resize(uint32_t newSize, bool exact) {
+	void resize(uint32_t newSize, bool exact)
+	{
 		size = newSize;
 		if (size > capacity) {
 			// First allocation is always exact. Otherwise, following allocations grow array to 150% of desired size.
@@ -1044,7 +1153,8 @@ struct ArrayBase {
 		}
 	}
 
-	void setArrayCapacity(uint32_t newCapacity) {
+	void setArrayCapacity(uint32_t newCapacity)
+	{
 		XA_DEBUG_ASSERT(newCapacity >= size);
 		if (newCapacity == 0) {
 			// free the buffer.
@@ -1064,8 +1174,9 @@ struct ArrayBase {
 	}
 
 #if XA_DEBUG_HEAP
-	void setMemTag(int memTag) {
-		this->memTag = memTag;
+	void setMemTag(int _memTag)
+	{
+		this->memTag = _memTag;
 	}
 #endif
 
@@ -1078,25 +1189,28 @@ struct ArrayBase {
 #endif
 };
 
-template <typename T>
-class Array {
+template<typename T>
+class Array
+{
 public:
-	Array(int memTag = MemTag::Default) :
-			m_base(sizeof(T), memTag) {}
-	Array(const Array &) = delete;
+	Array(int memTag = MemTag::Default) : m_base(sizeof(T), memTag) {}
+	Array(const Array&) = delete;
 	Array &operator=(const Array &) = delete;
 
-	XA_INLINE const T &operator[](uint32_t index) const {
+	XA_INLINE const T &operator[](uint32_t index) const
+	{
 		XA_DEBUG_ASSERT(index < m_base.size);
 		return ((const T *)m_base.buffer)[index];
 	}
 
-	XA_INLINE T &operator[](uint32_t index) {
+	XA_INLINE T &operator[](uint32_t index)
+	{
 		XA_DEBUG_ASSERT(index < m_base.size);
 		return ((T *)m_base.buffer)[index];
 	}
 
-	XA_INLINE const T &back() const {
+	XA_INLINE const T &back() const
+	{
 		XA_DEBUG_ASSERT(!isEmpty());
 		return ((const T *)m_base.buffer)[m_base.size - 1];
 	}
@@ -1104,7 +1218,8 @@ public:
 	XA_INLINE T *begin() { return (T *)m_base.buffer; }
 	XA_INLINE void clear() { m_base.clear(); }
 
-	bool contains(const T &value) const {
+	bool contains(const T &value) const
+	{
 		for (uint32_t i = 0; i < m_base.size; i++) {
 			if (((const T *)m_base.buffer)[i] == value)
 				return true;
@@ -1116,6 +1231,7 @@ public:
 	void copyTo(Array &other) const { m_base.copyTo(other.m_base); }
 	XA_INLINE const T *data() const { return (const T *)m_base.buffer; }
 	XA_INLINE T *data() { return (T *)m_base.buffer; }
+	void destroy() { m_base.destroy(); }
 	XA_INLINE T *end() { return (T *)m_base.buffer + m_base.size; }
 	XA_INLINE bool isEmpty() const { return m_base.size == 0; }
 	void insertAt(uint32_t index, const T &value) { m_base.insertAt(index, (const uint8_t *)&value); }
@@ -1124,25 +1240,34 @@ public:
 	void push_back(const Array &other) { m_base.push_back(other.m_base); }
 	void pop_back() { m_base.pop_back(); }
 	void removeAt(uint32_t index) { m_base.removeAt(index); }
+	void removeAtFast(uint32_t index) { m_base.removeAtFast(index); }
 	void reserve(uint32_t desiredSize) { m_base.reserve(desiredSize); }
 	void resize(uint32_t newSize) { m_base.resize(newSize, true); }
 
-	void runCtors() {
+	void runCtors()
+	{
 		for (uint32_t i = 0; i < m_base.size; i++)
 			new (&((T *)m_base.buffer)[i]) T;
 	}
 
-	void runDtors() {
+	void runDtors()
+	{
 		for (uint32_t i = 0; i < m_base.size; i++)
 			((T *)m_base.buffer)[i].~T();
 	}
 
-	void setAll(const T &value) {
+	void fill(const T &value)
+	{
 		auto buffer = (T *)m_base.buffer;
 		for (uint32_t i = 0; i < m_base.size; i++)
 			buffer[i] = value;
 	}
 
+	void fillBytes(uint8_t value) // Sets every byte of the stored elements to value.
+	{
+		memset(m_base.buffer, (int)value, m_base.size * m_base.elementSize); // Byte count = element count * bytes per element.
+	}
+
 #if XA_DEBUG_HEAP
 	void setMemTag(int memTag) { m_base.setMemTag(memTag); }
 #endif
@@ -1154,47 +1279,35 @@ private:
 	ArrayBase m_base;
 };
 
-template <typename T>
-struct ArrayView {
-	ArrayView(Array<T> &a) :
-			data(a.data()), length(a.size()) {}
-	ArrayView(T *data, uint32_t length) :
-			data(data), length(length) {}
-	ArrayView &operator=(Array<T> &a) {
-		data = a.data();
-		length = a.size();
-		return *this;
-	}
-	XA_INLINE const T &operator[](uint32_t index) const {
-		XA_DEBUG_ASSERT(index < length);
-		return data[index];
-	}
+template<typename T>
+struct ArrayView // Pointer + length pair; holds no storage of its own.
+{
+	ArrayView() : data(nullptr), length(0) {} // Empty view.
+	ArrayView(Array<T> &a) : data(a.data()), length(a.size()) {} // View over all elements of a.
+	ArrayView(T *data, uint32_t length) : data(data), length(length) {}
+	ArrayView &operator=(Array<T> &a) { data = a.data(); length = a.size(); return *this; } // Re-points the view at a.
+	XA_INLINE const T &operator[](uint32_t index) const { XA_DEBUG_ASSERT(index < length); return data[index]; } // Bounds are checked by XA_DEBUG_ASSERT only.
 	T *data;
 	uint32_t length;
 };
 
-template <typename T>
-struct ConstArrayView {
-	ConstArrayView(const Array<T> &a) :
-			data(a.data()), length(a.size()) {}
-	ConstArrayView(const T *data, uint32_t length) :
-			data(data), length(length) {}
-	ConstArrayView &operator=(const Array<T> &a) {
-		data = a.data();
-		length = a.size();
-		return *this;
-	}
-	XA_INLINE const T &operator[](uint32_t index) const {
-		XA_DEBUG_ASSERT(index < length);
-		return data[index];
-	}
+template<typename T>
+struct ConstArrayView // Read-only pointer + length pair; holds no storage of its own.
+{
+	ConstArrayView() : data(nullptr), length(0) {} // Empty view.
+	ConstArrayView(const Array<T> &a) : data(a.data()), length(a.size()) {} // View over all elements of a.
+	ConstArrayView(const T *data, uint32_t length) : data(data), length(length) {}
+	ConstArrayView &operator=(const Array<T> &a) { data = a.data(); length = a.size(); return *this; } // Re-points the view at a.
+	XA_INLINE const T &operator[](uint32_t index) const { XA_DEBUG_ASSERT(index < length); return data[index]; } // Bounds are checked by XA_DEBUG_ASSERT only.
 	const T *data;
 	uint32_t length;
 };
 
 /// Basis class to compute tangent space basis, ortogonalizations and to transform vectors from one space to another.
-struct Basis {
-	XA_NODISCARD static Vector3 computeTangent(const Vector3 &normal) {
+struct Basis
+{
+	XA_NODISCARD static Vector3 computeTangent(const Vector3 &normal)
+	{
 		XA_ASSERT(isNormalized(normal));
 		// Choose minimum axis.
 		Vector3 tangent;
@@ -1210,7 +1323,8 @@ struct Basis {
 		return tangent;
 	}
 
-	XA_NODISCARD static Vector3 computeBitangent(const Vector3 &normal, const Vector3 &tangent) {
+	XA_NODISCARD static Vector3 computeBitangent(const Vector3 &normal, const Vector3 &tangent)
+	{
 		return cross(normal, tangent);
 	}
 
@@ -1220,31 +1334,42 @@ struct Basis {
 };
 
 // Simple bit array.
-class BitArray {
+class BitArray
+{
 public:
-	BitArray() :
-			m_size(0) {}
+	BitArray() : m_size(0) {}
 
-	BitArray(uint32_t sz) {
+	BitArray(uint32_t sz)
+	{
 		resize(sz);
 	}
 
-	void resize(uint32_t new_size) {
+	void resize(uint32_t new_size)
+	{
 		m_size = new_size;
 		m_wordArray.resize((m_size + 31) >> 5);
 	}
 
-	bool get(uint32_t index) const {
+	bool get(uint32_t index) const
+	{
 		XA_DEBUG_ASSERT(index < m_size);
 		return (m_wordArray[index >> 5] & (1 << (index & 31))) != 0;
 	}
 
-	void set(uint32_t index) {
+	void set(uint32_t index)
+	{
 		XA_DEBUG_ASSERT(index < m_size);
 		m_wordArray[index >> 5] |= (1 << (index & 31));
 	}
 
-	void zeroOutMemory() {
+	void unset(uint32_t index) // Clears the bit at index.
+	{
+		XA_DEBUG_ASSERT(index < m_size);
+		m_wordArray[index >> 5] &= ~(1 << (index & 31)); // Word = index / 32, bit within the word = index % 32.
+	}
+
+	void zeroOutMemory()
+	{
 		m_wordArray.zeroOutMemory();
 	}
 
@@ -1253,13 +1378,13 @@ private:
 	Array<uint32_t> m_wordArray;
 };
 
-class BitImage {
+class BitImage
+{
 public:
-	BitImage() :
-			m_width(0), m_height(0), m_rowStride(0), m_data(MemTag::BitImage) {}
+	BitImage() : m_width(0), m_height(0), m_rowStride(0), m_data(MemTag::BitImage) {}
 
-	BitImage(uint32_t w, uint32_t h) :
-			m_width(w), m_height(h), m_data(MemTag::BitImage) {
+	BitImage(uint32_t w, uint32_t h) : m_width(w), m_height(h), m_data(MemTag::BitImage)
+	{
 		m_rowStride = (m_width + 63) >> 6;
 		m_data.resize(m_rowStride * m_height);
 		m_data.zeroOutMemory();
@@ -1270,14 +1395,16 @@ public:
 	uint32_t width() const { return m_width; }
 	uint32_t height() const { return m_height; }
 
-	void copyTo(BitImage &other) {
+	void copyTo(BitImage &other)
+	{
 		other.m_width = m_width;
 		other.m_height = m_height;
 		other.m_rowStride = m_rowStride;
 		m_data.copyTo(other.m_data);
 	}
 
-	void resize(uint32_t w, uint32_t h, bool discard) {
+	void resize(uint32_t w, uint32_t h, bool discard)
+	{
 		const uint32_t rowStride = (w + 63) >> 6;
 		if (discard) {
 			m_data.resize(rowStride * h);
@@ -1301,24 +1428,28 @@ public:
 		m_rowStride = rowStride;
 	}
 
-	bool get(uint32_t x, uint32_t y) const {
+	bool get(uint32_t x, uint32_t y) const
+	{
 		XA_DEBUG_ASSERT(x < m_width && y < m_height);
 		const uint32_t index = (x >> 6) + y * m_rowStride;
 		return (m_data[index] & (UINT64_C(1) << (uint64_t(x) & UINT64_C(63)))) != 0;
 	}
 
-	void set(uint32_t x, uint32_t y) {
+	void set(uint32_t x, uint32_t y)
+	{
 		XA_DEBUG_ASSERT(x < m_width && y < m_height);
 		const uint32_t index = (x >> 6) + y * m_rowStride;
 		m_data[index] |= UINT64_C(1) << (uint64_t(x) & UINT64_C(63));
 		XA_DEBUG_ASSERT(get(x, y));
 	}
 
-	void zeroOutMemory() {
+	void zeroOutMemory()
+	{
 		m_data.zeroOutMemory();
 	}
 
-	bool canBlit(const BitImage &image, uint32_t offsetX, uint32_t offsetY) const {
+	bool canBlit(const BitImage &image, uint32_t offsetX, uint32_t offsetY) const
+	{
 		for (uint32_t y = 0; y < image.m_height; y++) {
 			const uint32_t thisY = y + offsetY;
 			if (thisY >= m_height)
@@ -1342,7 +1473,8 @@ public:
 		return true;
 	}
 
-	void dilate(uint32_t padding) {
+	void dilate(uint32_t padding)
+	{
 		BitImage tmp(m_width, m_height);
 		for (uint32_t p = 0; p < padding; p++) {
 			tmp.zeroOutMemory();
@@ -1379,10 +1511,11 @@ private:
 };
 
 // From Fast-BVH
-class BVH {
+class BVH
+{
 public:
-	BVH(const Array<AABB> &objectAabbs, uint32_t leafSize = 4) :
-			m_objectIds(MemTag::BVH), m_nodes(MemTag::BVH) {
+	BVH(const Array<AABB> &objectAabbs, uint32_t leafSize = 4) : m_objectIds(MemTag::BVH), m_nodes(MemTag::BVH)
+	{
 		m_objectAabbs = &objectAabbs;
 		if (m_objectAabbs->isEmpty())
 			return;
@@ -1402,7 +1535,7 @@ public:
 		Node node;
 		m_nodes.reserve(objectAabbs.size() * 2);
 		uint32_t nNodes = 0;
-		while (stackptr > 0) {
+		while(stackptr > 0) {
 			// Pop the next item off of the stack
 			const BuildEntry &bnode = todo[--stackptr];
 			const uint32_t start = bnode.start;
@@ -1415,7 +1548,7 @@ public:
 			// Calculate the bounding box for this node
 			AABB bb(objectAabbs[m_objectIds[start]]);
 			AABB bc(objectAabbs[m_objectIds[start]].centroid());
-			for (uint32_t p = start + 1; p < end; ++p) {
+			for(uint32_t p = start + 1; p < end; ++p) {
 				bb.expandToInclude(objectAabbs[m_objectIds[p]]);
 				bc.expandToInclude(objectAabbs[m_objectIds[p]].centroid());
 			}
@@ -1431,7 +1564,7 @@ public:
 				m_nodes[bnode.parent].rightOffset--;
 				// When this is the second touch, this is the right child.
 				// The right child sets up the offset for the flat tree.
-				if (m_nodes[bnode.parent].rightOffset == kTouchedTwice)
+				if (m_nodes[bnode.parent].rightOffset == kTouchedTwice )
 					m_nodes[bnode.parent].rightOffset = nNodes - 1 - bnode.parent;
 			}
 			// If this is a leaf, no need to subdivide.
@@ -1466,20 +1599,21 @@ public:
 		}
 	}
 
-	void query(const AABB &queryAabb, Array<uint32_t> &result) const {
+	void query(const AABB &queryAabb, Array<uint32_t> &result) const
+	{
 		result.clear();
 		// Working set
 		uint32_t todo[64];
 		int32_t stackptr = 0;
 		// "Push" on the root node to the working set
 		todo[stackptr] = 0;
-		while (stackptr >= 0) {
+		while(stackptr >= 0) {
 			// Pop off the next node to work on.
 			const int ni = todo[stackptr--];
 			const Node &node = m_nodes[ni];
 			// Is leaf -> Intersect
 			if (node.rightOffset == 0) {
-				for (uint32_t o = 0; o < node.nPrims; ++o) {
+				for(uint32_t o = 0; o < node.nPrims; ++o) {
 					const uint32_t obj = node.start + o;
 					if (queryAabb.intersect((*m_objectAabbs)[m_objectIds[obj]]))
 						result.push_back(m_objectIds[obj]);
@@ -1496,12 +1630,14 @@ public:
 	}
 
 private:
-	struct BuildEntry {
+	struct BuildEntry
+	{
 		uint32_t parent; // If non-zero then this is the index of the parent. (used in offsets)
 		uint32_t start, end; // The range of objects in the object list covered by this node.
 	};
 
-	struct Node {
+	struct Node
+	{
 		AABB aabb;
 		uint32_t start, nPrims, rightOffset;
 	};
@@ -1511,8 +1647,10 @@ private:
 	Array<Node> m_nodes;
 };
 
-struct Fit {
-	static bool computeBasis(const Vector3 *points, uint32_t pointsCount, Basis *basis) {
+struct Fit
+{
+	static bool computeBasis(const Vector3 *points, uint32_t pointsCount, Basis *basis)
+	{
 		if (computeLeastSquaresNormal(points, pointsCount, &basis->normal)) {
 			basis->tangent = Basis::computeTangent(basis->normal);
 			basis->bitangent = Basis::computeBitangent(basis->normal, basis->tangent);
@@ -1526,7 +1664,8 @@ private:
 	// Fast, and accurate to within a few degrees.
 	// Returns None if the points do not span a plane.
 	// https://www.ilikebigbits.com/2015_03_04_plane_from_points.html
-	static bool computeLeastSquaresNormal(const Vector3 *points, uint32_t pointsCount, Vector3 *normal) {
+	static bool computeLeastSquaresNormal(const Vector3 *points, uint32_t pointsCount, Vector3 *normal)
+	{
 		XA_DEBUG_ASSERT(pointsCount >= 3);
 		if (pointsCount == 3) {
 			*normal = normalize(cross(points[2] - points[0], points[1] - points[0]), kEpsilon);
@@ -1591,7 +1730,7 @@ private:
 		// Pick path with best conditioning:
 		Vector3 dir(0.0f);
 		if (det_max == det_x)
-			dir = Vector3(det_x, xz * yz - xy * zz, xy * yz - xz * yy);
+			dir = Vector3(det_x,xz * yz - xy * zz,xy * yz - xz * yy);
 		else if (det_max == det_y)
 			dir = Vector3(xz * yz - xy * zz, det_y, xy * xz - yz * xx);
 		else if (det_max == det_z)
@@ -1604,7 +1743,8 @@ private:
 		return isNormalized(*normal);
 	}
 
-	static bool computeEigen(const Vector3 *points, uint32_t pointsCount, Basis *basis) {
+	static bool computeEigen(const Vector3 *points, uint32_t pointsCount, Basis *basis)
+	{
 		float matrix[6];
 		computeCovariance(pointsCount, points, matrix);
 		if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
@@ -1619,7 +1759,8 @@ private:
 		return true;
 	}
 
-	static Vector3 computeCentroid(int n, const Vector3 *points) {
+	static Vector3 computeCentroid(int n, const Vector3 * points)
+	{
 		Vector3 centroid(0.0f);
 		for (int i = 0; i < n; i++) {
 			centroid += points[i];
@@ -1628,7 +1769,8 @@ private:
 		return centroid;
 	}
 
-	static Vector3 computeCovariance(int n, const Vector3 *points, float *covariance) {
+	static Vector3 computeCovariance(int n, const Vector3 * points, float * covariance)
+	{
 		// compute the centroid
 		Vector3 centroid = computeCentroid(n, points);
 		// compute covariance matrix
@@ -1650,7 +1792,8 @@ private:
 	// Tridiagonal solver from Charles Bloom.
 	// Householder transforms followed by QL decomposition.
 	// Seems to be based on the code from Numerical Recipes in C.
-	static bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]) {
+	static bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
+	{
 		XA_DEBUG_ASSERT(matrix != nullptr && eigenValues != nullptr && eigenVectors != nullptr);
 		float subd[3];
 		float diag[3];
@@ -1675,7 +1818,7 @@ private:
 		// eigenvectors are the columns; make them the rows :
 		for (int i = 0; i < 3; i++) {
 			for (int j = 0; j < 3; j++) {
-				(&eigenVectors[j].x)[i] = (float)work[i][j];
+				(&eigenVectors[j].x)[i] = (float) work[i][j];
 			}
 		}
 		// shuffle to sort by singular value :
@@ -1697,7 +1840,8 @@ private:
 	}
 
 private:
-	static void EigenSolver3_Tridiagonal(float mat[3][3], float *diag, float *subd) {
+	static void EigenSolver3_Tridiagonal(float mat[3][3], float *diag, float *subd)
+	{
 		// Householder reduction T = Q^t M Q
 		//   Input:
 		//     mat, symmetric 3x3 matrix M
@@ -1749,7 +1893,8 @@ private:
 		}
 	}
 
-	static bool EigenSolver3_QLAlgorithm(float mat[3][3], float *diag, float *subd) {
+	static bool EigenSolver3_QLAlgorithm(float mat[3][3], float *diag, float *subd)
+	{
 		// QL iteration with implicit shifting to reduce matrix from tridiagonal
 		// to diagonal
 		const int maxiter = 32;
@@ -1759,21 +1904,21 @@ private:
 				int m;
 				for (m = ell; m <= 1; m++) {
 					float dd = fabsf(diag[m]) + fabsf(diag[m + 1]);
-					if (fabsf(subd[m]) + dd == dd)
+					if ( fabsf(subd[m]) + dd == dd )
 						break;
 				}
-				if (m == ell)
+				if ( m == ell )
 					break;
 				float g = (diag[ell + 1] - diag[ell]) / (2 * subd[ell]);
 				float r = sqrtf(g * g + 1);
-				if (g < 0)
+				if ( g < 0 )
 					g = diag[m] - diag[ell] + subd[ell] / (g - r);
 				else
 					g = diag[m] - diag[ell] + subd[ell] / (g + r);
 				float s = 1, c = 1, p = 0;
 				for (int i = m - 1; i >= ell; i--) {
 					float f = s * subd[i], b = c * subd[i];
-					if (fabsf(f) >= fabsf(g)) {
+					if ( fabsf(f) >= fabsf(g) ) {
 						c = g / f;
 						r = sqrtf(c * c + 1);
 						subd[i + 1] = f * r;
@@ -1799,7 +1944,7 @@ private:
 				subd[ell] = g;
 				subd[m] = 0;
 			}
-			if (iter == maxiter)
+			if ( iter == maxiter )
 				// should not get here under normal circumstances
 				return false;
 		}
@@ -1807,50 +1952,77 @@ private:
 	}
 };
 
-/// Fixed size vector class.
-class FullVector {
-public:
-	FullVector(uint32_t dim) :
-			m_array(MemTag::FullVector) { m_array.resize(dim); }
-	FullVector(const FullVector &v) :
-			m_array(MemTag::FullVector) { v.m_array.copyTo(m_array); }
-	FullVector &operator=(const FullVector &v) = delete;
-	XA_INLINE uint32_t dimension() const { return m_array.size(); }
-	XA_INLINE const float &operator[](uint32_t index) const { return m_array[index]; }
-	XA_INLINE float &operator[](uint32_t index) { return m_array[index]; }
-
-	void fill(float f) {
-		const uint32_t dim = dimension();
-		for (uint32_t i = 0; i < dim; i++)
-			m_array[i] = f;
+static uint32_t sdbmHash(const void *data_in, uint32_t size, uint32_t h = 5381) // Hashes size bytes starting at data_in; h is the starting hash value.
+{
+	const uint8_t *data = (const uint8_t *) data_in;
+	uint32_t i = 0;
+	while (i < size) {
+		h = (h << 16) + (h << 6) - h + (uint32_t ) data[i++]; // h * 65599 + byte (65599 = 2^16 + 2^6 - 1), wrapping at 32 bits.
 	}
+	return h;
+}
 
-private:
-	Array<float> m_array;
+template <typename T>
+static uint32_t hash(const T &t, uint32_t h = 5381) // Hashes the sizeof(T) raw bytes of t with sdbmHash, starting from h.
+{
+	return sdbmHash(&t, sizeof(T), h);
+}
+
+template <typename Key>
+struct Hash // Hash functor that forwards to hash(), i.e. hashes the key's raw bytes.
+{
+	uint32_t operator()(const Key &k) const { return hash(k); }
 };
 
-template <typename Key, typename H = Hash<Key>, typename E = Equal<Key>>
-class HashMap {
+template <typename Key>
+struct PassthroughHash // Hash functor that uses the key itself, cast to uint32_t, as the hash value.
+{
+	uint32_t operator()(const Key &k) const { return (uint32_t)k; }
+};
+
+template <typename Key>
+struct Equal // Key equality functor: compares two keys with operator==.
+{
+	bool operator()(const Key &k0, const Key &k1) const { return k0 == k1; }
+};
+
+template<typename Key, typename H = Hash<Key>, typename E = Equal<Key> >
+class HashMap
+{
 public:
-	HashMap(int memTag, uint32_t size) :
-			m_memTag(memTag), m_size(size), m_numSlots(0), m_slots(nullptr), m_keys(memTag), m_next(memTag) {
+	HashMap(int memTag, uint32_t size) : m_memTag(memTag), m_size(size), m_numSlots(0), m_slots(nullptr), m_keys(memTag), m_next(memTag)
+	{
 	}
 
-	~HashMap() {
+	~HashMap()
+	{
 		if (m_slots)
 			XA_FREE(m_slots);
 	}
 
-	void add(const Key &key) {
+	void destroy() // Frees the slot table and destroys the key and next arrays.
+	{
+		if (m_slots) {
+			XA_FREE(m_slots);
+			m_slots = nullptr; // Keeps ~HashMap() from freeing it again; add() calls alloc() when this is null.
+		}
+		m_keys.destroy();
+		m_next.destroy();
+	}
+
+	uint32_t add(const Key &key) // Appends key without checking for an existing equal key; returns the new entry's index.
+	{
 		if (!m_slots)
 			alloc();
 		const uint32_t hash = computeHash(key);
 		m_keys.push_back(key);
 		m_next.push_back(m_slots[hash]);
 		m_slots[hash] = m_next.size() - 1;
+		return m_keys.size() - 1; // Index of the key just pushed.
 	}
 
-	uint32_t get(const Key &key) const {
+	uint32_t get(const Key &key) const
+	{
 		if (!m_slots)
 			return UINT32_MAX;
 		const uint32_t hash = computeHash(key);
@@ -1864,7 +2036,8 @@ public:
 		return UINT32_MAX;
 	}
 
-	uint32_t getNext(uint32_t current) const {
+	uint32_t getNext(uint32_t current) const
+	{
 		uint32_t i = m_next[current];
 		E equal;
 		while (i != UINT32_MAX) {
@@ -1876,7 +2049,8 @@ public:
 	}
 
 private:
-	void alloc() {
+	void alloc()
+	{
 		XA_DEBUG_ASSERT(m_size > 0);
 		m_numSlots = nextPowerOfTwo(m_size);
 		auto minNumSlots = uint32_t(m_size * 1.3);
@@ -1889,7 +2063,8 @@ private:
 		m_next.reserve(m_size);
 	}
 
-	uint32_t computeHash(const Key &key) const {
+	uint32_t computeHash(const Key &key) const
+	{
 		H hash;
 		return hash(key) & (m_numSlots - 1);
 	}
@@ -1902,8 +2077,9 @@ private:
 	Array<uint32_t> m_next;
 };
 
-template <typename T>
-static void insertionSort(T *data, uint32_t length) {
+template<typename T>
+static void insertionSort(T *data, uint32_t length)
+{
 	for (int32_t i = 1; i < (int32_t)length; i++) {
 		T x = data[i];
 		int32_t j = i - 1;
@@ -1915,18 +2091,21 @@ static void insertionSort(T *data, uint32_t length) {
 	}
 }
 
-class KISSRng {
+class KISSRng
+{
 public:
 	KISSRng() { reset(); }
 
-	void reset() {
+	void reset()
+	{
 		x = 123456789;
 		y = 362436000;
 		z = 521288629;
 		c = 7654321;
 	}
 
-	uint32_t getRange(uint32_t range) {
+	uint32_t getRange(uint32_t range)
+	{
 		if (range == 0)
 			return 0;
 		x = 69069 * x + 12345;
@@ -1935,7 +2114,7 @@ public:
 		y ^= (y << 5);
 		uint64_t t = 698769069ULL * z + c;
 		c = (t >> 32);
-		return (x + y + (z = (uint32_t)t)) % range;
+		return (x + y + (z = (uint32_t)t)) % (range + 1);
 	}
 
 private:
@@ -1945,76 +2124,70 @@ private:
 // Based on Pierre Terdiman's and Michael Herf's source code.
 // http://www.codercorner.com/RadixSortRevisited.htm
 // http://www.stereopsis.com/radix.html
-class RadixSort {
+class RadixSort
+{
 public:
-	RadixSort() :
-			m_size(0), m_ranks(nullptr), m_ranks2(nullptr), m_validRanks(false) {}
-
-	~RadixSort() {
-		// Release everything
-		XA_FREE(m_ranks2);
-		XA_FREE(m_ranks);
-	}
-
-	RadixSort &sort(const float *input, uint32_t count) {
-		if (input == nullptr || count == 0) return *this;
-		// Resize lists if needed
-		if (count != m_size) {
-			if (count > m_size) {
-				m_ranks2 = XA_REALLOC(MemTag::Default, m_ranks2, uint32_t, count);
-				m_ranks = XA_REALLOC(MemTag::Default, m_ranks, uint32_t, count);
-			}
-			m_size = count;
-			m_validRanks = false;
+	void sort(const float *input, uint32_t count)
+	{
+		if (input == nullptr || count == 0) {
+			m_buffer1.clear();
+			m_buffer2.clear();
+			m_ranks = m_buffer1.data();
+			m_ranks2 = m_buffer2.data();
+			return;
 		}
-		if (count < 32) {
+		// Resize lists if needed
+		m_buffer1.resize(count);
+		m_buffer2.resize(count);
+		m_ranks = m_buffer1.data();
+		m_ranks2 = m_buffer2.data();
+		m_validRanks = false;
+		if (count < 32)
 			insertionSort(input, count);
-		} else {
+		else {
 			// @@ Avoid touching the input multiple times.
 			for (uint32_t i = 0; i < count; i++) {
-				FloatFlip((uint32_t &)input[i]);
+				floatFlip((uint32_t &)input[i]);
 			}
 			radixSort<uint32_t>((const uint32_t *)input, count);
 			for (uint32_t i = 0; i < count; i++) {
-				IFloatFlip((uint32_t &)input[i]);
+				ifloatFlip((uint32_t &)input[i]);
 			}
 		}
-		return *this;
 	}
 
-	RadixSort &sort(const Array<float> &input) {
-		return sort(input.data(), input.size());
+	void sort(const Array<float> &input)
+	{
+		sort(input.data(), input.size());
 	}
 
 	// Access to results. m_ranks is a list of indices in sorted order, i.e. in the order you may further process your data
-	const uint32_t *ranks() const {
-		XA_DEBUG_ASSERT(m_validRanks);
-		return m_ranks;
-	}
-
-	uint32_t *ranks() {
+	const uint32_t *ranks() const
+	{
 		XA_DEBUG_ASSERT(m_validRanks);
 		return m_ranks;
 	}
 
 private:
-	uint32_t m_size;
-	uint32_t *m_ranks;
-	uint32_t *m_ranks2;
+	uint32_t *m_ranks, *m_ranks2;
+	Array<uint32_t> m_buffer1, m_buffer2;
 	bool m_validRanks;
 
-	void FloatFlip(uint32_t &f) {
+	void floatFlip(uint32_t &f)
+	{
 		int32_t mask = (int32_t(f) >> 31) | 0x80000000; // Warren Hunt, Manchor Ko.
 		f ^= mask;
 	}
 
-	void IFloatFlip(uint32_t &f) {
+	void ifloatFlip(uint32_t &f)
+	{
 		uint32_t mask = ((f >> 31) - 1) | 0x80000000; // Michael Herf.
 		f ^= mask;
 	}
 
-	template <typename T>
-	void createHistograms(const T *buffer, uint32_t count, uint32_t *histogram) {
+	template<typename T>
+	void createHistograms(const T *buffer, uint32_t count, uint32_t *histogram)
+	{
 		const uint32_t bucketCount = sizeof(T); // (8 * sizeof(T)) / log2(radix)
 		// Init bucket pointers.
 		uint32_t *h[bucketCount];
@@ -2022,10 +2195,10 @@ private:
 			h[i] = histogram + 256 * i;
 		}
 		// Clear histograms.
-		memset(histogram, 0, 256 * bucketCount * sizeof(uint32_t));
+		memset(histogram, 0, 256 * bucketCount * sizeof(uint32_t ));
 		// @@ Add support for signed integers.
 		// Build histograms.
-		const uint8_t *p = (const uint8_t *)buffer; // @@ Does this break aliasing rules?
+		const uint8_t *p = (const uint8_t *)buffer;  // @@ Does this break aliasing rules?
 		const uint8_t *pe = p + count * sizeof(T);
 		while (p != pe) {
 			h[0][*p++]++, h[1][*p++]++, h[2][*p++]++, h[3][*p++]++;
@@ -2041,7 +2214,8 @@ private:
 	}
 
 	template <typename T>
-	void insertionSort(const T *input, uint32_t count) {
+	void insertionSort(const T *input, uint32_t count)
+	{
 		if (!m_validRanks) {
 			m_ranks[0] = 0;
 			for (uint32_t i = 1; i != count; ++i) {
@@ -2072,7 +2246,8 @@ private:
 	}
 
 	template <typename T>
-	void radixSort(const T *input, uint32_t count) {
+	void radixSort(const T *input, uint32_t count)
+	{
 		const uint32_t P = sizeof(T); // pass count
 		// Allocate histograms & offsets on the stack
 		uint32_t histogram[256 * P];
@@ -2090,8 +2265,7 @@ private:
 			}
 			// Create offsets
 			link[0] = m_ranks2;
-			for (uint32_t i = 1; i < 256; i++)
-				link[i] = link[i - 1] + h[i - 1];
+			for (uint32_t i = 1; i < 256; i++) link[i] = link[i - 1] + h[i - 1];
 			// Perform Radix Sort
 			if (!m_validRanks) {
 				for (uint32_t i = 0; i < count; i++) {
@@ -2109,30 +2283,33 @@ private:
 		}
 		// All values were equal, generate linear ranks.
 		if (!m_validRanks) {
-			for (uint32_t i = 0; i < count; i++) {
+			for (uint32_t i = 0; i < count; i++)
 				m_ranks[i] = i;
-			}
 			m_validRanks = true;
 		}
 	}
 };
 
 // Wrapping this in a class allows temporary arrays to be re-used.
-class BoundingBox2D {
+class BoundingBox2D
+{
 public:
 	Vector2 majorAxis, minorAxis, minCorner, maxCorner;
 
-	void clear() {
+	void clear()
+	{
 		m_boundaryVertices.clear();
 	}
 
-	void appendBoundaryVertex(Vector2 v) {
+	void appendBoundaryVertex(Vector2 v)
+	{
 		m_boundaryVertices.push_back(v);
 	}
 
 	// This should compute convex hull and use rotating calipers to find the best box. Currently it uses a brute force method.
 	// If vertices is null or vertexCount is 0, the boundary vertices are used.
-	void compute(const Vector2 *vertices = nullptr, uint32_t vertexCount = 0) {
+	void compute(const Vector2 *vertices = nullptr, uint32_t vertexCount = 0)
+	{
 		if (!vertices || vertexCount == 0) {
 			vertices = m_boundaryVertices.data();
 			vertexCount = m_boundaryVertices.size();
@@ -2180,13 +2357,13 @@ public:
 
 private:
 	// Compute the convex hull using Graham Scan.
-	void convexHull(const Vector2 *input, uint32_t inputCount, Array<Vector2> &output, float epsilon) {
+	void convexHull(const Vector2 *input, uint32_t inputCount, Array<Vector2> &output, float epsilon)
+	{
 		m_coords.resize(inputCount);
 		for (uint32_t i = 0; i < inputCount; i++)
 			m_coords[i] = input[i].x;
-		RadixSort radix;
-		radix.sort(m_coords);
-		const uint32_t *ranks = radix.ranks();
+		m_radix.sort(m_coords);
+		const uint32_t *ranks = m_radix.ranks();
 		m_top.clear();
 		m_bottom.clear();
 		m_top.reserve(inputCount);
@@ -2210,7 +2387,7 @@ private:
 		XA_DEBUG_ASSERT(m_top.size() >= 2);
 		output.push_back(m_top[0]);
 		output.push_back(m_top[1]);
-		for (uint32_t i = 2; i < m_top.size();) {
+		for (uint32_t i = 2; i < m_top.size(); ) {
 			Vector2 a = output[output.size() - 2];
 			Vector2 b = output[output.size() - 1];
 			Vector2 c = m_top[i];
@@ -2226,7 +2403,7 @@ private:
 		XA_DEBUG_ASSERT(m_bottom.size() >= 2);
 		output.push_back(m_bottom[1]);
 		// Filter bottom list.
-		for (uint32_t i = 2; i < m_bottom.size();) {
+		for (uint32_t i = 2; i < m_bottom.size(); ) {
 			Vector2 a = output[output.size() - 2];
 			Vector2 b = output[output.size() - 1];
 			Vector2 c = m_bottom[i];
@@ -2246,51 +2423,49 @@ private:
 	Array<Vector2> m_boundaryVertices;
 	Array<float> m_coords;
 	Array<Vector2> m_top, m_bottom, m_hull;
+	RadixSort m_radix;
 };
 
-static uint32_t meshEdgeFace(uint32_t edge) {
-	return edge / 3;
-}
-static uint32_t meshEdgeIndex0(uint32_t edge) {
-	return edge;
-}
+static uint32_t meshEdgeFace(uint32_t edge) { return edge / 3; }
+static uint32_t meshEdgeIndex0(uint32_t edge) { return edge; }
 
-static uint32_t meshEdgeIndex1(uint32_t edge) {
+static uint32_t meshEdgeIndex1(uint32_t edge)
+{
 	const uint32_t faceFirstEdge = edge / 3 * 3;
 	return faceFirstEdge + (edge - faceFirstEdge + 1) % 3;
 }
 
-struct MeshFlags {
-	enum {
-		HasFaceGroups = 1 << 0,
-		HasIgnoredFaces = 1 << 1,
-		HasNormals = 1 << 2
+struct MeshFlags
+{
+	enum
+	{
+		HasIgnoredFaces = 1<<0,
+		HasNormals = 1<<1
 	};
 };
 
 class Mesh;
 static void meshGetBoundaryLoops(const Mesh &mesh, Array<uint32_t> &boundaryLoops);
 
-class Mesh {
+class Mesh
+{
 public:
-	Mesh(float epsilon, uint32_t approxVertexCount, uint32_t approxFaceCount, uint32_t flags = 0, uint32_t id = UINT32_MAX) :
-			m_epsilon(epsilon), m_flags(flags), m_id(id), m_faceIgnore(MemTag::Mesh), m_ignoredFaceCount(0), m_indices(MemTag::MeshIndices), m_positions(MemTag::MeshPositions), m_normals(MemTag::MeshNormals), m_texcoords(MemTag::MeshTexcoords), m_faceGroups(MemTag::Mesh), m_faceGroupFirstFace(MemTag::Mesh), m_faceGroupNextFace(MemTag::Mesh), m_faceGroupFaceCounts(MemTag::Mesh), m_colocalVertexCount(0), m_nextColocalVertex(MemTag::MeshColocals), m_boundaryEdges(MemTag::MeshBoundaries), m_oppositeEdges(MemTag::MeshBoundaries), m_nextBoundaryEdges(MemTag::MeshBoundaries), m_edgeMap(MemTag::MeshEdgeMap, approxFaceCount * 3) {
+	Mesh(float epsilon, uint32_t approxVertexCount, uint32_t approxFaceCount, uint32_t flags = 0, uint32_t id = UINT32_MAX) : m_epsilon(epsilon), m_flags(flags), m_id(id), m_faceIgnore(MemTag::Mesh), m_indices(MemTag::MeshIndices), m_positions(MemTag::MeshPositions), m_normals(MemTag::MeshNormals), m_texcoords(MemTag::MeshTexcoords), m_nextColocalVertex(MemTag::MeshColocals), m_boundaryEdges(MemTag::MeshBoundaries), m_oppositeEdges(MemTag::MeshBoundaries), m_nextBoundaryEdges(MemTag::MeshBoundaries), m_edgeMap(MemTag::MeshEdgeMap, approxFaceCount * 3)
+	{
 		m_indices.reserve(approxFaceCount * 3);
 		m_positions.reserve(approxVertexCount);
 		m_texcoords.reserve(approxVertexCount);
-		if (m_flags & MeshFlags::HasFaceGroups)
-			m_faceGroups.reserve(approxFaceCount);
 		if (m_flags & MeshFlags::HasIgnoredFaces)
 			m_faceIgnore.reserve(approxFaceCount);
 		if (m_flags & MeshFlags::HasNormals)
 			m_normals.reserve(approxVertexCount);
 	}
 
-	static constexpr uint16_t kInvalidFaceGroup = UINT16_MAX;
 	uint32_t flags() const { return m_flags; }
 	uint32_t id() const { return m_id; }
 
-	void addVertex(const Vector3 &pos, const Vector3 &normal = Vector3(0.0f), const Vector2 &texcoord = Vector2(0.0f)) {
+	void addVertex(const Vector3 &pos, const Vector3 &normal = Vector3(0.0f), const Vector2 &texcoord = Vector2(0.0f))
+	{
 		XA_DEBUG_ASSERT(isFinite(pos));
 		m_positions.push_back(pos);
 		if (m_flags & MeshFlags::HasNormals)
@@ -2298,47 +2473,45 @@ public:
 		m_texcoords.push_back(texcoord);
 	}
 
-	struct AddFaceResult {
-		enum Enum {
+	struct AddFaceResult
+	{
+		enum Enum
+		{
 			OK,
 			DuplicateEdge = 1
 		};
 	};
 
-	AddFaceResult::Enum addFace(uint32_t v0, uint32_t v1, uint32_t v2, bool ignore = false, bool hashEdge = true) {
+	AddFaceResult::Enum addFace(uint32_t v0, uint32_t v1, uint32_t v2, bool ignore = false)
+	{
 		uint32_t indexArray[3];
 		indexArray[0] = v0;
 		indexArray[1] = v1;
 		indexArray[2] = v2;
-		return addFace(indexArray, ignore, hashEdge);
+		return addFace(indexArray, ignore);
 	}
 
-	AddFaceResult::Enum addFace(const uint32_t *indices, bool ignore = false, bool hashEdge = true) {
+	AddFaceResult::Enum addFace(const uint32_t *indices, bool ignore = false)
+	{
 		AddFaceResult::Enum result = AddFaceResult::OK;
-		if (m_flags & MeshFlags::HasFaceGroups)
-			m_faceGroups.push_back(kInvalidFaceGroup);
-		if (m_flags & MeshFlags::HasIgnoredFaces) {
+		if (m_flags & MeshFlags::HasIgnoredFaces)
 			m_faceIgnore.push_back(ignore);
-			if (ignore)
-				m_ignoredFaceCount++;
-		}
 		const uint32_t firstIndex = m_indices.size();
 		for (uint32_t i = 0; i < 3; i++)
 			m_indices.push_back(indices[i]);
-		if (hashEdge) {
-			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t vertex0 = m_indices[firstIndex + i];
-				const uint32_t vertex1 = m_indices[firstIndex + (i + 1) % 3];
-				const EdgeKey key(vertex0, vertex1);
-				if (m_edgeMap.get(key) != UINT32_MAX)
-					result = AddFaceResult::DuplicateEdge;
-				m_edgeMap.add(key);
-			}
+		for (uint32_t i = 0; i < 3; i++) {
+			const uint32_t vertex0 = m_indices[firstIndex + i];
+			const uint32_t vertex1 = m_indices[firstIndex + (i + 1) % 3];
+			const EdgeKey key(vertex0, vertex1);
+			if (m_edgeMap.get(key) != UINT32_MAX)
+				result = AddFaceResult::DuplicateEdge;
+			m_edgeMap.add(key);
 		}
 		return result;
 	}
 
-	void createColocals() {
+	void createColocals()
+	{
 		const uint32_t vertexCount = m_positions.size();
 		Array<AABB> aabbs(MemTag::BVH);
 		aabbs.resize(vertexCount);
@@ -2347,10 +2520,8 @@ public:
 		BVH bvh(aabbs);
 		Array<uint32_t> colocals(MemTag::MeshColocals);
 		Array<uint32_t> potential(MemTag::MeshColocals);
-		m_colocalVertexCount = 0;
 		m_nextColocalVertex.resize(vertexCount);
-		for (uint32_t i = 0; i < vertexCount; i++)
-			m_nextColocalVertex[i] = UINT32_MAX;
+		m_nextColocalVertex.fillBytes(0xff);
 		for (uint32_t i = 0; i < vertexCount; i++) {
 			if (m_nextColocalVertex[i] != UINT32_MAX)
 				continue; // Already linked.
@@ -2366,9 +2537,8 @@ public:
 			if (colocals.size() == 1) {
 				// No colocals for this vertex.
 				m_nextColocalVertex[i] = i;
-				continue;
+				continue; 
 			}
-			m_colocalVertexCount += colocals.size();
 			// Link in ascending order.
 			insertionSort(colocals.data(), colocals.size());
 			for (uint32_t j = 0; j < colocals.size(); j++)
@@ -2377,98 +2547,8 @@ public:
 		}
 	}
 
-	// Check if the face duplicates any edges of any face already in the group.
-	bool faceDuplicatesGroupEdge(uint16_t group, uint32_t face) const {
-		for (FaceEdgeIterator edgeIt(this, face); !edgeIt.isDone(); edgeIt.advance()) {
-			for (ColocalEdgeIterator colocalEdgeIt(this, edgeIt.vertex0(), edgeIt.vertex1()); !colocalEdgeIt.isDone(); colocalEdgeIt.advance()) {
-				if (m_faceGroups[meshEdgeFace(colocalEdgeIt.edge())] == group)
-					return true;
-			}
-		}
-		return false;
-	}
-
-	void createFaceGroups() {
-		uint32_t firstUnassignedFace = 0;
-		uint16_t group = 0;
-		Array<uint32_t> growFaces;
-		const uint32_t n = faceCount();
-		m_faceGroupNextFace.resize(n);
-		for (;;) {
-			// Find an unassigned face.
-			uint32_t face = UINT32_MAX;
-			for (uint32_t f = firstUnassignedFace; f < n; f++) {
-				if (m_faceGroups[f] == kInvalidFaceGroup && !isFaceIgnored(f)) {
-					face = f;
-					firstUnassignedFace = f + 1;
-					break;
-				}
-			}
-			if (face == UINT32_MAX)
-				break; // All faces assigned to a group (except ignored faces).
-			m_faceGroups[face] = group;
-			m_faceGroupNextFace[face] = UINT32_MAX;
-			m_faceGroupFirstFace.push_back(face);
-			growFaces.clear();
-			growFaces.push_back(face);
-			uint32_t prevFace = face, groupFaceCount = 1;
-			// Find faces connected to the face and assign them to the same group as the face, unless they are already assigned to another group.
-			for (;;) {
-				if (growFaces.isEmpty())
-					break;
-				const uint32_t f = growFaces.back();
-				growFaces.pop_back();
-				for (FaceEdgeIterator edgeIt(this, f); !edgeIt.isDone(); edgeIt.advance()) {
-					// Iterate opposite edges. There may be more than one - non-manifold geometry can have duplicate edges.
-					// Prioritize the one with exact vertex match, not just colocal.
-					// If *any* of the opposite edges are already assigned to this group, don't do anything.
-					bool alreadyAssignedToThisGroup = false;
-					uint32_t bestConnectedFace = UINT32_MAX;
-					for (ColocalEdgeIterator oppositeEdgeIt(this, edgeIt.vertex1(), edgeIt.vertex0()); !oppositeEdgeIt.isDone(); oppositeEdgeIt.advance()) {
-						const uint32_t oppositeEdge = oppositeEdgeIt.edge();
-						const uint32_t oppositeFace = meshEdgeFace(oppositeEdge);
-						if (isFaceIgnored(oppositeFace))
-							continue; // Don't add ignored faces to group.
-						if (m_faceGroups[oppositeFace] == group) {
-							alreadyAssignedToThisGroup = true;
-							break;
-						}
-						if (m_faceGroups[oppositeFace] != kInvalidFaceGroup)
-							continue; // Connected face is already assigned to another group.
-						if (faceDuplicatesGroupEdge(group, oppositeFace))
-							continue; // Don't want duplicate edges in a group.
-						const uint32_t oppositeVertex0 = m_indices[meshEdgeIndex0(oppositeEdge)];
-						const uint32_t oppositeVertex1 = m_indices[meshEdgeIndex1(oppositeEdge)];
-						if (bestConnectedFace == UINT32_MAX || (oppositeVertex0 == edgeIt.vertex1() && oppositeVertex1 == edgeIt.vertex0()))
-							bestConnectedFace = oppositeFace;
-#if 0
-						else {
-							// Choose the opposite face with the smallest dihedral angle.
-							const float d1 = 1.0f - dot(computeFaceNormal(f), computeFaceNormal(bestConnectedFace));
-							const float d2 = 1.0f - dot(computeFaceNormal(f), computeFaceNormal(oppositeFace));
-							if (d2 < d1)
-								bestConnectedFace = oppositeFace;
-						}
-#endif
-					}
-					if (!alreadyAssignedToThisGroup && bestConnectedFace != UINT32_MAX) {
-						m_faceGroups[bestConnectedFace] = group;
-						m_faceGroupNextFace[bestConnectedFace] = UINT32_MAX;
-						if (prevFace != UINT32_MAX)
-							m_faceGroupNextFace[prevFace] = bestConnectedFace;
-						prevFace = bestConnectedFace;
-						groupFaceCount++;
-						growFaces.push_back(bestConnectedFace);
-					}
-				}
-			}
-			m_faceGroupFaceCounts.push_back(groupFaceCount);
-			group++;
-			XA_ASSERT(group < kInvalidFaceGroup);
-		}
-	}
-
-	void createBoundaries() {
+	void createBoundaries()
+	{
 		const uint32_t edgeCount = m_indices.size();
 		const uint32_t vertexCount = m_positions.size();
 		m_oppositeEdges.resize(edgeCount);
@@ -2498,7 +2578,8 @@ public:
 		}
 	}
 
-	void linkBoundaries() {
+	void linkBoundaries()
+	{
 		const uint32_t edgeCount = m_indices.size();
 		HashMap<uint32_t> vertexToEdgeMap(MemTag::Mesh, edgeCount); // Edge is index / 2
 		for (uint32_t i = 0; i < edgeCount; i++) {
@@ -2591,7 +2672,8 @@ public:
 	}
 
 	/// Find edge, test all colocals.
-	uint32_t findEdge(uint32_t vertex0, uint32_t vertex1) const {
+	uint32_t findEdge(uint32_t vertex0, uint32_t vertex1) const
+	{
 		uint32_t result = UINT32_MAX;
 		if (m_nextColocalVertex.isEmpty()) {
 			EdgeKey key(vertex0, vertex1);
@@ -2629,8 +2711,18 @@ public:
 		return result;
 	}
 
+	// The edge map can be destroyed once it is no longer needed, in order to reduce memory usage. It's used by:
+	//   * Mesh::createBoundaries()
+	//   * Mesh::ColocalEdgeIterator (used by MeshFaceGroups)
+	//   * meshCloseHole()
+	void destroyEdgeMap()
+	{
+		m_edgeMap.destroy();
+	}
+
 #if XA_DEBUG_EXPORT_OBJ
-	void writeObjVertices(FILE *file) const {
+	void writeObjVertices(FILE *file) const
+	{
 		for (uint32_t i = 0; i < m_positions.size(); i++)
 			fprintf(file, "v %g %g %g\n", m_positions[i].x, m_positions[i].y, m_positions[i].z);
 		if (m_flags & MeshFlags::HasNormals) {
@@ -2641,15 +2733,17 @@ public:
 			fprintf(file, "vt %g %g\n", m_texcoords[i].x, m_texcoords[i].y);
 	}
 
-	void writeObjFace(FILE *file, uint32_t face) const {
+	void writeObjFace(FILE *file, uint32_t face, uint32_t offset = 0) const
+	{
 		fprintf(file, "f ");
 		for (uint32_t j = 0; j < 3; j++) {
-			const uint32_t index = m_indices[face * 3 + j] + 1; // 1-indexed
+			const uint32_t index = m_indices[face * 3 + j] + 1 + offset; // 1-indexed
 			fprintf(file, "%d/%d/%d%c", index, index, index, j == 2 ? '\n' : ' ');
 		}
 	}
 
-	void writeObjBoundaryEges(FILE *file) const {
+	void writeObjBoundaryEges(FILE *file) const
+	{
 		if (m_oppositeEdges.isEmpty())
 			return; // Boundaries haven't been created.
 		fprintf(file, "o boundary_edges\n");
@@ -2660,7 +2754,8 @@ public:
 		}
 	}
 
-	void writeObjLinkedBoundaries(FILE *file) const {
+	void writeObjLinkedBoundaries(FILE *file) const
+	{
 		if (m_oppositeEdges.isEmpty() || m_nextBoundaryEdges.isEmpty())
 			return; // Boundaries haven't been created and/or linked.
 		Array<uint32_t> boundaryLoops;
@@ -2682,7 +2777,8 @@ public:
 		}
 	}
 
-	void writeObjFile(const char *filename) const {
+	void writeObjFile(const char *filename) const
+	{
 		FILE *file;
 		XA_FOPEN(file, filename, "w");
 		if (!file)
@@ -2698,7 +2794,8 @@ public:
 	}
 #endif
 
-	float computeSurfaceArea() const {
+	float computeSurfaceArea() const
+	{
 		float area = 0;
 		for (uint32_t f = 0; f < faceCount(); f++)
 			area += computeFaceArea(f);
@@ -2706,21 +2803,25 @@ public:
 		return area;
 	}
 
-	float computeParametricArea() const {
+	// Sums the absolute parametric area of each face, so the result is never negative, even if some triangles are flipped.
+	float computeParametricArea() const
+	{
 		float area = 0;
 		for (uint32_t f = 0; f < faceCount(); f++)
-			area += computeFaceParametricArea(f);
-		return fabsf(area); // May be negative, depends on texcoord winding.
+			area += fabsf(computeFaceParametricArea(f)); // May be negative, depends on texcoord winding.
+		return area; 
 	}
 
-	float computeFaceArea(uint32_t face) const {
+	float computeFaceArea(uint32_t face) const
+	{
 		const Vector3 &p0 = m_positions[m_indices[face * 3 + 0]];
 		const Vector3 &p1 = m_positions[m_indices[face * 3 + 1]];
 		const Vector3 &p2 = m_positions[m_indices[face * 3 + 2]];
 		return length(cross(p1 - p0, p2 - p0)) * 0.5f;
 	}
 
-	Vector3 computeFaceCentroid(uint32_t face) const {
+	Vector3 computeFaceCentroid(uint32_t face) const
+	{
 		Vector3 sum(0.0f);
 		for (uint32_t i = 0; i < 3; i++)
 			sum += m_positions[m_indices[face * 3 + i]];
@@ -2729,7 +2830,8 @@ public:
 
 	// Average of the edge midpoints weighted by the edge length.
 	// I want a point inside the triangle, but closer to the cirumcenter.
-	Vector3 computeFaceCenter(uint32_t face) const {
+	Vector3 computeFaceCenter(uint32_t face) const
+	{
 		const Vector3 &p0 = m_positions[m_indices[face * 3 + 0]];
 		const Vector3 &p1 = m_positions[m_indices[face * 3 + 1]];
 		const Vector3 &p2 = m_positions[m_indices[face * 3 + 2]];
@@ -2742,7 +2844,8 @@ public:
 		return m0 + m1 + m2;
 	}
 
-	Vector3 computeFaceNormal(uint32_t face) const {
+	Vector3 computeFaceNormal(uint32_t face) const
+	{
 		const Vector3 &p0 = m_positions[m_indices[face * 3 + 0]];
 		const Vector3 &p1 = m_positions[m_indices[face * 3 + 1]];
 		const Vector3 &p2 = m_positions[m_indices[face * 3 + 2]];
@@ -2752,15 +2855,17 @@ public:
 		return normalizeSafe(normalAreaScaled, Vector3(0, 0, 1), 0.0f);
 	}
 
-	float computeFaceParametricArea(uint32_t face) const {
+	float computeFaceParametricArea(uint32_t face) const
+	{
 		const Vector2 &t0 = m_texcoords[m_indices[face * 3 + 0]];
 		const Vector2 &t1 = m_texcoords[m_indices[face * 3 + 1]];
 		const Vector2 &t2 = m_texcoords[m_indices[face * 3 + 2]];
 		return triangleArea(t0, t1, t2);
 	}
-
+	
 	// @@ This is not exactly accurate, we should compare the texture coordinates...
-	bool isSeam(uint32_t edge) const {
+	bool isSeam(uint32_t edge) const
+	{
 		const uint32_t oppositeEdge = m_oppositeEdges[edge];
 		if (oppositeEdge == UINT32_MAX)
 			return false; // boundary edge
@@ -2771,7 +2876,8 @@ public:
 		return m_indices[e0] != m_indices[oe1] || m_indices[e1] != m_indices[oe0];
 	}
 
-	bool isTextureSeam(uint32_t edge) const {
+	bool isTextureSeam(uint32_t edge) const
+	{
 		const uint32_t oppositeEdge = m_oppositeEdges[edge];
 		if (oppositeEdge == UINT32_MAX)
 			return false; // boundary edge
@@ -2782,7 +2888,8 @@ public:
 		return m_texcoords[m_indices[e0]] != m_texcoords[m_indices[oe1]] || m_texcoords[m_indices[e1]] != m_texcoords[m_indices[oe0]];
 	}
 
-	uint32_t firstColocal(uint32_t vertex) const {
+	uint32_t firstColocal(uint32_t vertex) const
+	{
 		for (ColocalVertexIterator it(this, vertex); !it.isDone(); it.advance()) {
 			if (it.vertex() < vertex)
 				vertex = it.vertex();
@@ -2790,7 +2897,8 @@ public:
 		return vertex;
 	}
 
-	bool areColocal(uint32_t vertex0, uint32_t vertex1) const {
+	bool areColocal(uint32_t vertex0, uint32_t vertex1) const
+	{
 		if (vertex0 == vertex1)
 			return true;
 		if (m_nextColocalVertex.isEmpty())
@@ -2808,60 +2916,32 @@ public:
 	XA_INLINE bool isBoundaryEdge(uint32_t edge) const { return m_oppositeEdges[edge] == UINT32_MAX; }
 	XA_INLINE const Array<uint32_t> &boundaryEdges() const { return m_boundaryEdges; }
 	XA_INLINE bool isBoundaryVertex(uint32_t vertex) const { return m_isBoundaryVertex.get(vertex); }
-	XA_INLINE uint32_t colocalVertexCount() const { return m_colocalVertexCount; }
 	XA_INLINE uint32_t vertexCount() const { return m_positions.size(); }
 	XA_INLINE uint32_t vertexAt(uint32_t i) const { return m_indices[i]; }
 	XA_INLINE const Vector3 &position(uint32_t vertex) const { return m_positions[vertex]; }
-	XA_INLINE const Vector3 &normal(uint32_t vertex) const {
-		XA_DEBUG_ASSERT(m_flags & MeshFlags::HasNormals);
-		return m_normals[vertex];
-	}
+	XA_INLINE const Vector3 *positions() const { return m_positions.data(); }
+	XA_INLINE const Vector3 &normal(uint32_t vertex) const { XA_DEBUG_ASSERT(m_flags & MeshFlags::HasNormals); return m_normals[vertex]; }
 	XA_INLINE const Vector2 &texcoord(uint32_t vertex) const { return m_texcoords[vertex]; }
 	XA_INLINE Vector2 &texcoord(uint32_t vertex) { return m_texcoords[vertex]; }
 	XA_INLINE const Vector2 *texcoords() const { return m_texcoords.data(); }
 	XA_INLINE Vector2 *texcoords() { return m_texcoords.data(); }
-	XA_INLINE uint32_t ignoredFaceCount() const { return m_ignoredFaceCount; }
 	XA_INLINE uint32_t faceCount() const { return m_indices.size() / 3; }
-	XA_INLINE uint16_t faceGroupAt(uint32_t face) const {
-		XA_DEBUG_ASSERT(m_flags & MeshFlags::HasFaceGroups);
-		return m_faceGroups[face];
-	}
-	XA_INLINE uint32_t faceGroupCount() const {
-		XA_DEBUG_ASSERT(m_flags & MeshFlags::HasFaceGroups);
-		return m_faceGroupFaceCounts.size();
-	}
-	XA_INLINE uint32_t faceGroupNextFace(uint32_t face) const {
-		XA_DEBUG_ASSERT(m_flags & MeshFlags::HasFaceGroups);
-		return m_faceGroupNextFace[face];
-	}
-	XA_INLINE uint32_t faceGroupFaceCount(uint32_t group) const {
-		XA_DEBUG_ASSERT(m_flags & MeshFlags::HasFaceGroups);
-		return m_faceGroupFaceCounts[group];
-	}
 	XA_INLINE const uint32_t *indices() const { return m_indices.data(); }
 	XA_INLINE uint32_t indexCount() const { return m_indices.size(); }
+	XA_INLINE bool isFaceIgnored(uint32_t face) const { return (m_flags & MeshFlags::HasIgnoredFaces) && m_faceIgnore[face]; }
 
 private:
-	bool isFaceIgnored(uint32_t face) const { return (m_flags & MeshFlags::HasIgnoredFaces) && m_faceIgnore[face]; }
 
 	float m_epsilon;
 	uint32_t m_flags;
 	uint32_t m_id;
 	Array<bool> m_faceIgnore;
-	uint32_t m_ignoredFaceCount;
 	Array<uint32_t> m_indices;
 	Array<Vector3> m_positions;
 	Array<Vector3> m_normals;
 	Array<Vector2> m_texcoords;
 
-	// Populated by createFaceGroups
-	Array<uint16_t> m_faceGroups;
-	Array<uint32_t> m_faceGroupFirstFace;
-	Array<uint32_t> m_faceGroupNextFace; // In: face. Out: the next face in the same group.
-	Array<uint32_t> m_faceGroupFaceCounts; // In: face group. Out: number of faces in the group.
-
 	// Populated by createColocals
-	uint32_t m_colocalVertexCount;
 	Array<uint32_t> m_nextColocalVertex; // In: vertex index. Out: the vertex index of the next colocal position.
 
 	// Populated by createBoundaries
@@ -2872,45 +2952,48 @@ private:
 	// Populated by linkBoundaries
 	Array<uint32_t> m_nextBoundaryEdges; // The index of the next boundary edge. UINT32_MAX if the edge is not a boundary edge.
 
-	struct EdgeKey {
-		EdgeKey() {}
-		EdgeKey(const EdgeKey &k) :
-				v0(k.v0), v1(k.v1) {}
-		EdgeKey(uint32_t v0, uint32_t v1) :
-				v0(v0), v1(v1) {}
+	struct EdgeKey
+	{
+		EdgeKey(const EdgeKey &k) : v0(k.v0), v1(k.v1) {}
+		EdgeKey(uint32_t v0, uint32_t v1) : v0(v0), v1(v1) {}
 		bool operator==(const EdgeKey &k) const { return v0 == k.v0 && v1 == k.v1; }
 
 		uint32_t v0;
 		uint32_t v1;
 	};
 
-	struct EdgeHash {
+	struct EdgeHash
+	{
 		uint32_t operator()(const EdgeKey &k) const { return k.v0 * 32768u + k.v1; }
 	};
 
 	HashMap<EdgeKey, EdgeHash> m_edgeMap;
 
 public:
-	class BoundaryLoopEdgeIterator {
+	class BoundaryLoopEdgeIterator
+	{
 	public:
-		BoundaryLoopEdgeIterator(const Mesh *mesh, uint32_t edge) :
-				m_mesh(mesh), m_first(UINT32_MAX), m_current(edge) {}
+		BoundaryLoopEdgeIterator(const Mesh *mesh, uint32_t edge) : m_mesh(mesh), m_first(UINT32_MAX), m_current(edge) {}
 
-		void advance() {
+		void advance()
+		{
 			if (m_first == UINT32_MAX)
 				m_first = m_current;
 			m_current = m_mesh->m_nextBoundaryEdges[m_current];
 		}
 
-		bool isDone() const {
+		bool isDone() const
+		{
 			return m_first == m_current || m_current == UINT32_MAX;
 		}
 
-		uint32_t edge() const {
+		uint32_t edge() const
+		{
 			return m_current;
 		}
 
-		uint32_t nextEdge() const {
+		uint32_t nextEdge() const
+		{
 			return m_mesh->m_nextBoundaryEdges[m_current];
 		}
 
@@ -2920,27 +3003,31 @@ public:
 		uint32_t m_current;
 	};
 
-	class ColocalVertexIterator {
+	class ColocalVertexIterator
+	{
 	public:
-		ColocalVertexIterator(const Mesh *mesh, uint32_t v) :
-				m_mesh(mesh), m_first(UINT32_MAX), m_current(v) {}
+		ColocalVertexIterator(const Mesh *mesh, uint32_t v) : m_mesh(mesh), m_first(UINT32_MAX), m_current(v) {}
 
-		void advance() {
+		void advance()
+		{
 			if (m_first == UINT32_MAX)
 				m_first = m_current;
 			if (!m_mesh->m_nextColocalVertex.isEmpty())
 				m_current = m_mesh->m_nextColocalVertex[m_current];
 		}
 
-		bool isDone() const {
+		bool isDone() const
+		{
 			return m_first == m_current;
 		}
 
-		uint32_t vertex() const {
+		uint32_t vertex() const
+		{
 			return m_current;
 		}
 
-		const Vector3 *pos() const {
+		const Vector3 *pos() const
+		{
 			return &m_mesh->m_positions[m_current];
 		}
 
@@ -2950,33 +3037,39 @@ public:
 		uint32_t m_current;
 	};
 
-	class ColocalEdgeIterator {
+	class ColocalEdgeIterator
+	{
 	public:
-		ColocalEdgeIterator(const Mesh *mesh, uint32_t vertex0, uint32_t vertex1) :
-				m_mesh(mesh), m_vertex0It(mesh, vertex0), m_vertex1It(mesh, vertex1), m_vertex1(vertex1) {
+		ColocalEdgeIterator(const Mesh *mesh, uint32_t vertex0, uint32_t vertex1) : m_mesh(mesh), m_vertex0It(mesh, vertex0), m_vertex1It(mesh, vertex1), m_vertex1(vertex1)
+		{
 			do {
 				if (!resetElement()) {
 					advanceVertex1();
-				} else {
+				}
+				else {
 					break;
 				}
 			} while (!isDone());
 		}
 
-		void advance() {
+		void advance()
+		{
 			advanceElement();
 		}
 
-		bool isDone() const {
+		bool isDone() const
+		{
 			return m_vertex0It.isDone() && m_vertex1It.isDone() && m_edge == UINT32_MAX;
 		}
 
-		uint32_t edge() const {
+		uint32_t edge() const
+		{
 			return m_edge;
 		}
 
 	private:
-		bool resetElement() {
+		bool resetElement()
+		{
 			m_edge = m_mesh->m_edgeMap.get(Mesh::EdgeKey(m_vertex0It.vertex(), m_vertex1It.vertex()));
 			while (m_edge != UINT32_MAX) {
 				if (!isIgnoredFace())
@@ -2989,7 +3082,8 @@ public:
 			return true;
 		}
 
-		void advanceElement() {
+		void advanceElement()
+		{
 			for (;;) {
 				m_edge = m_mesh->m_edgeMap.getNext(m_edge);
 				if (m_edge == UINT32_MAX)
@@ -3001,15 +3095,17 @@ public:
 				advanceVertex1();
 		}
 
-		void advanceVertex1() {
+		void advanceVertex1()
+		{
 			auto successful = false;
-			while (!successful) {
+			while (!successful)	{
 				m_vertex1It.advance();
 				if (m_vertex1It.isDone()) {
 					if (!m_vertex0It.isDone()) {
 						m_vertex0It.advance();
 						m_vertex1It = ColocalVertexIterator(m_mesh, m_vertex1);
-					} else {
+					}
+					else {
 						return;
 					}
 				}
@@ -3017,7 +3113,8 @@ public:
 			}
 		}
 
-		bool isIgnoredFace() const {
+		bool isIgnoredFace() const
+		{
 			return m_mesh->m_faceIgnore[meshEdgeFace(m_edge)];
 		}
 
@@ -3027,21 +3124,24 @@ public:
 		uint32_t m_edge;
 	};
 
-	class FaceEdgeIterator {
+	class FaceEdgeIterator 
+	{
 	public:
-		FaceEdgeIterator(const Mesh *mesh, uint32_t face) :
-				m_mesh(mesh), m_face(face), m_relativeEdge(0) {
+		FaceEdgeIterator (const Mesh *mesh, uint32_t face) : m_mesh(mesh), m_face(face), m_relativeEdge(0)
+		{
 			m_edge = m_face * 3;
 		}
 
-		void advance() {
+		void advance()
+		{
 			if (m_relativeEdge < 3) {
 				m_edge++;
 				m_relativeEdge++;
 			}
 		}
 
-		bool isDone() const {
+		bool isDone() const
+		{
 			return m_relativeEdge == 3;
 		}
 
@@ -3052,8 +3152,9 @@ public:
 		uint32_t relativeEdge() const { return m_relativeEdge; }
 		uint32_t face() const { return m_face; }
 		uint32_t oppositeEdge() const { return m_mesh->m_oppositeEdges[m_edge]; }
-
-		uint32_t oppositeFace() const {
+		
+		uint32_t oppositeFace() const
+		{
 			const uint32_t oedge = m_mesh->m_oppositeEdges[m_edge];
 			if (oedge == UINT32_MAX)
 				return UINT32_MAX;
@@ -3075,36 +3176,164 @@ public:
 		uint32_t m_edge;
 		uint32_t m_relativeEdge;
 	};
+};
+
+struct MeshFaceGroups
+{
+	typedef uint32_t Handle;
+	static constexpr Handle kInvalid = UINT32_MAX;
+
+	MeshFaceGroups(const Mesh *mesh) : m_mesh(mesh), m_groups(MemTag::Mesh), m_firstFace(MemTag::Mesh), m_nextFace(MemTag::Mesh), m_faceCount(MemTag::Mesh) {}
+	XA_INLINE Handle groupAt(uint32_t face) const { return m_groups[face]; }
+	XA_INLINE uint32_t groupCount() const { return m_faceCount.size(); }
+	XA_INLINE uint32_t nextFace(uint32_t face) const { return m_nextFace[face]; }
+	XA_INLINE uint32_t faceCount(uint32_t group) const { return m_faceCount[group]; }
+
+	void compute()
+	{
+		m_groups.resize(m_mesh->faceCount());
+		m_groups.fillBytes(0xff); // Set all faces to kInvalid
+		uint32_t firstUnassignedFace = 0;
+		Handle group = 0;
+		Array<uint32_t> growFaces;
+		const uint32_t n = m_mesh->faceCount();
+		m_nextFace.resize(n);
+		for (;;) {
+			// Find an unassigned face.
+			uint32_t face = UINT32_MAX;
+			for (uint32_t f = firstUnassignedFace; f < n; f++) {
+				if (m_groups[f] == kInvalid && !m_mesh->isFaceIgnored(f)) {
+					face = f;
+					firstUnassignedFace = f + 1;
+					break;
+				}
+			}
+			if (face == UINT32_MAX)
+				break; // All faces assigned to a group (except ignored faces).
+			m_groups[face] = group;
+			m_nextFace[face] = UINT32_MAX;
+			m_firstFace.push_back(face);
+			growFaces.clear();
+			growFaces.push_back(face);
+			uint32_t prevFace = face, groupFaceCount = 1;
+			// Find faces connected to the face and assign them to the same group as the face, unless they are already assigned to another group.
+			for (;;) {
+				if (growFaces.isEmpty())
+					break;
+				const uint32_t f = growFaces.back();
+				growFaces.pop_back();
+				for (Mesh::FaceEdgeIterator edgeIt(m_mesh, f); !edgeIt.isDone(); edgeIt.advance()) {
+					// Iterate opposite edges. There may be more than one - non-manifold geometry can have duplicate edges.
+					// Prioritize the one with exact vertex match, not just colocal.
+					// If *any* of the opposite edges are already assigned to this group, don't do anything.
+					bool alreadyAssignedToThisGroup = false;
+					uint32_t bestConnectedFace = UINT32_MAX;
+					for (Mesh::ColocalEdgeIterator oppositeEdgeIt(m_mesh, edgeIt.vertex1(), edgeIt.vertex0()); !oppositeEdgeIt.isDone(); oppositeEdgeIt.advance()) {
+						const uint32_t oppositeEdge = oppositeEdgeIt.edge();
+						const uint32_t oppositeFace = meshEdgeFace(oppositeEdge);
+#if 0
+						// Reject opposite face if dihedral angle >= 90 degrees.
+						{
+							Vector3 a = m_mesh->computeFaceNormal(f);
+							Vector3 b = m_mesh->computeFaceNormal(oppositeFace);
+							if (dot(a, b) <= 0.0f)
+								continue;
+						}
+#endif
+						if (m_mesh->isFaceIgnored(oppositeFace))
+							continue; // Don't add ignored faces to group.
+						if (m_groups[oppositeFace] == group) {
+							alreadyAssignedToThisGroup = true;
+							break;
+						}
+						if (m_groups[oppositeFace] != kInvalid)
+							continue; // Connected face is already assigned to another group.
+						if (faceDuplicatesGroupEdge(group, oppositeFace))
+							continue; // Don't want duplicate edges in a group.
+						const uint32_t oppositeVertex0 = m_mesh->vertexAt(meshEdgeIndex0(oppositeEdge));
+						const uint32_t oppositeVertex1 = m_mesh->vertexAt(meshEdgeIndex1(oppositeEdge));
+						if (bestConnectedFace == UINT32_MAX || (oppositeVertex0 == edgeIt.vertex1() && oppositeVertex1 == edgeIt.vertex0()))
+							bestConnectedFace = oppositeFace;
+#if 0
+						else {
+							// Choose the opposite face with the smallest dihedral angle.
+							const float d1 = 1.0f - dot(m_mesh->computeFaceNormal(f), m_mesh->computeFaceNormal(bestConnectedFace));
+							const float d2 = 1.0f - dot(m_mesh->computeFaceNormal(f), m_mesh->computeFaceNormal(oppositeFace));
+							if (d2 < d1)
+								bestConnectedFace = oppositeFace;
+						}
+#endif
+					}
+					if (!alreadyAssignedToThisGroup && bestConnectedFace != UINT32_MAX) {
+						m_groups[bestConnectedFace] = group;
+						m_nextFace[bestConnectedFace] = UINT32_MAX;
+						if (prevFace != UINT32_MAX)
+							m_nextFace[prevFace] = bestConnectedFace;
+						prevFace = bestConnectedFace;
+						groupFaceCount++;
+						growFaces.push_back(bestConnectedFace);
+					}
+				}
+			}
+			m_faceCount.push_back(groupFaceCount);
+			group++;
+			XA_ASSERT(group < kInvalid);
+		}
+	}
 
-	class GroupFaceIterator {
+	class Iterator
+	{
 	public:
-		GroupFaceIterator(const Mesh *mesh, uint32_t group) :
-				m_mesh(mesh) {
-			XA_DEBUG_ASSERT(group != UINT32_MAX);
-			m_current = mesh->m_faceGroupFirstFace[group];
+		Iterator(const MeshFaceGroups *meshFaceGroups, Handle group) : m_meshFaceGroups(meshFaceGroups)
+		{
+			XA_DEBUG_ASSERT(group != kInvalid);
+			m_current = m_meshFaceGroups->m_firstFace[group];
 		}
 
-		void advance() {
-			m_current = m_mesh->m_faceGroupNextFace[m_current];
+		void advance()
+		{
+			m_current = m_meshFaceGroups->m_nextFace[m_current];
 		}
 
-		bool isDone() const {
+		bool isDone() const
+		{
 			return m_current == UINT32_MAX;
 		}
 
-		uint32_t face() const {
+		uint32_t face() const
+		{
 			return m_current;
 		}
 
 	private:
-		const Mesh *m_mesh;
+		const MeshFaceGroups *m_meshFaceGroups;
 		uint32_t m_current;
 	};
+
+private:
+	// Check if the face duplicates any edges of any face already in the group.
+	bool faceDuplicatesGroupEdge(Handle group, uint32_t face) const
+	{
+		for (Mesh::FaceEdgeIterator edgeIt(m_mesh, face); !edgeIt.isDone(); edgeIt.advance()) {
+			for (Mesh::ColocalEdgeIterator colocalEdgeIt(m_mesh, edgeIt.vertex0(), edgeIt.vertex1()); !colocalEdgeIt.isDone(); colocalEdgeIt.advance()) {
+				if (m_groups[meshEdgeFace(colocalEdgeIt.edge())] == group)
+					return true;
+			}
+		}
+		return false;
+	}
+
+	const Mesh *m_mesh;
+	Array<Handle> m_groups;
+	Array<uint32_t> m_firstFace;
+	Array<uint32_t> m_nextFace; // In: face. Out: the next face in the same group.
+	Array<uint32_t> m_faceCount; // In: face group. Out: number of faces in the group.
 };
 
-constexpr uint16_t Mesh::kInvalidFaceGroup;
+constexpr MeshFaceGroups::Handle MeshFaceGroups::kInvalid;
 
-static bool meshCloseHole(Mesh *mesh, const Array<uint32_t> &holeVertices, const Vector3 &normal) {
+static bool meshCloseHole(Mesh *mesh, const Array<uint32_t> &holeVertices, const Vector3 &normal)
+{
 #if XA_CLOSE_HOLES_CHECK_EDGE_INTERSECTION
 	const uint32_t faceCount = mesh->faceCount();
 #endif
@@ -3230,7 +3459,8 @@ static bool meshCloseHole(Mesh *mesh, const Array<uint32_t> &holeVertices, const
 	return true;
 }
 
-static bool meshCloseHoles(Mesh *mesh, const Array<uint32_t> &boundaryLoops, const Vector3 &normal, uint32_t *holeCount, Array<uint32_t> *holeFaceCounts) {
+static bool meshCloseHoles(Mesh *mesh, const Array<uint32_t> &boundaryLoops, const Vector3 &normal, uint32_t *holeCount, Array<uint32_t> *holeFaceCounts)
+{
 	if (holeFaceCounts)
 		holeFaceCounts->clear();
 	// Compute lengths.
@@ -3286,7 +3516,8 @@ static bool meshCloseHoles(Mesh *mesh, const Array<uint32_t> &boundaryLoops, con
 	return result;
 }
 
-static bool meshIsPlanar(const Mesh &mesh) {
+static bool meshIsPlanar(const Mesh &mesh)
+{
 	const Vector3 p1 = mesh.position(mesh.vertexAt(0));
 	const Vector3 p2 = mesh.position(mesh.vertexAt(1));
 	const Vector3 p3 = mesh.position(mesh.vertexAt(2));
@@ -3312,12 +3543,14 @@ Fixing T-junctions.
 - Split edge.
 
 */
-struct SplitEdge {
+struct SplitEdge
+{
 	uint32_t edge;
 	float t;
 	uint32_t vertex;
 
-	bool operator<(const SplitEdge &other) const {
+	bool operator<(const SplitEdge &other) const
+	{
 		if (edge < other.edge)
 			return true;
 		else if (edge == other.edge) {
@@ -3329,7 +3562,8 @@ struct SplitEdge {
 };
 
 // Returns nullptr if there were no t-junctions to fix.
-static Mesh *meshFixTJunctions(const Mesh &inputMesh, bool *duplicatedEdge, bool *failed, uint32_t *fixedTJunctionsCount) {
+static Mesh *meshFixTJunctions(const Mesh &inputMesh, bool *duplicatedEdge, bool *failed, uint32_t *fixedTJunctionsCount)
+{
 	if (duplicatedEdge)
 		*duplicatedEdge = false;
 	if (failed)
@@ -3404,7 +3638,8 @@ static Mesh *meshFixTJunctions(const Mesh &inputMesh, bool *duplicatedEdge, bool
 }
 
 // boundaryLoops are the first edges for each boundary loop.
-static void meshGetBoundaryLoops(const Mesh &mesh, Array<uint32_t> &boundaryLoops) {
+static void meshGetBoundaryLoops(const Mesh &mesh, Array<uint32_t> &boundaryLoops)
+{
 	const uint32_t edgeCount = mesh.edgeCount();
 	BitArray bitFlags(edgeCount);
 	bitFlags.zeroOutMemory();
@@ -3419,23 +3654,26 @@ static void meshGetBoundaryLoops(const Mesh &mesh, Array<uint32_t> &boundaryLoop
 	}
 }
 
-struct Progress {
-	Progress(ProgressCategory::Enum category, ProgressFunc func, void *userData, uint32_t maxValue) :
-			value(0), cancel(false), m_category(category), m_func(func), m_userData(userData), m_maxValue(maxValue), m_progress(0) {
+struct Progress
+{
+	Progress(ProgressCategory::Enum category, ProgressFunc func, void *userData, uint32_t maxValue) : value(0), cancel(false), m_category(category), m_func(func), m_userData(userData), m_maxValue(maxValue), m_progress(0)
+	{
 		if (m_func) {
 			if (!m_func(category, 0, userData))
 				cancel = true;
 		}
 	}
 
-	~Progress() {
+	~Progress()
+	{
 		if (m_func) {
 			if (!m_func(m_category, 100, m_userData))
 				cancel = true;
 		}
 	}
 
-	void update() {
+	void update()
+	{
 		if (!m_func)
 			return;
 		m_mutex.lock();
@@ -3448,7 +3686,8 @@ struct Progress {
 		m_mutex.unlock();
 	}
 
-	void setMaxValue(uint32_t maxValue) {
+	void setMaxValue(uint32_t maxValue)
+	{
 		m_mutex.lock();
 		m_maxValue = maxValue;
 		m_mutex.unlock();
@@ -3466,31 +3705,32 @@ private:
 	std::mutex m_mutex;
 };
 
-struct Spinlock {
-	void lock() {
-		while (m_lock.test_and_set(std::memory_order_acquire)) {
-		}
-	}
+struct Spinlock
+{
+	void lock() { while(m_lock.test_and_set(std::memory_order_acquire)) {} }
 	void unlock() { m_lock.clear(std::memory_order_release); }
 
 private:
 	std::atomic_flag m_lock = ATOMIC_FLAG_INIT;
 };
 
-struct TaskGroupHandle {
+struct TaskGroupHandle
+{
 	uint32_t value = UINT32_MAX;
 };
 
-struct Task {
+struct Task
+{
 	void (*func)(void *userData);
 	void *userData;
 };
 
 #if XA_MULTITHREADED
-class TaskScheduler {
+class TaskScheduler
+{
 public:
-	TaskScheduler() :
-			m_shutdown(false) {
+	TaskScheduler() : m_shutdown(false)
+	{
 		m_threadIndex = 0;
 		// Max with current task scheduler usage is 1 per thread + 1 deep nesting, but allow for some slop.
 		m_maxGroups = std::thread::hardware_concurrency() * 4;
@@ -3508,7 +3748,8 @@ public:
 		}
 	}
 
-	~TaskScheduler() {
+	~TaskScheduler()
+	{
 		m_shutdown = true;
 		for (uint32_t i = 0; i < m_workers.size(); i++) {
 			Worker &worker = m_workers[i];
@@ -3526,11 +3767,13 @@ public:
 		XA_FREE(m_groups);
 	}
 
-	uint32_t threadCount() const {
+	uint32_t threadCount() const
+	{
 		return max(1u, std::thread::hardware_concurrency()); // Including the main thread.
 	}
 
-	TaskGroupHandle createTaskGroup(uint32_t reserveSize = 0) {
+	TaskGroupHandle createTaskGroup(uint32_t reserveSize = 0)
+	{
 		// Claim the first free group.
 		for (uint32_t i = 0; i < m_maxGroups; i++) {
 			TaskGroup &group = m_groups[i];
@@ -3552,7 +3795,8 @@ public:
 		return handle;
 	}
 
-	void run(TaskGroupHandle handle, Task task) {
+	void run(TaskGroupHandle handle, const Task &task)
+	{
 		XA_DEBUG_ASSERT(handle.value != UINT32_MAX);
 		TaskGroup &group = m_groups[handle.value];
 		group.queueLock.lock();
@@ -3566,7 +3810,8 @@ public:
 		}
 	}
 
-	void wait(TaskGroupHandle *handle) {
+	void wait(TaskGroupHandle *handle)
+	{
 		if (handle->value == UINT32_MAX) {
 			XA_DEBUG_ASSERT(false);
 			return;
@@ -3594,7 +3839,8 @@ public:
 	static uint32_t currentThreadIndex() { return m_threadIndex; }
 
 private:
-	struct TaskGroup {
+	struct TaskGroup
+	{
 		std::atomic<bool> free;
 		Array<Task> queue; // Items are never removed. queueHead is incremented to pop items.
 		uint32_t queueHead = 0;
@@ -3602,7 +3848,8 @@ private:
 		std::atomic<uint32_t> ref; // Increment when a task is enqueued, decrement when a task finishes.
 	};
 
-	struct Worker {
+	struct Worker
+	{
 		std::thread *thread = nullptr;
 		std::mutex mutex;
 		std::condition_variable cv;
@@ -3610,16 +3857,17 @@ private:
 	};
 
 	TaskGroup *m_groups;
-	uint32_t m_maxGroups;
 	Array<Worker> m_workers;
 	std::atomic<bool> m_shutdown;
+	uint32_t m_maxGroups;
 	static thread_local uint32_t m_threadIndex;
 
-	static void workerThread(TaskScheduler *scheduler, Worker *worker, uint32_t threadIndex) {
+	static void workerThread(TaskScheduler *scheduler, Worker *worker, uint32_t threadIndex)
+	{
 		m_threadIndex = threadIndex;
 		std::unique_lock<std::mutex> lock(worker->mutex);
 		for (;;) {
-			worker->cv.wait(lock, [=] { return worker->wakeup.load(); });
+			worker->cv.wait(lock, [=]{ return worker->wakeup.load(); });
 			worker->wakeup = false;
 			for (;;) {
 				if (scheduler->m_shutdown)
@@ -3650,18 +3898,22 @@ private:
 
 thread_local uint32_t TaskScheduler::m_threadIndex;
 #else
-class TaskScheduler {
+class TaskScheduler
+{
 public:
-	~TaskScheduler() {
+	~TaskScheduler()
+	{
 		for (uint32_t i = 0; i < m_groups.size(); i++)
 			destroyGroup({ i });
 	}
 
-	uint32_t threadCount() const {
+	uint32_t threadCount() const
+	{
 		return 1;
 	}
 
-	TaskGroupHandle createTaskGroup(uint32_t reserveSize = 0) {
+	TaskGroupHandle createTaskGroup(uint32_t reserveSize = 0)
+	{
 		TaskGroup *group = XA_NEW(MemTag::Default, TaskGroup);
 		group->queue.reserve(reserveSize);
 		m_groups.push_back(group);
@@ -3670,11 +3922,13 @@ public:
 		return handle;
 	}
 
-	void run(TaskGroupHandle handle, Task task) {
+	void run(TaskGroupHandle handle, Task task)
+	{
 		m_groups[handle.value]->queue.push_back(task);
 	}
 
-	void wait(TaskGroupHandle *handle) {
+	void wait(TaskGroupHandle *handle)
+	{
 		if (handle->value == UINT32_MAX) {
 			XA_DEBUG_ASSERT(false);
 			return;
@@ -3690,7 +3944,8 @@ public:
 	static uint32_t currentThreadIndex() { return 0; }
 
 private:
-	void destroyGroup(TaskGroupHandle handle) {
+	void destroyGroup(TaskGroupHandle handle)
+	{
 		TaskGroup *group = m_groups[handle.value];
 		if (group) {
 			group->~TaskGroup();
@@ -3699,7 +3954,8 @@ private:
 		}
 	}
 
-	struct TaskGroup {
+	struct TaskGroup
+	{
 		Array<Task> queue;
 	};
 
@@ -3712,7 +3968,8 @@ const uint8_t TGA_TYPE_RGB = 2;
 const uint8_t TGA_ORIGIN_UPPER = 0x20;
 
 #pragma pack(push, 1)
-struct TgaHeader {
+struct TgaHeader
+{
 	uint8_t id_length;
 	uint8_t colormap_type;
 	uint8_t image_type;
@@ -3729,7 +3986,8 @@ struct TgaHeader {
 };
 #pragma pack(pop)
 
-static void WriteTga(const char *filename, const uint8_t *data, uint32_t width, uint32_t height) {
+static void WriteTga(const char *filename, const uint8_t *data, uint32_t width, uint32_t height)
+{
 	XA_DEBUG_ASSERT(sizeof(TgaHeader) == TgaHeader::Size);
 	FILE *f;
 	XA_FOPEN(f, filename, "wb");
@@ -3754,10 +4012,12 @@ static void WriteTga(const char *filename, const uint8_t *data, uint32_t width,
 }
 #endif
 
-template <typename T>
-class ThreadLocal {
+template<typename T>
+class ThreadLocal
+{
 public:
-	ThreadLocal() {
+	ThreadLocal()
+	{
 #if XA_MULTITHREADED
 		const uint32_t n = std::thread::hardware_concurrency();
 #else
@@ -3768,7 +4028,8 @@ public:
 			new (&m_array[i]) T;
 	}
 
-	~ThreadLocal() {
+	~ThreadLocal()
+	{
 #if XA_MULTITHREADED
 		const uint32_t n = std::thread::hardware_concurrency();
 #else
@@ -3779,7 +4040,8 @@ public:
 		XA_FREE(m_array);
 	}
 
-	T &get() const {
+	T &get() const
+	{
 		return m_array[TaskScheduler::currentThreadIndex()];
 	}
 
@@ -3787,9 +4049,11 @@ private:
 	T *m_array;
 };
 
-class UniformGrid2 {
+class UniformGrid2
+{
 public:
-	void reset(const Vector2 *positions, const uint32_t *indices = nullptr, uint32_t reserveEdgeCount = 0) {
+	void reset(const Vector2 *positions, const uint32_t *indices = nullptr, uint32_t reserveEdgeCount = 0)
+	{
 		m_edges.clear();
 		if (reserveEdgeCount > 0)
 			m_edges.reserve(reserveEdgeCount);
@@ -3798,14 +4062,16 @@ public:
 		m_cellDataOffsets.clear();
 	}
 
-	void append(uint32_t edge) {
+	void append(uint32_t edge)
+	{
 		XA_DEBUG_ASSERT(m_cellDataOffsets.isEmpty());
 		m_edges.push_back(edge);
 	}
 
-	bool intersect(Vector2 v1, Vector2 v2, float epsilon) {
+	bool intersect(Vector2 v1, Vector2 v2, float epsilon)
+	{
 		const uint32_t edgeCount = m_edges.size();
-		bool bruteForce = edgeCount <= 64;
+		bool bruteForce = edgeCount <= 20;
 		if (!bruteForce && m_cellDataOffsets.isEmpty())
 			bruteForce = !createGrid();
 		if (bruteForce) {
@@ -3829,37 +4095,80 @@ public:
 		return false;
 	}
 
-	bool intersectSelf(float epsilon) {
-		const uint32_t edgeCount = m_edges.size();
-		bool bruteForce = edgeCount <= 64;
+	// If edges is empty, checks for intersection with all edges in the grid.
+	bool intersect(float epsilon, ConstArrayView<uint32_t> edges = ConstArrayView<uint32_t>(), ConstArrayView<uint32_t> ignoreEdges = ConstArrayView<uint32_t>())
+	{
+		bool bruteForce = m_edges.size() <= 20;
 		if (!bruteForce && m_cellDataOffsets.isEmpty())
 			bruteForce = !createGrid();
-		for (uint32_t i = 0; i < edgeCount; i++) {
-			const uint32_t edge1 = m_edges[i];
+		const uint32_t *edges1, *edges2 = nullptr;
+		uint32_t edges1Count, edges2Count = 0;
+		if (edges.length == 0) {
+			edges1 = m_edges.data();
+			edges1Count = m_edges.size();
+		} else {
+			edges1 = edges.data;
+			edges1Count = edges.length;
+		}
+		if (bruteForce) {
+			edges2 = m_edges.data();
+			edges2Count = m_edges.size();
+		}
+		for (uint32_t i = 0; i < edges1Count; i++) {
+			const uint32_t edge1 = edges1[i];
+			const uint32_t edge1Vertex[2] = { vertexAt(meshEdgeIndex0(edge1)), vertexAt(meshEdgeIndex1(edge1)) };
+			const Vector2 &edge1Position1 = m_positions[edge1Vertex[0]];
+			const Vector2 &edge1Position2 = m_positions[edge1Vertex[1]];
+			const Extents2 edge1Extents(edge1Position1, edge1Position2);
+			uint32_t j = 0;
 			if (bruteForce) {
-				for (uint32_t j = 0; j < edgeCount; j++) {
-					const uint32_t edge2 = m_edges[j];
-					if (edgesIntersect(edge1, edge2, epsilon))
-						return true;
+				// If checking against self, test each edge pair only once.
+				if (edges.length == 0) {
+					j = i + 1;
+					if (j == edges1Count)
+						break;
 				}
 			} else {
 				computePotentialEdges(edgePosition0(edge1), edgePosition1(edge1));
-				uint32_t prevEdge = UINT32_MAX;
-				for (uint32_t j = 0; j < m_potentialEdges.size(); j++) {
-					const uint32_t edge2 = m_potentialEdges[j];
-					if (edge2 == prevEdge)
-						continue;
-					if (edgesIntersect(edge1, edge2, epsilon))
-						return true;
-					prevEdge = edge2;
+				edges2 = m_potentialEdges.data();
+				edges2Count = m_potentialEdges.size();
+			}
+			uint32_t prevEdge = UINT32_MAX; // Handle potential edges duplicates.
+			for (; j < edges2Count; j++) {
+				const uint32_t edge2 = edges2[j];
+				if (edge1 == edge2)
+					continue;
+				if (edge2 == prevEdge)
+					continue;
+				prevEdge = edge2;
+				// Check if edge2 is ignored.
+				bool ignore = false;
+				for (uint32_t k = 0; k < ignoreEdges.length; k++) {
+					if (edge2 == ignoreEdges[k]) {
+						ignore = true;
+						break;
+					}
 				}
+				if (ignore)
+					continue;
+				const uint32_t edge2Vertex[2] = { vertexAt(meshEdgeIndex0(edge2)), vertexAt(meshEdgeIndex1(edge2)) };
+				// Ignore connected edges, since they can't intersect (only overlap), and may be detected as false positives.
+				if (edge1Vertex[0] == edge2Vertex[0] || edge1Vertex[0] == edge2Vertex[1] || edge1Vertex[1] == edge2Vertex[0] || edge1Vertex[1] == edge2Vertex[1])
+					continue;
+				const Vector2 &edge2Position1 = m_positions[edge2Vertex[0]];
+				const Vector2 &edge2Position2 = m_positions[edge2Vertex[1]];
+				if (!Extents2::intersect(edge1Extents, Extents2(edge2Position1, edge2Position2)))
+					continue;
+				if (linesIntersect(edge1Position1, edge1Position2, edge2Position1, edge2Position2, epsilon))
+					return true;
 			}
 		}
 		return false;
 	}
 
 #if XA_DEBUG_EXPORT_BOUNDARY_GRID
-	void debugExport(const char *filename) {
+	void debugExport(const char *filename)
+	{
 		Array<uint8_t> image;
 		image.resize(m_gridWidth * m_gridHeight * 3);
 		for (uint32_t y = 0; y < m_gridHeight; y++) {
@@ -3881,7 +4190,8 @@ public:
 #endif
 
 private:
-	bool createGrid() {
+	bool createGrid()
+	{
 		// Compute edge extents. Min will be the grid origin.
 		const uint32_t edgeCount = m_edges.size();
 		Extents2 edgeExtents;
@@ -3933,7 +4243,8 @@ private:
 		return true;
 	}
 
-	void computePotentialEdges(Vector2 p1, Vector2 p2) {
+	void computePotentialEdges(Vector2 p1, Vector2 p2)
+	{
 		m_potentialEdges.clear();
 		traverse(p1, p2);
 		for (uint32_t j = 0; j < m_traversedCellOffsets.size(); j++) {
@@ -3951,7 +4262,8 @@ private:
 	}
 
 	// "A Fast Voxel Traversal Algorithm for Ray Tracing"
-	void traverse(Vector2 p1, Vector2 p2) {
+	void traverse(Vector2 p1, Vector2 p2)
+	{
 		const Vector2 dir = p2 - p1;
 		const Vector2 normal = normalizeSafe(dir, Vector2(0.0f), kEpsilon);
 		const int stepX = dir.x >= 0 ? 1 : -1;
@@ -3972,12 +4284,14 @@ private:
 		if (normal.x > kEpsilon || normal.x < -kEpsilon) {
 			tMaxX = (distToNextCellX * stepX) / normal.x;
 			tDeltaX = (m_cellSize * stepX) / normal.x;
-		} else
+		}
+		else
 			tMaxX = tDeltaX = FLT_MAX;
 		if (normal.y > kEpsilon || normal.y < -kEpsilon) {
 			tMaxY = (distToNextCellY * stepY) / normal.y;
 			tDeltaY = (m_cellSize * stepY) / normal.y;
-		} else
+		}
+		else
 			tMaxY = tDeltaY = FLT_MAX;
 		m_traversedCellOffsets.clear();
 		m_traversedCellOffsets.push_back(firstCell[0] + firstCell[1] * m_gridWidth);
@@ -3992,11 +4306,11 @@ private:
 			}
 			if (currentCell[0] >= m_gridWidth || currentCell[1] >= m_gridHeight)
 				break;
-			if (stepX == 0 && currentCell[0] < lastCell[0])
+			if (stepX == -1 && currentCell[0] < lastCell[0])
 				break;
 			if (stepX == 1 && currentCell[0] > lastCell[0])
 				break;
-			if (stepY == 0 && currentCell[1] < lastCell[1])
+			if (stepY == -1 && currentCell[1] < lastCell[1])
 				break;
 			if (stepY == 1 && currentCell[1] > lastCell[1])
 				break;
@@ -4004,34 +4318,28 @@ private:
 		}
 	}
 
-	bool edgesIntersect(uint32_t edge1, uint32_t edge2, float epsilon) const {
-		if (edge1 == edge2)
-			return false;
-		const uint32_t ai[2] = { vertexAt(meshEdgeIndex0(edge1)), vertexAt(meshEdgeIndex1(edge1)) };
-		const uint32_t bi[2] = { vertexAt(meshEdgeIndex0(edge2)), vertexAt(meshEdgeIndex1(edge2)) };
-		// Ignore connected edges, since they can't intersect (only overlap), and may be detected as false positives.
-		if (ai[0] == bi[0] || ai[0] == bi[1] || ai[1] == bi[0] || ai[1] == bi[1])
-			return false;
-		return linesIntersect(m_positions[ai[0]], m_positions[ai[1]], m_positions[bi[0]], m_positions[bi[1]], epsilon);
-	}
-
-	uint32_t cellX(float x) const {
+	uint32_t cellX(float x) const
+	{
 		return min((uint32_t)max(0.0f, (x - m_gridOrigin.x) / m_cellSize), m_gridWidth - 1u);
 	}
 
-	uint32_t cellY(float y) const {
+	uint32_t cellY(float y) const
+	{
 		return min((uint32_t)max(0.0f, (y - m_gridOrigin.y) / m_cellSize), m_gridHeight - 1u);
 	}
 
-	Vector2 edgePosition0(uint32_t edge) const {
+	Vector2 edgePosition0(uint32_t edge) const
+	{
 		return m_positions[vertexAt(meshEdgeIndex0(edge))];
 	}
 
-	Vector2 edgePosition1(uint32_t edge) const {
+	Vector2 edgePosition1(uint32_t edge) const
+	{
 		return m_positions[vertexAt(meshEdgeIndex1(edge))];
 	}
 
-	uint32_t vertexAt(uint32_t index) const {
+	uint32_t vertexAt(uint32_t index) const
+	{
 		return m_indices ? m_indices[index] : index;
 	}
 
@@ -4047,29 +4355,835 @@ private:
 	Array<uint32_t> m_traversedCellOffsets;
 };
 
-struct UvMeshChart {
+struct UvMeshChart
+{
 	Array<uint32_t> faces;
 	Array<uint32_t> indices;
 	uint32_t material;
 };
 
-struct UvMesh {
+struct UvMesh
+{
 	UvMeshDecl decl;
 	Array<uint32_t> indices;
 	Array<UvMeshChart *> charts;
 	Array<uint32_t> vertexToChartMap;
 };
 
-struct UvMeshInstance {
+struct UvMeshInstance
+{
 	UvMesh *mesh;
 	Array<Vector2> texcoords;
 	bool rotateCharts;
 };
 
+/*
+ *  Copyright (c) 2004-2010, Bruno Levy
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *  this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright notice,
+ *  this list of conditions and the following disclaimer in the documentation
+ *  and/or other materials provided with the distribution.
+ *  * Neither the name of the ALICE Project-Team nor the names of its
+ *  contributors may be used to endorse or promote products derived from this
+ *  software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  If you modify this software, you should include a notice giving the
+ *  name of the person performing the modification, the date of modification,
+ *  and the reason for such modification.
+ *
+ *  Contact: Bruno Levy
+ *
+ *     levy@loria.fr
+ *
+ *     ALICE Project
+ *     LORIA, INRIA Lorraine,
+ *     Campus Scientifique, BP 239
+ *     54506 VANDOEUVRE LES NANCY CEDEX
+ *     FRANCE
+ */
+namespace opennl {
+#define NL_NEW(T)              XA_ALLOC(MemTag::OpenNL, T)
+#define NL_NEW_ARRAY(T,NB)     XA_ALLOC_ARRAY(MemTag::OpenNL, T, NB)
+#define NL_RENEW_ARRAY(T,x,NB) XA_REALLOC(MemTag::OpenNL, x, T, NB)
+#define NL_DELETE(x)           XA_FREE(x); x = nullptr 
+#define NL_DELETE_ARRAY(x)     XA_FREE(x); x = nullptr
+#define NL_CLEAR(x, T)         memset(x, 0, sizeof(T));
+#define NL_CLEAR_ARRAY(T,x,NB) memset(x, 0, (size_t)(NB)*sizeof(T)) 
+#define NL_NEW_VECTOR(dim)     XA_ALLOC_ARRAY(MemTag::OpenNL, double, dim)
+#define NL_DELETE_VECTOR(ptr)  XA_FREE(ptr)
+
+struct NLMatrixStruct;
+typedef NLMatrixStruct * NLMatrix; /* Generic matrix handle; the concrete matrix structs are cast to and from it. */
+typedef void (*NLDestroyMatrixFunc)(NLMatrix M); /* Per-type cleanup hook, invoked by nlDeleteMatrix. */
+typedef void (*NLMultMatrixVectorFunc)(NLMatrix M, const double* x, double* y); /* Per-type product hook, invoked by nlMultMatrixVector. */
+
+#define NL_MATRIX_SPARSE_DYNAMIC 0x1001 /* Tag set by nlSparseMatrixConstruct. */
+#define NL_MATRIX_CRS            0x1002 /* Tag set by nlCRSMatrixConstruct. */
+#define NL_MATRIX_OTHER          0x1006 /* Tag set by nlNewJacobiPreconditioner. */
+
+struct NLMatrixStruct /* Common leading fields repeated at the start of every concrete matrix struct. */
+{
+	uint32_t m; /* Number of rows. */
+	uint32_t n; /* Number of columns. */
+	uint32_t type; /* One of the NL_MATRIX_* tags. */
+	NLDestroyMatrixFunc destroy_func; /* Called by nlDeleteMatrix before the struct itself is freed. */
+	NLMultMatrixVectorFunc mult_func; /* Called by nlMultMatrixVector. */
+};
+
+/* Dynamic arrays for sparse row/columns */
+
+struct NLCoeff /* One stored coefficient of a sparse row or column. */
+{
+	uint32_t index; /* Position of the coefficient within its row/column. */
+	double value; /* Coefficient value. */
+};
+
+struct NLRowColumn /* Growable array of NLCoeff. */
+{
+	uint32_t size; /* Number of coefficients in use. */
+	uint32_t capacity; /* Number of coefficients allocated in coeff. */
+	NLCoeff* coeff; /* nullptr until nlRowColumnGrow performs the first allocation. */
+};
+
+/* Compressed Row Storage */
+
+struct NLCRSMatrix /* Compressed row storage matrix; starts with the NLMatrixStruct fields. */
+{
+	uint32_t m; /* Number of rows. */
+	uint32_t n; /* Number of columns. */
+	uint32_t type; /* NL_MATRIX_CRS. */
+	NLDestroyMatrixFunc destroy_func; /* nlCRSMatrixDestroy. */
+	NLMultMatrixVectorFunc mult_func; /* nlCRSMatrixMult. */
+	double* val; /* nnz coefficient values, row after row. */
+	uint32_t* rowptr; /* m + 1 offsets: row i occupies [rowptr[i], rowptr[i + 1]) in val/colind. */
+	uint32_t* colind; /* nnz column indices, parallel to val. */
+	uint32_t nslices; /* Number of row slices used by the product. */
+	uint32_t* sliceptr; /* nslices + 1 row boundaries: slice s covers rows [sliceptr[s], sliceptr[s + 1]). */
+};
+
+/* SparseMatrix data structure */
+
+struct NLSparseMatrix /* Dynamic sparse matrix built one coefficient at a time; starts with the NLMatrixStruct fields. */
+{
+	uint32_t m; /* Number of rows. */
+	uint32_t n; /* Number of columns. */
+	uint32_t type; /* NL_MATRIX_SPARSE_DYNAMIC. */
+	NLDestroyMatrixFunc destroy_func; /* nlSparseMatrixDestroy. */
+	NLMultMatrixVectorFunc mult_func; /* nlSparseMatrixMult. */
+	uint32_t diag_size; /* min(m, n), set by nlSparseMatrixConstruct. */
+	uint32_t diag_capacity; /* Set equal to diag_size by nlSparseMatrixConstruct. */
+	NLRowColumn* row; /* m rows of coefficients. */
+	NLRowColumn* column; /* Set to nullptr by nlSparseMatrixConstruct. */
+	double*    diag; /* Running sum of the values added on the diagonal. */
+	uint32_t row_capacity; /* Left at 0 by nlSparseMatrixConstruct. */
+	uint32_t column_capacity; /* Set to 0 by nlSparseMatrixConstruct. */
+};
+
+/* NLContext data structure */
+
+struct NLBufferBinding /* Strided view over externally owned doubles. */
+{
+	void* base_address; /* Address of item 0. */
+	uint32_t stride; /* Distance in bytes between consecutive items. */
+};
+
+#define NL_BUFFER_ITEM(B,i) *(double*)((void*)((char*)((B).base_address)+((i)*(B).stride))) /* Lvalue for item i of binding B. */
+
+struct NLContext /* Solver state; allocated zeroed with defaults by nlNewContext. */
+{
+	NLBufferBinding *variable_buffer; /* One binding per system (indexed by system in nlVariablesToVector). */
+	double *variable_value; /* presumably the storage behind variable_buffer — TODO confirm against the setup code. */
+	bool *variable_is_locked; /* Per-variable flag; locked variables are skipped when copying to and from x. */
+	uint32_t *variable_index; /* Per-variable index into x, read for unlocked variables. */
+	uint32_t n; /* Number of entries per system in x and b. */
+	NLMatrix M; /* System matrix handed to the solver. */
+	NLMatrix P; /* Preconditioner handed to the solver. */
+	NLMatrix B; /* Released in nlDeleteContext; not otherwise used in the code visible here. */
+	NLRowColumn af; /* Coefficients appended for unlocked variables, indexed by system index. */
+	NLRowColumn al; /* Coefficients appended for locked variables, indexed by variable index. */
+	double *x; /* Solution vectors, one block of n entries per system. */
+	double *b; /* Right-hand sides, one block of n entries per system. */
+	uint32_t nb_variables; /* Set through NL_NB_VARIABLES. */
+	uint32_t nb_systems; /* Defaults to 1. */
+	uint32_t current_row;
+	uint32_t max_iterations; /* Defaults to 100; overridden through NL_MAX_ITERATIONS. */
+	bool max_iterations_defined; /* Set when NL_MAX_ITERATIONS has been supplied. */
+	double threshold; /* Defaults to 1e-6; passed to the solver as eps. */
+	double omega; /* Defaults to 1.5. */
+	uint32_t used_iterations; /* Written by nlSolveSystemIterative. */
+	double error; /* Written by nlSolveSystemIterative: residual norm, relative to the rhs norm when that is non-zero. */
+};
+
+static void nlDeleteMatrix(NLMatrix M)
+{
+	if (M != nullptr) { /* Null handles are accepted and ignored. */
+		M->destroy_func(M); /* Type-specific cleanup first... */
+		NL_DELETE(M); /* ...then the matrix struct itself. */
+	}
+}
+
+static void nlMultMatrixVector(NLMatrix M, const double* x, double* y)
+{ /* Forwards to the matrix's own mult_func with input x and output y. */
+	M->mult_func(M, x, y);
+}
+
+static void nlRowColumnConstruct(NLRowColumn* c)
+{
+	c->coeff = nullptr; /* No storage until the first coefficient is added. */
+	c->capacity = 0;
+	c->size = 0;
+}
+
+static void nlRowColumnDestroy(NLRowColumn* c)
+{
+	c->capacity = 0;
+	c->size = 0;
+	NL_DELETE_ARRAY(c->coeff); /* Frees the coefficients and nulls the pointer. */
+}
+
+static void nlRowColumnGrow(NLRowColumn* c)
+{
+	if (c->capacity == 0) {
+		c->capacity = 4; /* First allocation: four zero-filled coefficients. */
+		c->coeff = NL_NEW_ARRAY(NLCoeff, c->capacity);
+		NL_CLEAR_ARRAY(NLCoeff, c->coeff, c->capacity);
+	} else {
+		c->capacity *= 2; /* Later grows double the capacity; no clear is issued for the new tail. */
+		c->coeff = NL_RENEW_ARRAY(NLCoeff, c->coeff, c->capacity);
+	}
+}
+
+static void nlRowColumnAdd(NLRowColumn* c, uint32_t index, double value)
+{
+	uint32_t pos = 0;
+	while (pos < c->size && c->coeff[pos].index != index) /* Linear search for an existing entry. */
+		pos++;
+	if (pos < c->size) {
+		c->coeff[pos].value += value; /* Already present: accumulate. */
+		return;
+	}
+	if (c->size == c->capacity)
+		nlRowColumnGrow(c);
+	c->coeff[c->size].index = index; /* Otherwise append a new entry. */
+	c->coeff[c->size++].value = value;
+}
+
+/* Appends unconditionally: an entry with the same index, if any, is not merged. */
+static void nlRowColumnAppend(NLRowColumn* c, uint32_t index, double value)
+{
+	if (c->size == c->capacity)
+		nlRowColumnGrow(c);
+	NLCoeff &slot = c->coeff[c->size++];
+	slot.index = index;
+	slot.value = value;
+}
+
+static void nlRowColumnZero(NLRowColumn* c)
+{ /* Empties the array but keeps its allocation and capacity for reuse. */
+	c->size = 0;
+}
+
+static void nlRowColumnClear(NLRowColumn* c)
+{
+	NL_DELETE_ARRAY(c->coeff); /* Releases the storage and nulls the pointer. */
+	c->capacity = 0;
+	c->size = 0;
+}
+
+static int nlCoeffCompare(const void* p1, const void* p2)
+{ /* qsort comparator: orders NLCoeff by ascending index and returns <0, 0 or >0 as qsort requires (previously only 0 or 1). */
+	return (int)(((const NLCoeff*)p1)->index > ((const NLCoeff*)p2)->index) - (int)(((const NLCoeff*)p1)->index < ((const NLCoeff*)p2)->index);
+}
+
+static void nlRowColumnSort(NLRowColumn* c)
+{ /* Sorts the coefficients in use with qsort, using nlCoeffCompare as the ordering. */
+	qsort(c->coeff, c->size, sizeof(NLCoeff), nlCoeffCompare);
+}
+
+/* CRSMatrix data structure */
+
+static void nlCRSMatrixDestroy(NLCRSMatrix* M)
+{
+	M->nslices = 0; /* Reset the dimensions... */
+	M->n = 0;
+	M->m = 0;
+	NL_DELETE_ARRAY(M->sliceptr); /* ...and release the four owned arrays, nulling each pointer. */
+	NL_DELETE_ARRAY(M->colind);
+	NL_DELETE_ARRAY(M->rowptr);
+	NL_DELETE_ARRAY(M->val);
+}
+
+static void nlCRSMatrixMultSlice(NLCRSMatrix* M, const double* x, double* y, uint32_t Ibegin, uint32_t Iend)
+{
+	for (uint32_t row = Ibegin; row < Iend; ++row) {
+		double acc = 0.0; /* Dot product of row `row` with x. */
+		for (uint32_t k = M->rowptr[row], kEnd = M->rowptr[row + 1]; k < kEnd; ++k)
+			acc += M->val[k] * x[M->colind[k]];
+		y[row] = acc;
+	}
+}
+
+static void nlCRSMatrixMult(NLCRSMatrix* M, const double* x, double* y)
+{
+	const int sliceCount = (int)(M->nslices);
+	for (int s = 0; s < sliceCount; ++s) /* Each slice is a consecutive range of rows. */
+		nlCRSMatrixMultSlice(M, x, y, M->sliceptr[s], M->sliceptr[s + 1]);
+}
+
+static void nlCRSMatrixConstruct(NLCRSMatrix* M, uint32_t m, uint32_t n, uint32_t nnz, uint32_t nslices)
+{ /* Initialises M as an m x n CRS matrix with room for nnz coefficients and nslices row slices; all arrays start zeroed. */
+	M->m = m;
+	M->n = n;
+	M->type = NL_MATRIX_CRS;
+	M->destroy_func = (NLDestroyMatrixFunc)nlCRSMatrixDestroy;
+	M->mult_func = (NLMultMatrixVectorFunc)nlCRSMatrixMult;
+	M->nslices = nslices;
+	M->val = NL_NEW_ARRAY(double, nnz); /* One value per stored coefficient. */
+	NL_CLEAR_ARRAY(double, M->val, nnz);
+	M->rowptr = NL_NEW_ARRAY(uint32_t, m + 1); /* One extra entry marks the end of the last row. */
+	NL_CLEAR_ARRAY(uint32_t, M->rowptr, m + 1);
+	M->colind = NL_NEW_ARRAY(uint32_t, nnz); /* One column index per stored coefficient. */
+	NL_CLEAR_ARRAY(uint32_t, M->colind, nnz);
+	M->sliceptr = NL_NEW_ARRAY(uint32_t, nslices + 1); /* One extra entry marks the end of the last slice. */
+	NL_CLEAR_ARRAY(uint32_t, M->sliceptr, nslices + 1);
+}
+
+/* SparseMatrix data structure */
+
+static void nlSparseMatrixDestroyRowColumns(NLSparseMatrix* M)
+{ /* Releases the coefficients of each of the m rows, then the row array itself. */
+	for (uint32_t i = 0; i < M->m; i++)
+		nlRowColumnDestroy(&(M->row[i]));
+	NL_DELETE_ARRAY(M->row);
+}
+
+static void nlSparseMatrixDestroy(NLSparseMatrix* M)
+{ /* destroy_func of dynamic sparse matrices: frees the rows and the diagonal, not the struct itself. */
+	XA_DEBUG_ASSERT(M->type == NL_MATRIX_SPARSE_DYNAMIC);
+	nlSparseMatrixDestroyRowColumns(M);
+	NL_DELETE_ARRAY(M->diag);
+}
+
+static void nlSparseMatrixAdd(NLSparseMatrix* M, uint32_t i, uint32_t j, double value)
+{ /* Adds value to coefficient (i, j) and, on the diagonal, to the cached diag entry as well. */
+	XA_DEBUG_ASSERT(i < M->m); /* Unsigned, so only the upper bound matters; "m - 1" wrapped around when m == 0. */
+	XA_DEBUG_ASSERT(j < M->n);
+	if (i == j)
+		M->diag[i] += value;
+	nlRowColumnAdd(&(M->row[i]), j, value); /* Merges with an existing (i, j) entry if there is one. */
+}
+
+/* Total number of stored coefficients, summed over all rows. */
+static uint32_t nlSparseMatrixNNZ(NLSparseMatrix* M)
+{
+	uint32_t total = 0;
+	for (uint32_t r = M->m; r-- > 0; )
+		total += M->row[r].size;
+	return total;
+}
+
+static void nlSparseMatrixSort(NLSparseMatrix* M)
+{ /* Sorts the coefficients of every row via nlRowColumnSort. */
+	for (uint32_t i = 0; i < M->m; i++)
+		nlRowColumnSort(&(M->row[i]));
+}
+
+/* SparseMatrix x Vector routines, internal helper routines */
+
+static void nlSparseMatrix_mult_rows(NLSparseMatrix* A,	const double* x, double* y)
+{
+	/*
+	 * Indices are kept as signed ints, as in upstream
+	 * OpenNL, which notes that OpenMP does not like
+	 * unsigned ints (floating point exceptions).
+	 */
+	const int rowCount = (int)(A->m);
+	for (int r = 0; r < rowCount; r++) {
+		const NLRowColumn* row = &(A->row[r]);
+		const int entryCount = (int)(row->size);
+		/* Accumulate straight into y[r], one coefficient at a time. */
+		y[r] = 0;
+		for (int e = 0; e < entryCount; e++) {
+			const NLCoeff* entry = &(row->coeff[e]);
+			y[r] += entry->value * x[entry->index];
+		}
+	}
+}
+
+
+static void nlSparseMatrixMult(NLSparseMatrix* A, const double* x, double* y)
+{ /* mult_func of dynamic sparse matrices: y = A * x using the row storage. */
+	XA_DEBUG_ASSERT(A->type == NL_MATRIX_SPARSE_DYNAMIC);
+	nlSparseMatrix_mult_rows(A, x, y);
+}
+
+static void nlSparseMatrixConstruct(NLSparseMatrix* M, uint32_t m, uint32_t n)
+{ /* Initialises M as an empty m x n dynamic sparse matrix with row storage and a zeroed diagonal. */
+	M->m = m;
+	M->n = n;
+	M->type = NL_MATRIX_SPARSE_DYNAMIC;
+	M->destroy_func = (NLDestroyMatrixFunc)nlSparseMatrixDestroy;
+	M->mult_func = (NLMultMatrixVectorFunc)nlSparseMatrixMult;
+	M->row = NL_NEW_ARRAY(NLRowColumn, m);
+	NL_CLEAR_ARRAY(NLRowColumn, M->row, m);
+	M->row_capacity = m;
+	for (uint32_t i = 0; i < m; i++) /* The array holds m rows; iterating up to n wrote past its end when n > m. */
+		nlRowColumnConstruct(&(M->row[i]));
+	M->row_capacity = 0; /* NOTE(review): overwrites the m assigned above; kept as-is, confirm whether anything reads it. */
+	M->column = nullptr; /* Column storage is not allocated here. */
+	M->column_capacity = 0;
+	M->diag_size = min(m, n);
+	M->diag_capacity = M->diag_size;
+	M->diag = NL_NEW_ARRAY(double, M->diag_size);
+	NL_CLEAR_ARRAY(double, M->diag, M->diag_size);
+}
+
+static NLMatrix nlCRSMatrixNewFromSparseMatrix(NLSparseMatrix* M)
+{ /* Builds a new CRS matrix holding M's coefficients. Sorts M's rows in place; M itself is not freed. */
+	uint32_t nnz = nlSparseMatrixNNZ(M);
+	uint32_t nslices = 8; /* TODO: get number of cores */
+	uint32_t slice, cur_bound, cur_NNZ, cur_row;
+	uint32_t k;
+	uint32_t slice_size = nnz / nslices; /* Target number of coefficients per slice. */
+	NLCRSMatrix* CRS = NL_NEW(NLCRSMatrix);
+	NL_CLEAR(CRS, NLCRSMatrix);
+	nlCRSMatrixConstruct(CRS, M->m, M->n, nnz, nslices);
+	nlSparseMatrixSort(M);
+	/* Convert matrix to CRS format */
+	k = 0;
+	for (uint32_t i = 0; i < M->m; ++i) {
+		NLRowColumn* Ri = &(M->row[i]);
+		CRS->rowptr[i] = k; /* Row i starts at offset k. */
+		for (uint32_t ij = 0; ij < Ri->size; ij++) {
+			NLCoeff* c = &(Ri->coeff[ij]);
+			CRS->val[k] = c->value;
+			CRS->colind[k] = c->index;
+			++k;
+		}
+	}
+	CRS->rowptr[M->m] = k; /* End marker for the last row. */
+	/* Create "slices" to be used by parallel sparse matrix vector product */
+	if (CRS->sliceptr) {
+		cur_bound = slice_size;
+		cur_NNZ = 0;
+		cur_row = 0;
+		CRS->sliceptr[0] = 0;
+		for (slice = 1; slice < nslices; ++slice) {
+			while (cur_NNZ < cur_bound && cur_row < M->m) {
+				cur_NNZ += CRS->rowptr[cur_row + 1] - CRS->rowptr[cur_row]; /* Count the current row first: cur_row < m here, so rowptr[cur_row + 1] is in range. */
+				++cur_row; /* Advancing first read rowptr[m + 1] on the last row and skipped row 0. */
+			}
+			CRS->sliceptr[slice] = cur_row;
+			cur_bound += slice_size;
+		}
+		CRS->sliceptr[nslices] = M->m; /* The last slice always ends at the final row. */
+	}
+	return (NLMatrix)CRS;
+}
+
+static void nlMatrixCompress(NLMatrix* M)
+{
+	/* Only dynamic sparse matrices are converted; any other type is left untouched. */
+	if ((*M)->type == NL_MATRIX_SPARSE_DYNAMIC) {
+		NLMatrix compressed = nlCRSMatrixNewFromSparseMatrix((NLSparseMatrix*)*M);
+		nlDeleteMatrix(*M); /* The dynamic matrix is released once converted. */
+		*M = compressed;
+	}
+}
+
+static NLContext *nlNewContext()
+{
+	NLContext* context = NL_NEW(NLContext);
+	NL_CLEAR(context, NLContext); /* Everything not set below starts out zeroed. */
+	context->nb_systems = 1;
+	context->omega = 1.5;
+	context->threshold = 1e-6;
+	context->max_iterations = 100;
+	return context;
+}
+
+static void nlDeleteContext(NLContext *context)
+{ /* Releases everything owned by the context, then the context itself. context must not be null. */
+	nlDeleteMatrix(context->M); /* nlDeleteMatrix ignores null handles. */
+	context->M = nullptr;
+	nlDeleteMatrix(context->P);
+	context->P = nullptr;
+	nlDeleteMatrix(context->B);
+	context->B = nullptr;
+	nlRowColumnDestroy(&context->af);
+	nlRowColumnDestroy(&context->al);
+	NL_DELETE_ARRAY(context->variable_value);
+	NL_DELETE_ARRAY(context->variable_buffer);
+	NL_DELETE_ARRAY(context->variable_is_locked);
+	NL_DELETE_ARRAY(context->variable_index);
+	NL_DELETE_ARRAY(context->x);
+	NL_DELETE_ARRAY(context->b);
+	NL_DELETE(context); /* Only nulls the local parameter; the caller's pointer is left dangling. */
+}
+
+static double ddot(int n, const double *x, const double *y)
+{
+	double acc = 0.0; /* Dot product of the first n entries, accumulated front to back. */
+	for (int k = 0; k < n; ++k)
+		acc += x[k] * y[k];
+	return acc;
+}
+
+static void daxpy(int n, double a, const double *x, double *y)
+{
+	for (int k = 0; k < n; ++k) /* y <- a * x + y over the first n entries. */
+		y[k] += a * x[k];
+}
+
+static void dscal(int n, double a, double *x)
+{
+	for (int k = 0; k < n; ++k) /* x <- a * x over the first n entries. */
+		x[k] = x[k] * a;
+}
+
+/*
+ * The implementation of the solvers is inspired by
+ * the lsolver library, by Christian Badura, available from:
+ * http://www.mathematik.uni-freiburg.de
+ * /IAM/Research/projectskr/lin_solver/
+ *
+ * About the Conjugate Gradient, details can be found in:
+ *  Ashby, Manteuffel, Saylor
+ *     A taxonomy for conjugate gradient methods
+ *     SIAM J Numer Anal 27, 1542-1568 (1990)
+ *
+ *  This version is completely abstract, the same code can be used for
+ * CPU/GPU, dense matrix / sparse matrix etc...
+ *  Abstraction is realized through:
+ *   - Abstract matrix interface (NLMatrix), that can implement different
+ *     versions of matrix x vector product (CPU/GPU, sparse/dense ...)
+ */
+
+static uint32_t nlSolveSystem_PRE_CG(NLMatrix M, NLMatrix P, double* b, double* x, double eps, uint32_t max_iter, double *sq_bnorm, double *sq_rnorm)
+{ /* Preconditioned conjugate gradient: refines x in place towards M x = b using preconditioner P; returns the iterations used. */
+	int     N = (int)M->n;
+	double* r = NL_NEW_VECTOR(N); /* Residual, kept as M x - b. */
+	double* d = NL_NEW_VECTOR(N); /* Search direction. */
+	double* h = NL_NEW_VECTOR(N); /* Preconditioned residual P r. */
+	double *Ad = h; /* Shares storage with h: Ad is consumed before h is recomputed in each iteration. */
+	uint32_t its = 0;
+	double rh, alpha, beta;
+	double b_square = ddot(N, b, b);
+	double err = eps * eps*b_square; /* Stop once |r|^2 <= eps^2 * |b|^2. */
+	double curr_err;
+	nlMultMatrixVector(M, x, r); /* r = M x */
+	daxpy(N, -1., b, r); /* r = M x - b */
+	nlMultMatrixVector(P, r, d); /* d = P r */
+	memcpy(h, d, N * sizeof(double)); /* h = d */
+	rh = ddot(N, r, h);
+	curr_err = ddot(N, r, r);
+	while (curr_err > err && its < max_iter) {
+		nlMultMatrixVector(M, d, Ad); /* Ad = M d (overwrites h) */
+		alpha = rh / ddot(N, d, Ad); /* Step length; no guard against a zero denominator. */
+		daxpy(N, -alpha, d, x); /* x -= alpha d */
+		daxpy(N, -alpha, Ad, r); /* r -= alpha M d */
+		nlMultMatrixVector(P, r, h); /* h = P r, from the updated residual */
+		beta = 1. / rh;
+		rh = ddot(N, r, h);
+		beta *= rh; /* beta = new (r.h) / previous (r.h) */
+		dscal(N, beta, d);
+		daxpy(N, 1., h, d); /* d = beta d + h */
+		++its;
+		curr_err = ddot(N, r, r);
+	}
+	NL_DELETE_VECTOR(r);
+	NL_DELETE_VECTOR(d);
+	NL_DELETE_VECTOR(h); /* Also releases Ad, which aliases h. */
+	*sq_bnorm = b_square; /* Squared norm of the right-hand side. */
+	*sq_rnorm = curr_err; /* Squared norm of the final residual. */
+	return its;
+}
+
+static uint32_t nlSolveSystemIterative(NLContext *context, NLMatrix M, NLMatrix P, double* b_in, double* x_in, double eps, uint32_t max_iter)
+{
+	/*
+	 * Runs the preconditioned conjugate gradient solver on one system and
+	 * records the iteration count and error estimate on the context.
+	 * M is expected to be square, as checked by the assertion below.
+	 */
+	XA_DEBUG_ASSERT(M->m == M->n);
+	double rhsNormSq, residualNormSq;
+	const uint32_t iterations = nlSolveSystem_PRE_CG(M, P, b_in, x_in, eps, max_iter, &rhsNormSq, &residualNormSq);
+	const double rhsNorm = sqrt(rhsNormSq);
+	const double residualNorm = sqrt(residualNormSq);
+	/* Report the residual relative to the right-hand side, unless that is zero. */
+	if (rhsNorm == 0.0)
+		context->error = residualNorm;
+	else
+		context->error = residualNorm / rhsNorm;
+	context->used_iterations = iterations;
+	return iterations;
+}
+
+static bool nlSolveIterative(NLContext *context)
+{
+	/* The systems are stored back to back, context->n entries apart, in both b and x. */
+	const uint32_t stride = context->n;
+	double* rhs = context->b;
+	double* solution = context->x;
+	for (uint32_t s = 0; s < context->nb_systems; ++s) {
+		nlSolveSystemIterative(context, context->M, context->P, rhs, solution, context->threshold, context->max_iterations);
+		rhs += stride;
+		solution += stride;
+	}
+	/* Convergence is not inspected here; the result is always true. */
+	return true;
+}
+
+struct NLJacobiPreconditioner /* Diagonal preconditioner; starts with the NLMatrixStruct fields. */
+{
+	uint32_t m; /* Number of rows. */
+	uint32_t n; /* Number of columns. */
+	uint32_t type; /* NL_MATRIX_OTHER. */
+	NLDestroyMatrixFunc destroy_func; /* nlJacobiPreconditionerDestroy. */
+	NLMultMatrixVectorFunc mult_func; /* nlJacobiPreconditionerMult. */
+	double* diag_inv; /* n entries: reciprocal of each diagonal coefficient, or 1.0 where it is zero. */
+};
+
+static void nlJacobiPreconditionerDestroy(NLJacobiPreconditioner* M)
+{ /* destroy_func of the preconditioner: frees diag_inv, not the struct itself. */
+	NL_DELETE_ARRAY(M->diag_inv);
+}
+
+static void nlJacobiPreconditionerMult(NLJacobiPreconditioner* M, const double* x, double* y)
+{ /* mult_func of the preconditioner: y[i] = x[i] * diag_inv[i] for each of the n entries. */
+	for (uint32_t i = 0; i < M->n; ++i)
+		y[i] = x[i] * M->diag_inv[i];
+}
+
+static NLMatrix nlNewJacobiPreconditioner(NLMatrix M_in)
+{ /* Builds a new preconditioner from the cached diagonal of M_in; the result is released with nlDeleteMatrix. */
+	NLSparseMatrix* M = nullptr;
+	NLJacobiPreconditioner* result = nullptr;
+	XA_DEBUG_ASSERT(M_in->type == NL_MATRIX_SPARSE_DYNAMIC); /* The diag array only exists on dynamic sparse matrices. */
+	XA_DEBUG_ASSERT(M_in->m == M_in->n);
+	M = (NLSparseMatrix*)M_in;
+	result = NL_NEW(NLJacobiPreconditioner);
+	NL_CLEAR(result, NLJacobiPreconditioner);
+	result->m = M->m;
+	result->n = M->n;
+	result->type = NL_MATRIX_OTHER;
+	result->destroy_func = (NLDestroyMatrixFunc)nlJacobiPreconditionerDestroy;
+	result->mult_func = (NLMultMatrixVectorFunc)nlJacobiPreconditionerMult;
+	result->diag_inv = NL_NEW_ARRAY(double, M->n);
+	NL_CLEAR_ARRAY(double, result->diag_inv, M->n);
+	for (uint32_t i = 0; i < M->n; ++i)
+		result->diag_inv[i] = (M->diag[i] == 0.0) ? 1.0 : 1.0 / M->diag[i]; /* A zero diagonal maps to 1.0 to avoid dividing by zero. */
+	return (NLMatrix)result;
+}
+
+#define NL_NB_VARIABLES 0x101 /* pname: number of variables. */
+#define NL_MAX_ITERATIONS 0x103 /* pname: solver iteration cap. */
+
+static void nlSolverParameteri(NLContext *context, uint32_t pname, int param)
+{ /* Sets an integer solver parameter; any pname other than the two above is silently ignored. */
+	if (pname == NL_NB_VARIABLES) {
+		XA_DEBUG_ASSERT(param > 0);
+		context->nb_variables = (uint32_t)param;
+	} else if (pname == NL_MAX_ITERATIONS) {
+		XA_DEBUG_ASSERT(param > 0);
+		context->max_iterations = (uint32_t)param;
+		context->max_iterations_defined = true; /* Records that the default of 100 was overridden. */
+	}
+}
+
+static void nlSetVariable(NLContext *context, uint32_t index, double value)
+{ /* Stores value as variable `index` in the first system's buffer. */
+	XA_DEBUG_ASSERT(index < context->nb_variables); /* Unsigned, so only the upper bound matters; "nb_variables - 1" wrapped around at 0. */
+	NL_BUFFER_ITEM(context->variable_buffer[0], index) = value;
+}
+
+static double nlGetVariable(NLContext *context, uint32_t index) // Returns variable `index` from the first variable buffer.
+{
+	XA_DEBUG_ASSERT(index < context->nb_variables); // index is unsigned: one upper-bound check, still meaningful when nb_variables == 0.
+	return NL_BUFFER_ITEM(context->variable_buffer[0], index);
+}
+
+static void nlLockVariable(NLContext *context, uint32_t index) // Flags variable `index` as locked in variable_is_locked.
+{
+	XA_DEBUG_ASSERT(index < context->nb_variables); // index is unsigned: one upper-bound check, still meaningful when nb_variables == 0.
+	context->variable_is_locked[index] = true;
+}
+
+static void nlVariablesToVector(NLContext *context)
+{
+	// Copies every unlocked variable of each system from its variable buffer into the packed vector x.
+	XA_DEBUG_ASSERT(context->x);
+	const uint32_t stride = context->n; // Entries per system in x.
+	for (uint32_t sys = 0; sys < context->nb_systems; ++sys) {
+		for (uint32_t var = 0; var < context->nb_variables; ++var) {
+			if (context->variable_is_locked[var])
+				continue; // Locked variables are not copied.
+			const uint32_t slot = context->variable_index[var];
+			XA_DEBUG_ASSERT(slot < context->n);
+			context->x[slot + sys * stride] = NL_BUFFER_ITEM(context->variable_buffer[sys], var);
+		}
+	}
+}
+
+static void nlVectorToVariables(NLContext *context)
+{
+	// Copies each system's entries of the packed vector x back into the unlocked variables.
+	XA_DEBUG_ASSERT(context->x);
+	const uint32_t stride = context->n; // Entries per system in x.
+	for (uint32_t sys = 0; sys < context->nb_systems; ++sys) {
+		for (uint32_t var = 0; var < context->nb_variables; ++var) {
+			if (context->variable_is_locked[var])
+				continue; // Locked variables keep their current value.
+			const uint32_t slot = context->variable_index[var];
+			XA_DEBUG_ASSERT(slot < context->n);
+			NL_BUFFER_ITEM(context->variable_buffer[sys], var) = context->x[slot + sys * stride];
+		}
+	}
+}
+
+static void nlCoefficient(NLContext *context, uint32_t index, double value) // Appends one (variable, coefficient) term to the row under construction.
+{
+	XA_DEBUG_ASSERT(index < context->nb_variables); // index is unsigned: one upper-bound check, still meaningful when nb_variables == 0.
+	if (context->variable_is_locked[index]) {
+		/*
+		 * Note: in al, indices are NLvariable indices,
+		 * within [0..nb_variables-1]
+		 */
+		nlRowColumnAppend(&(context->al), index, value);
+	} else {
+		/*
+		 * Note: in af, indices are system indices,
+		 * within [0..n-1]
+		 */
+		nlRowColumnAppend(&(context->af), context->variable_index[index], value);
+	}
+}
+
+#define NL_SYSTEM  0x0
+#define NL_MATRIX  0x1
+#define NL_ROW     0x2
+
+static void nlBegin(NLContext *context, uint32_t prim) // Opens an NL_SYSTEM, NL_MATRIX or NL_ROW section; any other prim is ignored.
+{
+	if (prim == NL_SYSTEM) { // Allocate and clear the per-variable storage.
+		XA_DEBUG_ASSERT(context->nb_variables > 0);
+		context->variable_buffer = NL_NEW_ARRAY(NLBufferBinding, context->nb_systems);
+		NL_CLEAR_ARRAY(NLBufferBinding, context->variable_buffer, context->nb_systems);
+		context->variable_value = NL_NEW_ARRAY(double, context->nb_variables * context->nb_systems);
+		NL_CLEAR_ARRAY(double, context->variable_value, context->nb_variables * context->nb_systems);
+		for (uint32_t k = 0; k < context->nb_systems; ++k) { // Bind each system's buffer to its own nb_variables-sized slice of variable_value.
+			context->variable_buffer[k].base_address =
+				context->variable_value +
+				k * context->nb_variables;
+			context->variable_buffer[k].stride = sizeof(double);
+		}
+		context->variable_is_locked = NL_NEW_ARRAY(bool, context->nb_variables);
+		NL_CLEAR_ARRAY(bool, context->variable_is_locked, context->nb_variables);
+		context->variable_index = NL_NEW_ARRAY(uint32_t, context->nb_variables);
+		NL_CLEAR_ARRAY(uint32_t, context->variable_index, context->nb_variables);
+	} else if (prim == NL_MATRIX) { // Number the unlocked variables, then allocate M, x and b.
+		if (context->M)
+			return; // A matrix already exists; skip the setup.
+		uint32_t n = 0;
+		for (uint32_t i = 0; i < context->nb_variables; i++) {
+			if (!context->variable_is_locked[i]) {
+				context->variable_index[i] = n; // Unlocked variables get consecutive system indices.
+				n++;
+			} else
+				context->variable_index[i] = (uint32_t)~0; // Locked variables get an all-ones marker instead of a system index.
+		}
+		context->n = n;
+		if (!context->max_iterations_defined)
+			context->max_iterations = n * 5; // Default cap when NL_MAX_ITERATIONS was never set.
+		context->M = (NLMatrix)(NL_NEW(NLSparseMatrix));
+		NL_CLEAR(context->M, NLSparseMatrix);
+		nlSparseMatrixConstruct((NLSparseMatrix*)(context->M), n, n);
+		context->x = NL_NEW_ARRAY(double, n*context->nb_systems);
+		NL_CLEAR_ARRAY(double, context->x, n*context->nb_systems);
+		context->b = NL_NEW_ARRAY(double, n*context->nb_systems);
+		NL_CLEAR_ARRAY(double, context->b, n*context->nb_systems);
+		nlVariablesToVector(context); // Seed x with the current values of the unlocked variables.
+		nlRowColumnConstruct(&context->af);
+		nlRowColumnConstruct(&context->al);
+		context->current_row = 0;
+	} else if (prim == NL_ROW) { // Prepare the unlocked (af) and locked (al) coefficient lists for a new row.
+		nlRowColumnZero(&context->af);
+		nlRowColumnZero(&context->al);
+	}
+}
+
+static void nlEnd(NLContext *context, uint32_t prim) // Closes an NL_MATRIX or NL_ROW section; any other prim is ignored.
+{
+	if (prim == NL_MATRIX) { // The row accumulators are no longer needed once the matrix is complete.
+		nlRowColumnClear(&context->af);
+		nlRowColumnClear(&context->al);
+	} else if (prim == NL_ROW) { // Fold the finished row into M and b.
+		NLRowColumn*    af = &context->af;
+		NLRowColumn*    al = &context->al;
+		NLSparseMatrix* M = (NLSparseMatrix*)context->M;
+		double* b = context->b;
+		uint32_t nf = af->size; // Number of unlocked-variable terms in this row.
+		uint32_t nl = al->size; // Number of locked-variable terms in this row.
+		uint32_t n = context->n;
+		double S;
+		/*
+		 * least_squares : we want to solve
+		 * A'A x = A'b
+		 */
+		for (uint32_t i = 0; i < nf; i++) { // Pass every pairwise product of the row's unlocked coefficients to nlSparseMatrixAdd.
+			for (uint32_t j = 0; j < nf; j++) {
+				nlSparseMatrixAdd(M, af->coeff[i].index, af->coeff[j].index, af->coeff[i].value * af->coeff[j].value);
+			}
+		}
+		for (uint32_t k = 0; k < context->nb_systems; ++k) {
+			S = 0.0; // S = sum of (locked coefficient * locked variable value) for system k.
+			for (uint32_t jj = 0; jj < nl; ++jj) {
+				uint32_t j = al->coeff[jj].index;
+				S += al->coeff[jj].value * NL_BUFFER_ITEM(context->variable_buffer[k], j);
+			}
+			for (uint32_t jj = 0; jj < nf; jj++) // Subtract the locked contribution from system k's slice of b.
+				b[k*n + af->coeff[jj].index] -= af->coeff[jj].value * S;
+		}
+		context->current_row++;
+	}
+}
+
+static bool nlSolve(NLContext *context) // Rebuilds the preconditioner, runs the iterative solver, then copies x back into the variables.
+{
+	nlDeleteMatrix(context->P); // Discard whatever preconditioner the context currently holds.
+	context->P = nlNewJacobiPreconditioner(context->M); // Built before nlMatrixCompress: the constructor asserts M is NL_MATRIX_SPARSE_DYNAMIC.
+	nlMatrixCompress(&context->M);
+	bool result = nlSolveIterative(context);
+	nlVectorToVariables(context); // Runs regardless of the solver's result.
+	return result;
+}
+} // namespace opennl
+
 namespace raster {
-class ClippedTriangle {
+class ClippedTriangle
+{
 public:
-	ClippedTriangle(const Vector2 &a, const Vector2 &b, const Vector2 &c) {
+	ClippedTriangle(const Vector2 &a, const Vector2 &b, const Vector2 &c)
+	{
 		m_numVertices = 3;
 		m_activeVertexBuffer = 0;
 		m_verticesA[0] = a;
@@ -4077,21 +5191,23 @@ public:
 		m_verticesA[2] = c;
 		m_vertexBuffers[0] = m_verticesA;
 		m_vertexBuffers[1] = m_verticesB;
+		m_area = 0;
 	}
 
-	void clipHorizontalPlane(float offset, float clipdirection) {
-		Vector2 *v = m_vertexBuffers[m_activeVertexBuffer];
+	void clipHorizontalPlane(float offset, float clipdirection)
+	{
+		Vector2 *v  = m_vertexBuffers[m_activeVertexBuffer];
 		m_activeVertexBuffer ^= 1;
 		Vector2 *v2 = m_vertexBuffers[m_activeVertexBuffer];
 		v[m_numVertices] = v[0];
-		float dy2, dy1 = offset - v[0].y;
-		int dy2in, dy1in = clipdirection * dy1 >= 0;
-		uint32_t p = 0;
+		float dy2,   dy1 = offset - v[0].y;
+		int   dy2in, dy1in = clipdirection * dy1 >= 0;
+		uint32_t  p = 0;
 		for (uint32_t k = 0; k < m_numVertices; k++) {
-			dy2 = offset - v[k + 1].y;
+			dy2   = offset - v[k + 1].y;
 			dy2in = clipdirection * dy2 >= 0;
 			if (dy1in) v2[p++] = v[k];
-			if (dy1in + dy2in == 1) { // not both in/out
+			if ( dy1in + dy2in == 1 ) { // not both in/out
 				float dx = v[k + 1].x - v[k].x;
 				float dy = v[k + 1].y - v[k].y;
 				v2[p++] = Vector2(v[k].x + dy1 * (dx / dy), offset);
@@ -4102,19 +5218,20 @@ public:
 		m_numVertices = p;
 	}
 
-	void clipVerticalPlane(float offset, float clipdirection) {
-		Vector2 *v = m_vertexBuffers[m_activeVertexBuffer];
+	void clipVerticalPlane(float offset, float clipdirection)
+	{
+		Vector2 *v  = m_vertexBuffers[m_activeVertexBuffer];
 		m_activeVertexBuffer ^= 1;
 		Vector2 *v2 = m_vertexBuffers[m_activeVertexBuffer];
 		v[m_numVertices] = v[0];
-		float dx2, dx1 = offset - v[0].x;
-		int dx2in, dx1in = clipdirection * dx1 >= 0;
-		uint32_t p = 0;
+		float dx2,   dx1   = offset - v[0].x;
+		int   dx2in, dx1in = clipdirection * dx1 >= 0;
+		uint32_t  p = 0;
 		for (uint32_t k = 0; k < m_numVertices; k++) {
 			dx2 = offset - v[k + 1].x;
 			dx2in = clipdirection * dx2 >= 0;
 			if (dx1in) v2[p++] = v[k];
-			if (dx1in + dx2in == 1) { // not both in/out
+			if ( dx1in + dx2in == 1 ) { // not both in/out
 				float dx = v[k + 1].x - v[k].x;
 				float dy = v[k + 1].y - v[k].y;
 				v2[p++] = Vector2(offset, v[k].y + dx1 * (dy / dx));
@@ -4125,8 +5242,9 @@ public:
 		m_numVertices = p;
 	}
 
-	void computeArea() {
-		Vector2 *v = m_vertexBuffers[m_activeVertexBuffer];
+	void computeArea()
+	{
+		Vector2 *v  = m_vertexBuffers[m_activeVertexBuffer];
 		v[m_numVertices] = v[0];
 		m_area = 0;
 		float centroidx = 0, centroidy = 0;
@@ -4140,7 +5258,8 @@ public:
 		m_area = 0.5f * fabsf(m_area);
 	}
 
-	void clipAABox(float x0, float y0, float x1, float y1) {
+	void clipAABox(float x0, float y0, float x1, float y1)
+	{
 		clipVerticalPlane(x0, -1);
 		clipHorizontalPlane(y0, -1);
 		clipVerticalPlane(x1, 1);
@@ -4148,7 +5267,8 @@ public:
 		computeArea();
 	}
 
-	float area() const {
+	float area() const
+	{
 		return m_area;
 	}
 
@@ -4165,12 +5285,10 @@ private:
 typedef bool (*SamplingCallback)(void *param, int x, int y);
 
 /// A triangle for rasterization.
-struct Triangle {
-	Triangle(const Vector2 &v0, const Vector2 &v1, const Vector2 &v2) {
-		// Init vertices.
-		this->v1 = v0;
-		this->v2 = v2;
-		this->v3 = v1;
+struct Triangle
+{
+	Triangle(const Vector2 &_v0, const Vector2 &_v1, const Vector2 &_v2) : v1(_v0), v2(_v2), v3(_v1)
+	{
 		// make sure every triangle is front facing.
 		flipBackface();
 		// Compute deltas.
@@ -4178,7 +5296,8 @@ struct Triangle {
 			computeUnitInwardNormals();
 	}
 
-	bool isValid() {
+	bool isValid()
+	{
 		const Vector2 e0 = v3 - v1;
 		const Vector2 e1 = v2 - v1;
 		const float area = e0.y * e1.x - e1.y * e0.x;
@@ -4186,17 +5305,18 @@ struct Triangle {
 	}
 
 	// extents has to be multiple of BK_SIZE!!
-	bool drawAA(const Vector2 &extents, SamplingCallback cb, void *param) {
-		const float PX_INSIDE = 1.0f / sqrtf(2.0f);
-		const float PX_OUTSIDE = -1.0f / sqrtf(2.0f);
+	bool drawAA(const Vector2 &extents, SamplingCallback cb, void *param)
+	{
+		const float PX_INSIDE = 1.0f/sqrtf(2.0f);
+		const float PX_OUTSIDE = -1.0f/sqrtf(2.0f);
 		const float BK_SIZE = 8;
-		const float BK_INSIDE = sqrtf(BK_SIZE * BK_SIZE / 2.0f);
-		const float BK_OUTSIDE = -sqrtf(BK_SIZE * BK_SIZE / 2.0f);
+		const float BK_INSIDE = sqrtf(BK_SIZE*BK_SIZE/2.0f);
+		const float BK_OUTSIDE = -sqrtf(BK_SIZE*BK_SIZE/2.0f);
 		// Bounding rectangle
 		float minx = floorf(max(min3(v1.x, v2.x, v3.x), 0.0f));
 		float miny = floorf(max(min3(v1.y, v2.y, v3.y), 0.0f));
-		float maxx = ceilf(min(max3(v1.x, v2.x, v3.x), extents.x - 1.0f));
-		float maxy = ceilf(min(max3(v1.y, v2.y, v3.y), extents.y - 1.0f));
+		float maxx = ceilf( min(max3(v1.x, v2.x, v3.x), extents.x - 1.0f));
+		float maxy = ceilf( min(max3(v1.y, v2.y, v3.y), extents.y - 1.0f));
 		// There's no reason to align the blocks to the viewport, instead we align them to the origin of the triangle bounds.
 		minx = floorf(minx);
 		miny = floorf(miny);
@@ -4221,9 +5341,9 @@ struct Triangle {
 				float bC = C2 + n2.x * xc + n2.y * yc;
 				float cC = C3 + n3.x * xc + n3.y * yc;
 				// Skip block when outside an edge
-				if ((aC <= BK_OUTSIDE) || (bC <= BK_OUTSIDE) || (cC <= BK_OUTSIDE)) continue;
+				if ( (aC <= BK_OUTSIDE) || (bC <= BK_OUTSIDE) || (cC <= BK_OUTSIDE) ) continue;
 				// Accept whole block when totally covered
-				if ((aC >= BK_INSIDE) && (bC >= BK_INSIDE) && (cC >= BK_INSIDE)) {
+				if ( (aC >= BK_INSIDE) && (bC >= BK_INSIDE) && (cC >= BK_INSIDE) ) {
 					for (float y = y0; y < y0 + BK_SIZE; y++) {
 						for (float x = x0; x < x0 + BK_SIZE; x++) {
 							if (!cb(param, (int)x, (int)y))
@@ -4266,9 +5386,10 @@ struct Triangle {
 	}
 
 private:
-	void flipBackface() {
+	void flipBackface()
+	{
 		// check if triangle is backfacing, if so, swap two vertices
-		if (((v3.x - v1.x) * (v2.y - v1.y) - (v3.y - v1.y) * (v2.x - v1.x)) < 0) {
+		if ( ((v3.x - v1.x) * (v2.y - v1.y) - (v3.y - v1.y) * (v2.x - v1.x)) < 0 ) {
 			Vector2 hv = v1;
 			v1 = v2;
 			v2 = hv; // swap pos
@@ -4276,7 +5397,8 @@ private:
 	}
 
 	// compute unit inward normals for each edge.
-	void computeUnitInwardNormals() {
+	void computeUnitInwardNormals()
+	{
 		n1 = v1 - v2;
 		n1 = Vector2(-n1.y, n1.x);
 		n1 = n1 * (1.0f / sqrtf(dot(n1, n1)));
@@ -4294,7 +5416,8 @@ private:
 };
 
 // Process the given triangle. Returns false if rasterization was interrupted by the callback.
-static bool drawTriangle(const Vector2 &extents, const Vector2 v[3], SamplingCallback cb, void *param) {
+static bool drawTriangle(const Vector2 &extents, const Vector2 v[3], SamplingCallback cb, void *param)
+{
 	Triangle tri(v[0], v[1], v[2]);
 	// @@ It would be nice to have a conservative drawing mode that enlarges the triangle extents by one texel and is able to handle degenerate triangles.
 	// @@ Maybe the simplest thing to do would be raster triangle edges.
@@ -4305,251 +5428,26 @@ static bool drawTriangle(const Vector2 &extents, const Vector2 v[3], SamplingCal
 
 } // namespace raster
 
-// Full and sparse vector and matrix classes. BLAS subset.
-// Pseudo-BLAS interface.
-namespace sparse {
-
-/**
-* Sparse matrix class. The matrix is assumed to be sparse and to have
-* very few non-zero elements, for this reason it's stored in indexed
-* format. To multiply column vectors efficiently, the matrix stores
-* the elements in indexed-column order, there is a list of indexed
-* elements for each row of the matrix. As with the FullVector the
-* dimension of the matrix is constant.
-**/
-class Matrix {
-public:
-	// An element of the sparse array.
-	struct Coefficient {
-		uint32_t x; // column
-		float v; // value
-	};
-
-	Matrix(uint32_t d) :
-			m_width(d), m_array(MemTag::Matrix) {
-		m_array.resize(d);
-		m_array.runCtors();
-#if XA_DEBUG_HEAP
-		for (uint32_t i = 0; i < d; i++)
-			m_array[i].setMemTag(MemTag::Matrix);
-#endif
-	}
-
-	Matrix(uint32_t w, uint32_t h) :
-			m_width(w), m_array(MemTag::Matrix) {
-		m_array.resize(h);
-		m_array.runCtors();
-#if XA_DEBUG_HEAP
-		for (uint32_t i = 0; i < h; i++)
-			m_array[i].setMemTag(MemTag::Matrix);
-#endif
-	}
-
-	~Matrix() {
-		m_array.runDtors();
-	}
-
-	Matrix(const Matrix &m) = delete;
-	Matrix &operator=(const Matrix &m) = delete;
-	uint32_t width() const { return m_width; }
-	uint32_t height() const { return m_array.size(); }
-	bool isSquare() const { return width() == height(); }
-
-	// x is column, y is row
-	float getCoefficient(uint32_t x, uint32_t y) const {
-		XA_DEBUG_ASSERT(x < width());
-		XA_DEBUG_ASSERT(y < height());
-		const uint32_t count = m_array[y].size();
-		for (uint32_t i = 0; i < count; i++) {
-			if (m_array[y][i].x == x) return m_array[y][i].v;
-		}
-		return 0.0f;
-	}
-
-	void setCoefficient(uint32_t x, uint32_t y, float f) {
-		XA_DEBUG_ASSERT(x < width());
-		XA_DEBUG_ASSERT(y < height());
-		const uint32_t count = m_array[y].size();
-		for (uint32_t i = 0; i < count; i++) {
-			if (m_array[y][i].x == x) {
-				m_array[y][i].v = f;
-				return;
-			}
-		}
-		if (f != 0.0f) {
-			Coefficient c = { x, f };
-			m_array[y].push_back(c);
-		}
-	}
-
-	float dotRow(uint32_t y, const FullVector &v) const {
-		XA_DEBUG_ASSERT(y < height());
-		const uint32_t count = m_array[y].size();
-		float sum = 0;
-		for (uint32_t i = 0; i < count; i++) {
-			sum += m_array[y][i].v * v[m_array[y][i].x];
-		}
-		return sum;
-	}
-
-	void madRow(uint32_t y, float alpha, FullVector &v) const {
-		XA_DEBUG_ASSERT(y < height());
-		const uint32_t count = m_array[y].size();
-		for (uint32_t i = 0; i < count; i++) {
-			v[m_array[y][i].x] += alpha * m_array[y][i].v;
-		}
-	}
-
-	void clearRow(uint32_t y) {
-		XA_DEBUG_ASSERT(y < height());
-		m_array[y].clear();
-	}
-
-	const Array<Coefficient> &getRow(uint32_t y) const { return m_array[y]; }
-
-private:
-	/// Number of columns.
-	const uint32_t m_width;
-
-	/// Array of matrix elements.
-	Array<Array<Coefficient>> m_array;
-};
-
-// y = a * x + y
-static void saxpy(float a, const FullVector &x, FullVector &y) {
-	XA_DEBUG_ASSERT(x.dimension() == y.dimension());
-	const uint32_t dim = x.dimension();
-	for (uint32_t i = 0; i < dim; i++) {
-		y[i] += a * x[i];
-	}
-}
-
-static void copy(const FullVector &x, FullVector &y) {
-	XA_DEBUG_ASSERT(x.dimension() == y.dimension());
-	const uint32_t dim = x.dimension();
-	for (uint32_t i = 0; i < dim; i++) {
-		y[i] = x[i];
-	}
-}
-
-static void scal(float a, FullVector &x) {
-	const uint32_t dim = x.dimension();
-	for (uint32_t i = 0; i < dim; i++) {
-		x[i] *= a;
-	}
-}
-
-static float dot(const FullVector &x, const FullVector &y) {
-	XA_DEBUG_ASSERT(x.dimension() == y.dimension());
-	const uint32_t dim = x.dimension();
-	float sum = 0;
-	for (uint32_t i = 0; i < dim; i++) {
-		sum += x[i] * y[i];
-	}
-	return sum;
-}
-
-// y = M * x
-static void mult(const Matrix &M, const FullVector &x, FullVector &y) {
-	uint32_t w = M.width();
-	uint32_t h = M.height();
-	XA_DEBUG_ASSERT(w == x.dimension());
-	XA_UNUSED(w);
-	XA_DEBUG_ASSERT(h == y.dimension());
-	for (uint32_t i = 0; i < h; i++)
-		y[i] = M.dotRow(i, x);
-}
-
-// y = alpha*A*x + beta*y
-static void sgemv(float alpha, const Matrix &A, const FullVector &x, float beta, FullVector &y) {
-	const uint32_t w = A.width();
-	const uint32_t h = A.height();
-	XA_DEBUG_ASSERT(w == x.dimension());
-	XA_DEBUG_ASSERT(h == y.dimension());
-	XA_UNUSED(w);
-	XA_UNUSED(h);
-	for (uint32_t i = 0; i < h; i++)
-		y[i] = alpha * A.dotRow(i, x) + beta * y[i];
-}
-
-// dot y-row of A by x-column of B
-static float dotRowColumn(int y, const Matrix &A, int x, const Matrix &B) {
-	const Array<Matrix::Coefficient> &row = A.getRow(y);
-	const uint32_t count = row.size();
-	float sum = 0.0f;
-	for (uint32_t i = 0; i < count; i++) {
-		const Matrix::Coefficient &c = row[i];
-		sum += c.v * B.getCoefficient(x, c.x);
-	}
-	return sum;
-}
-
-static void transpose(const Matrix &A, Matrix &B) {
-	XA_DEBUG_ASSERT(A.width() == B.height());
-	XA_DEBUG_ASSERT(B.width() == A.height());
-	const uint32_t w = A.width();
-	for (uint32_t x = 0; x < w; x++) {
-		B.clearRow(x);
-	}
-	const uint32_t h = A.height();
-	for (uint32_t y = 0; y < h; y++) {
-		const Array<Matrix::Coefficient> &row = A.getRow(y);
-		const uint32_t count = row.size();
-		for (uint32_t i = 0; i < count; i++) {
-			const Matrix::Coefficient &c = row[i];
-			XA_DEBUG_ASSERT(c.x < w);
-			B.setCoefficient(y, c.x, c.v);
-		}
-	}
-}
-
-static void sgemm(float alpha, const Matrix &A, const Matrix &B, float beta, Matrix &C) {
-	const uint32_t w = C.width();
-	const uint32_t h = C.height();
-#if XA_DEBUG
-	const uint32_t aw = A.width();
-	const uint32_t ah = A.height();
-	const uint32_t bw = B.width();
-	const uint32_t bh = B.height();
-	XA_DEBUG_ASSERT(aw == bh);
-	XA_DEBUG_ASSERT(bw == ah);
-	XA_DEBUG_ASSERT(w == bw);
-	XA_DEBUG_ASSERT(h == ah);
-#endif
-	for (uint32_t y = 0; y < h; y++) {
-		for (uint32_t x = 0; x < w; x++) {
-			float c = beta * C.getCoefficient(x, y);
-			// dot y-row of A by x-column of B.
-			c += alpha * dotRowColumn(y, A, x, B);
-			C.setCoefficient(x, y, c);
-		}
-	}
-}
-
-// C = A * B
-static void mult(const Matrix &A, const Matrix &B, Matrix &C) {
-	sgemm(1.0f, A, B, 0.0f, C);
-}
-
-} // namespace sparse
-
 namespace segment {
 
 // - Insertion is o(n)
 // - Smallest element goes at the end, so that popping it is o(1).
-struct CostQueue {
-	CostQueue(uint32_t size = UINT32_MAX) :
-			m_maxSize(size), m_pairs(MemTag::SegmentAtlasChartCandidates) {}
+struct CostQueue
+{
+	CostQueue(uint32_t size = UINT32_MAX) : m_maxSize(size), m_pairs(MemTag::SegmentAtlasChartCandidates) {}
 
-	float peekCost() const {
+	float peekCost() const
+	{
 		return m_pairs.back().cost;
 	}
 
-	uint32_t peekFace() const {
+	uint32_t peekFace() const
+	{
 		return m_pairs.back().face;
 	}
 
-	void push(float cost, uint32_t face) {
+	void push(float cost, uint32_t face)
+	{
 		const Pair p = { cost, face };
 		if (m_pairs.isEmpty() || cost < peekCost())
 			m_pairs.push_back(p);
@@ -4566,25 +5464,29 @@ struct CostQueue {
 		}
 	}
 
-	uint32_t pop() {
+	uint32_t pop()
+	{
 		XA_DEBUG_ASSERT(!m_pairs.isEmpty());
 		uint32_t f = m_pairs.back().face;
 		m_pairs.pop_back();
 		return f;
 	}
 
-	XA_INLINE void clear() {
+	XA_INLINE void clear()
+	{
 		m_pairs.clear();
 	}
 
-	XA_INLINE uint32_t count() const {
+	XA_INLINE uint32_t count() const
+	{
 		return m_pairs.size();
 	}
 
 private:
 	const uint32_t m_maxSize;
 
-	struct Pair {
+	struct Pair
+	{
 		float cost;
 		uint32_t face;
 	};
@@ -4592,86 +5494,92 @@ private:
 	Array<Pair> m_pairs;
 };
 
-struct Chart {
-	Chart() :
-			faces(MemTag::SegmentAtlasChartFaces) {}
-
-	int id = -1;
-	Basis basis; // Best fit normal.
-	float area = 0.0f;
-	float boundaryLength = 0.0f;
-	Vector3 centroidSum = Vector3(0.0f); // Sum of chart face centroids.
-	Vector3 centroid = Vector3(0.0f); // Average centroid of chart faces.
-	Array<uint32_t> seeds;
-	Array<uint32_t> faces;
-	Array<uint32_t> failedPlanarRegions;
-	CostQueue candidates;
-};
-
-struct Atlas {
-	Atlas() :
-			m_edgeLengths(MemTag::SegmentAtlasMeshData), m_faceAreas(MemTag::SegmentAtlasMeshData), m_faceNormals(MemTag::SegmentAtlasMeshData), m_texcoords(MemTag::SegmentAtlasMeshData), m_bestTriangles(10), m_nextPlanarRegionFace(MemTag::SegmentAtlasPlanarRegions), m_facePlanarRegionId(MemTag::SegmentAtlasPlanarRegions) {}
-
-	~Atlas() {
-		const uint32_t chartCount = m_charts.size();
-		for (uint32_t i = 0; i < chartCount; i++) {
-			m_charts[i]->~Chart();
-			XA_FREE(m_charts[i]);
-		}
-	}
+struct AtlasData
+{
+	ChartOptions options;
+	const Mesh *mesh = nullptr;
+	Array<float> edgeDihedralAngles;
+	Array<float> edgeLengths;
+	Array<float> faceAreas;
+	Array<Vector3> faceNormals;
+	BitArray isFaceInChart;
 
-	uint32_t facesLeft() const { return m_facesLeft; }
-	uint32_t chartCount() const { return m_charts.size(); }
-	const Array<uint32_t> &chartFaces(uint32_t i) const { return m_charts[i]->faces; }
-	const Basis &chartBasis(uint32_t chartIndex) const { return m_charts[chartIndex]->basis; }
+	AtlasData() : edgeDihedralAngles(MemTag::SegmentAtlasMeshData), edgeLengths(MemTag::SegmentAtlasMeshData), faceAreas(MemTag::SegmentAtlasMeshData), faceNormals(MemTag::SegmentAtlasMeshData) {}
 
-	void reset(uint32_t meshId, uint32_t chartGroupId, const Mesh *mesh, const ChartOptions &options) {
-		XA_UNUSED(meshId);
-		XA_UNUSED(chartGroupId);
-		XA_PROFILE_START(buildAtlasInit)
-		m_mesh = mesh;
-		const uint32_t faceCount = m_mesh->faceCount();
-		m_facesLeft = faceCount;
-		m_options = options;
-		m_rand.reset();
-		const uint32_t chartCount = m_charts.size();
-		for (uint32_t i = 0; i < chartCount; i++) {
-			m_charts[i]->~Chart();
-			XA_FREE(m_charts[i]);
-		}
-		m_charts.clear();
-		m_faceCharts.resize(faceCount);
-		m_faceCharts.setAll(-1);
-		m_texcoords.resize(faceCount * 3);
-		// Precompute edge lengths and face areas.
-		const uint32_t edgeCount = m_mesh->edgeCount();
-		m_edgeLengths.resize(edgeCount);
-		m_faceAreas.resize(faceCount);
-		m_faceNormals.resize(faceCount);
+	void compute()
+	{
+		const uint32_t faceCount = mesh->faceCount();
+		const uint32_t edgeCount = mesh->edgeCount();
+		edgeDihedralAngles.resize(edgeCount);
+		edgeLengths.resize(edgeCount);
+		faceAreas.resize(faceCount);
+		faceNormals.resize(faceCount);
+		isFaceInChart.resize(faceCount);
+		isFaceInChart.zeroOutMemory();
 		for (uint32_t f = 0; f < faceCount; f++) {
 			for (uint32_t i = 0; i < 3; i++) {
 				const uint32_t edge = f * 3 + i;
-				const Vector3 &p0 = mesh->position(m_mesh->vertexAt(meshEdgeIndex0(edge)));
-				const Vector3 &p1 = mesh->position(m_mesh->vertexAt(meshEdgeIndex1(edge)));
-				m_edgeLengths[edge] = length(p1 - p0);
-				XA_DEBUG_ASSERT(m_edgeLengths[edge] > 0.0f);
+				const Vector3 &p0 = mesh->position(mesh->vertexAt(meshEdgeIndex0(edge)));
+				const Vector3 &p1 = mesh->position(mesh->vertexAt(meshEdgeIndex1(edge)));
+				edgeLengths[edge] = length(p1 - p0);
+				XA_DEBUG_ASSERT(edgeLengths[edge] > 0.0f);
+			}
+			faceAreas[f] = mesh->computeFaceArea(f);
+			XA_DEBUG_ASSERT(faceAreas[f] > 0.0f);
+			faceNormals[f] = mesh->computeFaceNormal(f);
+		}
+		for (uint32_t face = 0; face < faceCount; face++) {
+			for (uint32_t i = 0; i < 3; i++) {
+				const uint32_t edge = face * 3 + i;
+				const uint32_t oedge = mesh->oppositeEdge(edge);
+				if (oedge == UINT32_MAX)
+					edgeDihedralAngles[edge] = FLT_MAX;
+				else {
+					const uint32_t oface = meshEdgeFace(oedge);
+					edgeDihedralAngles[edge] = edgeDihedralAngles[oedge] = dot(faceNormals[face], faceNormals[oface]);
+				}
 			}
-			m_faceAreas[f] = m_mesh->computeFaceArea(f);
-			XA_DEBUG_ASSERT(m_faceAreas[f] > 0.0f);
-			m_faceNormals[f] = m_mesh->computeFaceNormal(f);
 		}
+	}
+};
+
+#if XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS
+static uint32_t s_planarRegionsCurrentRegion;
+static uint32_t s_planarRegionsCurrentVertex;
+#endif
+
+struct PlanarCharts
+{
+	PlanarCharts(AtlasData &data) : m_data(data), m_nextRegionFace(MemTag::SegmentAtlasPlanarRegions), m_faceToRegionId(MemTag::SegmentAtlasPlanarRegions) {}
+	const Basis &chartBasis(uint32_t chartIndex) const { return m_chartBasis[chartIndex]; }
+	uint32_t chartCount() const { return m_charts.size(); }
+	
+	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const
+	{
+		const Chart &chart = m_charts[chartIndex];
+		return ConstArrayView<uint32_t>(&m_chartFaces[chart.firstFace], chart.faceCount);
+	}
+
+	uint32_t regionIdFromFace(uint32_t face) const { return m_faceToRegionId[face]; }
+	uint32_t nextRegionFace(uint32_t face) const { return m_nextRegionFace[face]; }
+	float regionArea(uint32_t region) const { return m_regionAreas[region]; }
+
+	void compute()
+	{
+		const uint32_t faceCount = m_data.mesh->faceCount();
 		// Precompute regions of coplanar incident faces.
-		m_nextPlanarRegionFace.resize(faceCount);
-		m_facePlanarRegionId.resize(faceCount);
+		m_regionFirstFace.clear();
+		m_nextRegionFace.resize(faceCount);
+		m_faceToRegionId.resize(faceCount);
 		for (uint32_t f = 0; f < faceCount; f++) {
-			m_nextPlanarRegionFace[f] = f;
-			m_facePlanarRegionId[f] = UINT32_MAX;
+			m_nextRegionFace[f] = f;
+			m_faceToRegionId[f] = UINT32_MAX;
 		}
 		Array<uint32_t> faceStack;
 		faceStack.reserve(min(faceCount, 16u));
-		uint32_t planarRegionCount = 0;
+		uint32_t regionCount = 0;
 		for (uint32_t f = 0; f < faceCount; f++) {
-			if (m_nextPlanarRegionFace[f] != f)
+			if (m_nextRegionFace[f] != f)
 				continue; // Already assigned.
 			faceStack.clear();
 			faceStack.push_back(f);
@@ -4679,48 +5587,207 @@ struct Atlas {
 				if (faceStack.isEmpty())
 					break;
 				const uint32_t face = faceStack.back();
-				m_facePlanarRegionId[face] = planarRegionCount;
+				m_faceToRegionId[face] = regionCount;
 				faceStack.pop_back();
-				for (Mesh::FaceEdgeIterator it(m_mesh, face); !it.isDone(); it.advance()) {
+				for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
 					const uint32_t oface = it.oppositeFace();
 					if (it.isBoundary())
 						continue;
-					if (m_nextPlanarRegionFace[oface] != oface)
+					if (m_nextRegionFace[oface] != oface)
 						continue; // Already assigned.
-					if (!equal(dot(m_faceNormals[face], m_faceNormals[oface]), 1.0f, kEpsilon))
+					if (!equal(dot(m_data.faceNormals[face], m_data.faceNormals[oface]), 1.0f, kEpsilon))
 						continue; // Not coplanar.
-					const uint32_t next = m_nextPlanarRegionFace[face];
-					m_nextPlanarRegionFace[face] = oface;
-					m_nextPlanarRegionFace[oface] = next;
-					m_facePlanarRegionId[oface] = planarRegionCount;
+					const uint32_t next = m_nextRegionFace[face];
+					m_nextRegionFace[face] = oface;
+					m_nextRegionFace[oface] = next;
+					m_faceToRegionId[oface] = regionCount;
 					faceStack.push_back(oface);
 				}
 			}
-			planarRegionCount++;
+			m_regionFirstFace.push_back(f);
+			regionCount++;
 		}
 #if XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS
-		char filename[256];
-		XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_planar_regions.obj", meshId, chartGroupId);
-		FILE *file;
-		XA_FOPEN(file, filename, "w");
-		if (file) {
-			m_mesh->writeObjVertices(file);
-			fprintf(file, "s off\n");
-			for (uint32_t i = 0; i < planarRegionCount; i++) {
-				fprintf(file, "o region%u\n", i);
-				for (uint32_t j = 0; j < faceCount; j++) {
-					if (m_facePlanarRegionId[j] == i)
-						m_mesh->writeObjFace(file, j);
+		static std::mutex s_mutex;
+		{
+			std::lock_guard<std::mutex> lock(s_mutex);
+			FILE *file;
+			XA_FOPEN(file, "debug_mesh_planar_regions.obj", s_planarRegionsCurrentRegion == 0 ? "w" : "a");
+			if (file) {
+				m_data.mesh->writeObjVertices(file);
+				fprintf(file, "s off\n");
+				for (uint32_t i = 0; i < regionCount; i++) {
+					fprintf(file, "o region%u\n", s_planarRegionsCurrentRegion);
+					for (uint32_t j = 0; j < faceCount; j++) {
+						if (m_faceToRegionId[j] == i)
+							m_data.mesh->writeObjFace(file, j, s_planarRegionsCurrentVertex);
+					}
+					s_planarRegionsCurrentRegion++;
 				}
+				s_planarRegionsCurrentVertex += m_data.mesh->vertexCount();
+				fclose(file);
 			}
-			fclose(file);
 		}
 #endif
-		XA_PROFILE_END(buildAtlasInit)
+		// Precompute planar region areas.
+		m_regionAreas.resize(regionCount);
+		m_regionAreas.zeroOutMemory();
+		for (uint32_t f = 0; f < faceCount; f++)
+			m_regionAreas[m_faceToRegionId[f]] += m_data.faceAreas[f];
+		// Create charts from suitable planar regions.
+		// The dihedral angle of all boundary edges must be >= 90 degrees.
+		m_charts.clear();
+		m_chartFaces.clear();
+		for (uint32_t region = 0; region < regionCount; region++) {
+			const uint32_t firstRegionFace = m_regionFirstFace[region];
+			uint32_t face = firstRegionFace;
+			bool createChart = true;
+			do {
+				for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
+					if (it.isBoundary())
+						continue; // Ignore mesh boundary edges.
+					const uint32_t oface = it.oppositeFace();
+					if (m_faceToRegionId[oface] == region)
+						continue; // Ignore internal edges.
+					const float angle = m_data.edgeDihedralAngles[it.edge()];
+					if (angle > 0.0f && angle < FLT_MAX) { // FLT_MAX on boundaries.
+						createChart = false;
+						break;
+					}
+				}
+				if (!createChart)
+					break;
+				face = m_nextRegionFace[face];
+			}
+			while (face != firstRegionFace);
+			// Create a chart.
+			if (createChart) {
+				Chart chart;
+				chart.firstFace = m_chartFaces.size();
+				chart.faceCount = 0;
+				face = firstRegionFace;
+				do {
+					m_data.isFaceInChart.set(face);
+					m_chartFaces.push_back(face);
+					chart.faceCount++;
+					face = m_nextRegionFace[face];
+				}
+				while (face != firstRegionFace);
+				m_charts.push_back(chart);
+			}
+		}
+		// Compute basis for each chart using the first face normal (all faces have the same normal).
+		m_chartBasis.resize(m_charts.size());
+		for (uint32_t c = 0; c < m_charts.size(); c++)
+		{
+			const uint32_t face = m_chartFaces[m_charts[c].firstFace];
+			Basis &basis = m_chartBasis[c];
+			basis.normal = m_data.faceNormals[face];
+			basis.tangent = Basis::computeTangent(basis.normal);
+			basis.bitangent = Basis::computeBitangent(basis.normal, basis.tangent);
+		}
+	}
+
+private:
+	struct Chart
+	{
+		uint32_t firstFace, faceCount;
+	};
+
+	AtlasData &m_data;
+	Array<uint32_t> m_regionFirstFace;
+	Array<uint32_t> m_nextRegionFace;
+	Array<uint32_t> m_faceToRegionId;
+	Array<float> m_regionAreas;
+	Array<Chart> m_charts;
+	Array<uint32_t> m_chartFaces;
+	Array<Basis> m_chartBasis;
+};
+
+struct ClusteredCharts
+{
+	ClusteredCharts(AtlasData &data, const PlanarCharts &planarCharts) : m_data(data), m_planarCharts(planarCharts), m_texcoords(MemTag::SegmentAtlasMeshData), m_bestTriangles(10), m_placingSeeds(false) {}
+
+	~ClusteredCharts()
+	{
+		for (uint32_t c = 0, count = m_charts.size(); c < count; c++) { // Run each chart's destructor by hand, then release its storage.
+			Chart *chart = m_charts[c];
+			chart->~Chart();
+			XA_FREE(chart);
+		}
+	}
 
-	void placeSeeds(float threshold) {
-		XA_PROFILE_START(buildAtlasPlaceSeeds)
+	uint32_t chartCount() const { return m_charts.size(); }
+	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const { return m_charts[chartIndex]->faces; }
+	const Basis &chartBasis(uint32_t chartIndex) const { return m_charts[chartIndex]->basis; }
+
+	void compute() // Rebuilds the clustered charts over the faces not already flagged in m_data.isFaceInChart.
+	{
+		const uint32_t faceCount = m_data.mesh->faceCount();
+		m_facesLeft = 0;
+		for (uint32_t i = 0; i < faceCount; i++) { // Count the faces that are not yet in any chart.
+			if (!m_data.isFaceInChart.get(i))
+				m_facesLeft++;
+		}
+		const uint32_t chartCount = m_charts.size();
+		for (uint32_t i = 0; i < chartCount; i++) { // Destroy and free any charts already held.
+			m_charts[i]->~Chart();
+			XA_FREE(m_charts[i]);
+		}
+		m_charts.clear();
+		m_faceCharts.resize(faceCount);
+		m_faceCharts.fill(-1); // -1 marks a face with no clustered chart.
+		m_texcoords.resize(faceCount * 3);
+		if (m_facesLeft == 0)
+			return; // Every face is already in a chart; nothing to cluster.
+		// Create initial charts greedily.
+		placeSeeds(m_data.options.maxCost * 0.5f);
+		if (m_data.options.maxIterations == 0) { // No refinement requested: the seeded charts are the result.
+			XA_DEBUG_ASSERT(m_facesLeft == 0);
+			return;
+		}
+		relocateSeeds();
+		resetCharts();
+		// Restart process growing charts in parallel.
+		uint32_t iteration = 0;
+		for (;;) {
+			growCharts(m_data.options.maxCost);
+			// When charts cannot grow more: fill holes, merge charts, relocate seeds and start new iteration.
+			fillHoles(m_data.options.maxCost * 0.5f);
+#if XA_MERGE_CHARTS
+			mergeCharts();
+#endif
+			if (++iteration == m_data.options.maxIterations)
+				break; // Iteration budget reached.
+			if (!relocateSeeds())
+				break; // relocateSeeds() returned false; stop iterating.
+			resetCharts();
+		}
+		// Make sure no holes are left!
+		XA_DEBUG_ASSERT(m_facesLeft == 0);
+	}
+
+private:
+	struct Chart // Per-chart state of ClusteredCharts.
+	{
+		Chart() : faces(MemTag::SegmentAtlasChartFaces) {}
+
+		int id = -1;
+		Basis basis; // Best fit normal.
+		float area = 0.0f;
+		float boundaryLength = 0.0f;
+		Vector3 centroidSum = Vector3(0.0f); // Sum of chart face centroids.
+		Vector3 centroid = Vector3(0.0f); // Average centroid of chart faces.
+		Array<uint32_t> faces;
+		Array<uint32_t> failedPlanarRegions;
+		CostQueue candidates;
+		uint32_t seed = UINT32_MAX; // Initialized like the other scalar members so it is never read indeterminate; UINT32_MAX presumably means "no seed yet" - confirm against createChart.
+	};
+
+	void placeSeeds(float threshold)
+	{
+		XA_PROFILE_START(clusteredChartsPlaceSeeds)
+		m_placingSeeds = true;
 		// Instead of using a predefiened number of seeds:
 		// - Add seeds one by one, growing chart until a certain treshold.
 		// - Undo charts and restart growing process.
@@ -4728,13 +5795,15 @@ struct Atlas {
 		//   - those points can be found using a simple flood filling algorithm.
 		//   - how do we weight the probabilities?
 		while (m_facesLeft > 0)
-			createRandomChart(threshold);
-		XA_PROFILE_END(buildAtlasPlaceSeeds)
+			createChart(threshold);
+		m_placingSeeds = false;
+		XA_PROFILE_END(clusteredChartsPlaceSeeds)
 	}
 
 	// Returns true if any of the charts can grow more.
-	void growCharts(float threshold) {
-		XA_PROFILE_START(buildAtlasGrowCharts)
+	void growCharts(float threshold)
+	{
+		XA_PROFILE_START(clusteredChartsGrow)
 		for (;;) {
 			if (m_facesLeft == 0)
 				break;
@@ -4752,7 +5821,7 @@ struct Atlas {
 						break;
 					cost = chart->candidates.peekCost();
 					face = chart->candidates.peekFace();
-					if (m_faceCharts[face] == -1)
+					if (!m_data.isFaceInChart.get(face))
 						break;
 					else {
 						// Face belongs to another chart. Pop from queue so the next best candidate can be retrieved.
@@ -4774,21 +5843,28 @@ struct Atlas {
 			Chart *chart = m_charts[bestChart];
 			chart->candidates.pop(); // Pop the selected candidate from the queue.
 			if (!addFaceToChart(chart, bestFace))
-				chart->failedPlanarRegions.push_back(m_facePlanarRegionId[bestFace]);
+				chart->failedPlanarRegions.push_back(m_planarCharts.regionIdFromFace(bestFace));
 		}
-		XA_PROFILE_END(buildAtlasGrowCharts)
+		XA_PROFILE_END(clusteredChartsGrow)
 	}
 
-	void resetCharts() {
-		XA_PROFILE_START(buildAtlasResetCharts)
-		const uint32_t faceCount = m_mesh->faceCount();
-		for (uint32_t i = 0; i < faceCount; i++)
+	void resetCharts()
+	{
+		XA_PROFILE_START(clusteredChartsReset)
+		const uint32_t faceCount = m_data.mesh->faceCount();
+		for (uint32_t i = 0; i < faceCount; i++) {
+			if (m_faceCharts[i] != -1)
+				m_data.isFaceInChart.unset(i);
 			m_faceCharts[i] = -1;
-		m_facesLeft = faceCount;
+		}
+		m_facesLeft = 0;
+		for (uint32_t i = 0; i < faceCount; i++) {
+			if (!m_data.isFaceInChart.get(i))
+				m_facesLeft++;
+		}
 		const uint32_t chartCount = m_charts.size();
 		for (uint32_t i = 0; i < chartCount; i++) {
 			Chart *chart = m_charts[i];
-			const uint32_t seed = chart->seeds.back();
 			chart->area = 0.0f;
 			chart->boundaryLength = 0.0f;
 			chart->basis.normal = Vector3(0.0f);
@@ -4799,13 +5875,14 @@ struct Atlas {
 			chart->faces.clear();
 			chart->candidates.clear();
 			chart->failedPlanarRegions.clear();
-			addFaceToChart(chart, seed);
+			addFaceToChart(chart, chart->seed);
 		}
-		XA_PROFILE_END(buildAtlasResetCharts)
+		XA_PROFILE_END(clusteredChartsReset)
 	}
 
-	bool relocateSeeds() {
-		XA_PROFILE_START(buildAtlasRelocateSeeds)
+	bool relocateSeeds()
+	{
+		XA_PROFILE_START(clusteredChartsRelocateSeeds)
 		bool anySeedChanged = false;
 		const uint32_t chartCount = m_charts.size();
 		for (uint32_t i = 0; i < chartCount; i++) {
@@ -4813,20 +5890,22 @@ struct Atlas {
 				anySeedChanged = true;
 			}
 		}
-		XA_PROFILE_END(buildAtlasRelocateSeeds)
+		XA_PROFILE_END(clusteredChartsRelocateSeeds)
 		return anySeedChanged;
 	}
 
-	void fillHoles(float threshold) {
-		XA_PROFILE_START(buildAtlasFillHoles)
+	void fillHoles(float threshold)
+	{
+		XA_PROFILE_START(clusteredChartsFillHoles)
 		while (m_facesLeft > 0)
-			createRandomChart(threshold);
-		XA_PROFILE_END(buildAtlasFillHoles)
+			createChart(threshold);
+		XA_PROFILE_END(clusteredChartsFillHoles)
 	}
 
 #if XA_MERGE_CHARTS
-	void mergeCharts() {
-		XA_PROFILE_START(buildAtlasMergeCharts)
+	void mergeCharts()
+	{
+		XA_PROFILE_START(clusteredChartsMerge)
 		const uint32_t chartCount = m_charts.size();
 		// Merge charts progressively until there's none left to merge.
 		for (;;) {
@@ -4845,13 +5924,15 @@ struct Atlas {
 				const uint32_t faceCount = chart->faces.size();
 				for (uint32_t i = 0; i < faceCount; i++) {
 					const uint32_t f = chart->faces[i];
-					for (Mesh::FaceEdgeIterator it(m_mesh, f); !it.isDone(); it.advance()) {
-						const float l = m_edgeLengths[it.edge()];
+					for (Mesh::FaceEdgeIterator it(m_data.mesh, f); !it.isDone(); it.advance()) {
+						const float l = m_data.edgeLengths[it.edge()];
 						if (it.isBoundary()) {
 							externalBoundaryLength += l;
 						} else {
 							const int neighborChart = m_faceCharts[it.oppositeFace()];
-							if (m_charts[neighborChart] != chart) {
+							if (neighborChart == -1)
+								externalBoundaryLength += l;
+							else if (m_charts[neighborChart] != chart) {
 								if ((it.isSeam() && (isNormalSeam(it.edge()) || it.isTextureSeam()))) {
 									externalBoundaryLength += l;
 								} else {
@@ -4876,14 +5957,14 @@ struct Atlas {
 					if (dot(chart2->basis.normal, chart->basis.normal) < XA_MERGE_CHARTS_MIN_NORMAL_DEVIATION)
 						continue;
 					// Obey max chart area and boundary length.
-					if (m_options.maxChartArea > 0.0f && chart->area + chart2->area > m_options.maxChartArea)
+					if (m_data.options.maxChartArea > 0.0f && chart->area + chart2->area > m_data.options.maxChartArea)
 						continue;
-					if (m_options.maxBoundaryLength > 0.0f && chart->boundaryLength + chart2->boundaryLength - m_sharedBoundaryLengthsNoSeams[cc] > m_options.maxBoundaryLength)
+					if (m_data.options.maxBoundaryLength > 0.0f && chart->boundaryLength + chart2->boundaryLength - m_sharedBoundaryLengthsNoSeams[cc] > m_data.options.maxBoundaryLength)
 						continue;
 					// Merge if chart2 has a single face.
 					// chart1 must have more than 1 face.
 					// chart2 area must be <= 10% of chart1 area.
-					if (m_sharedBoundaryLengthsNoSeams[cc] > 0.0f && chart->faces.size() > 1 && chart2->faces.size() == 1 && chart2->area <= chart->area * 0.1f)
+					if (m_sharedBoundaryLengthsNoSeams[cc] > 0.0f && chart->faces.size() > 1 && chart2->faces.size() == 1 && chart2->area <= chart->area * 0.1f) 
 						goto merge;
 					// Merge if chart2 has two faces (probably a quad), and chart1 bounds at least 2 of its edges.
 					if (chart2->faces.size() == 2 && m_sharedBoundaryEdgeCountNoSeams[cc] >= 2)
@@ -4891,8 +5972,8 @@ struct Atlas {
 					// Merge if chart2 is wholely inside chart1, ignoring seams.
 					if (m_sharedBoundaryLengthsNoSeams[cc] > 0.0f && equal(m_sharedBoundaryLengthsNoSeams[cc], chart2->boundaryLength, kEpsilon))
 						goto merge;
-					if (m_sharedBoundaryLengths[cc] > 0.2f * max(0.0f, chart->boundaryLength - externalBoundaryLength) ||
-							m_sharedBoundaryLengths[cc] > 0.75f * chart2->boundaryLength)
+					if (m_sharedBoundaryLengths[cc] > 0.2f * max(0.0f, chart->boundaryLength - externalBoundaryLength) || 
+						m_sharedBoundaryLengths[cc] > 0.75f * chart2->boundaryLength)
 						goto merge;
 					continue;
 				merge:
@@ -4925,55 +6006,64 @@ struct Atlas {
 				c++;
 			}
 		}
-		XA_PROFILE_END(buildAtlasMergeCharts)
+		XA_PROFILE_END(clusteredChartsMerge)
 	}
 #endif
 
 private:
-	void createRandomChart(float threshold) {
+	void createChart(float threshold)
+	{
 		Chart *chart = XA_NEW(MemTag::Default, Chart);
 		chart->id = (int)m_charts.size();
 		m_charts.push_back(chart);
-		// Pick random face that is not used by any chart yet.
-		uint32_t face = m_rand.getRange(m_mesh->faceCount() - 1);
-		while (m_faceCharts[face] != -1) {
-			if (++face >= m_mesh->faceCount())
-				face = 0;
-		}
-		chart->seeds.push_back(face);
-		addFaceToChart(chart, face);
+		// Pick a face not used by any chart yet, belonging to the largest planar region.
+		chart->seed = 0;
+		float largestArea = 0.0f;
+		for (uint32_t f = 0; f < m_data.mesh->faceCount(); f++) {
+			if (m_data.isFaceInChart.get(f))
+				continue;
+			const float area = m_planarCharts.regionArea(m_planarCharts.regionIdFromFace(f));
+			if (area > largestArea) {
+				largestArea = area;
+				chart->seed = f;
+			}
+		}
+		addFaceToChart(chart, chart->seed);
 		// Grow the chart as much as possible within the given threshold.
 		for (;;) {
 			if (chart->candidates.count() == 0 || chart->candidates.peekCost() > threshold)
 				break;
 			const uint32_t f = chart->candidates.pop();
-			if (m_faceCharts[f] != -1)
+			if (m_data.isFaceInChart.get(f))
 				continue;
 			if (!addFaceToChart(chart, f)) {
-				chart->failedPlanarRegions.push_back(m_facePlanarRegionId[f]);
+				chart->failedPlanarRegions.push_back(m_planarCharts.regionIdFromFace(f));
 				continue;
 			}
 		}
 	}
 
-	bool isChartBoundaryEdge(const Chart *chart, uint32_t edge) const {
-		const uint32_t oppositeEdge = m_mesh->oppositeEdge(edge);
+	bool isChartBoundaryEdge(const Chart *chart, uint32_t edge) const
+	{
+		const uint32_t oppositeEdge = m_data.mesh->oppositeEdge(edge);
 		const uint32_t oppositeFace = meshEdgeFace(oppositeEdge);
 		return oppositeEdge == UINT32_MAX || m_faceCharts[oppositeFace] != chart->id;
 	}
 
-	bool computeChartBasis(Chart *chart, Basis *basis) {
+	bool computeChartBasis(Chart *chart, Basis *basis)
+	{
 		const uint32_t faceCount = chart->faces.size();
 		m_tempPoints.resize(chart->faces.size() * 3);
 		for (uint32_t i = 0; i < faceCount; i++) {
 			const uint32_t f = chart->faces[i];
 			for (uint32_t j = 0; j < 3; j++)
-				m_tempPoints[i * 3 + j] = m_mesh->position(m_mesh->vertexAt(f * 3 + j));
+				m_tempPoints[i * 3 + j] = m_data.mesh->position(m_data.mesh->vertexAt(f * 3 + j));
 		}
 		return Fit::computeBasis(m_tempPoints.data(), m_tempPoints.size(), basis);
 	}
 
-	bool isFaceFlipped(uint32_t face) const {
+	bool isFaceFlipped(uint32_t face) const
+	{
 		const Vector2 &v1 = m_texcoords[face * 3 + 0];
 		const Vector2 &v2 = m_texcoords[face * 3 + 1];
 		const Vector2 &v3 = m_texcoords[face * 3 + 2];
@@ -4981,20 +6071,22 @@ private:
 		return parametricArea < 0.0f;
 	}
 
-	void parameterizeChart(const Chart *chart) {
+	void parameterizeChart(const Chart *chart)
+	{
 		const uint32_t faceCount = chart->faces.size();
 		for (uint32_t i = 0; i < faceCount; i++) {
 			const uint32_t face = chart->faces[i];
 			for (uint32_t j = 0; j < 3; j++) {
 				const uint32_t offset = face * 3 + j;
-				const Vector3 &pos = m_mesh->position(m_mesh->vertexAt(offset));
+				const Vector3 &pos = m_data.mesh->position(m_data.mesh->vertexAt(offset));
 				m_texcoords[offset] = Vector2(dot(chart->basis.tangent, pos), dot(chart->basis.bitangent, pos));
 			}
 		}
 	}
 
 	// m_faceCharts for the chart faces must be set to the chart ID. Needed to compute boundary edges.
-	bool isChartParameterizationValid(const Chart *chart) {
+	bool isChartParameterizationValid(const Chart *chart)
+	{
 		const uint32_t faceCount = chart->faces.size();
 		// Check for flipped faces in the parameterization. OK if all are flipped.
 		uint32_t flippedFaceCount = 0;
@@ -5005,6 +6097,8 @@ private:
 		if (flippedFaceCount != 0 && flippedFaceCount != faceCount)
 			return false;
 		// Check for boundary intersection in the parameterization.
+		XA_PROFILE_START(clusteredChartsPlaceSeedsBoundaryIntersection)
+		XA_PROFILE_START(clusteredChartsGrowBoundaryIntersection)
 		m_boundaryGrid.reset(m_texcoords.data());
 		for (uint32_t i = 0; i < faceCount; i++) {
 			const uint32_t f = chart->faces[i];
@@ -5014,22 +6108,30 @@ private:
 					m_boundaryGrid.append(edge);
 			}
 		}
-		if (m_boundaryGrid.intersectSelf(m_mesh->epsilon()))
+		const bool intersection = m_boundaryGrid.intersect(m_data.mesh->epsilon());
+#if XA_PROFILE
+		if (m_placingSeeds)
+			XA_PROFILE_END(clusteredChartsPlaceSeedsBoundaryIntersection)
+		else
+			XA_PROFILE_END(clusteredChartsGrowBoundaryIntersection)
+#endif
+		if (intersection)
 			return false;
 		return true;
 	}
 
-	bool addFaceToChart(Chart *chart, uint32_t face) {
-		XA_DEBUG_ASSERT(m_faceCharts[face] == -1);
+	bool addFaceToChart(Chart *chart, uint32_t face)
+	{
+		XA_DEBUG_ASSERT(!m_data.isFaceInChart.get(face));
 		const uint32_t oldFaceCount = chart->faces.size();
 		const bool firstFace = oldFaceCount == 0;
 		// Append the face and any coplanar connected faces to the chart faces array.
 		chart->faces.push_back(face);
-		uint32_t coplanarFace = m_nextPlanarRegionFace[face];
-		while (coplanarFace != face) {
-			XA_DEBUG_ASSERT(m_faceCharts[coplanarFace] == -1);
+		uint32_t coplanarFace = m_planarCharts.nextRegionFace(face);
+		while (coplanarFace != face) { 
+			XA_DEBUG_ASSERT(!m_data.isFaceInChart.get(coplanarFace));
 			chart->faces.push_back(coplanarFace);
-			coplanarFace = m_nextPlanarRegionFace[coplanarFace];
+			coplanarFace = m_planarCharts.nextRegionFace(coplanarFace);
 		}
 		const uint32_t faceCount = chart->faces.size();
 		// Compute basis.
@@ -5037,8 +6139,8 @@ private:
 		if (firstFace) {
 			// Use the first face normal.
 			// Use any edge as the tangent vector.
-			basis.normal = m_faceNormals[face];
-			basis.tangent = normalize(m_mesh->position(m_mesh->vertexAt(face * 3 + 0)) - m_mesh->position(m_mesh->vertexAt(face * 3 + 1)), 0);
+			basis.normal = m_data.faceNormals[face];
+			basis.tangent = normalize(m_data.mesh->position(m_data.mesh->vertexAt(face * 3 + 0)) - m_data.mesh->position(m_data.mesh->vertexAt(face * 3 + 1)), kEpsilon);
 			basis.bitangent = cross(basis.normal, basis.tangent);
 		} else {
 			// Use best fit normal.
@@ -5046,7 +6148,7 @@ private:
 				chart->faces.resize(oldFaceCount);
 				return false;
 			}
-			if (dot(basis.normal, m_faceNormals[face]) < 0.0f) // Flip normal if oriented in the wrong direction.
+			if (dot(basis.normal, m_data.faceNormals[face]) < 0.0f) // Flip normal if oriented in the wrong direction.
 				basis.normal = -basis.normal;
 		}
 		if (!firstFace) {
@@ -5069,7 +6171,8 @@ private:
 			const uint32_t f = chart->faces[i];
 			m_faceCharts[f] = chart->id;
 			m_facesLeft--;
-			chart->centroidSum += m_mesh->computeFaceCenter(f);
+			m_data.isFaceInChart.set(f);
+			chart->centroidSum += m_data.mesh->computeFaceCenter(f);
 		}
 		chart->centroid = chart->centroidSum / float(chart->faces.size());
 		// Refresh candidates.
@@ -5079,15 +6182,15 @@ private:
 			const uint32_t f = chart->faces[i];
 			for (uint32_t j = 0; j < 3; j++) {
 				const uint32_t edge = f * 3 + j;
-				const uint32_t oedge = m_mesh->oppositeEdge(edge);
+				const uint32_t oedge = m_data.mesh->oppositeEdge(edge);
 				if (oedge == UINT32_MAX)
 					continue; // Boundary edge.
 				const uint32_t oface = meshEdgeFace(oedge);
-				if (m_faceCharts[oface] != -1)
+				if (m_data.isFaceInChart.get(oface))
 					continue; // Face belongs to another chart.
-				if (chart->failedPlanarRegions.contains(m_facePlanarRegionId[oface]))
+				if (chart->failedPlanarRegions.contains(m_planarCharts.regionIdFromFace(oface)))
 					continue; // Failed to add this faces planar region to the chart before.
-				const float cost = evaluateCost(chart, oface);
+				const float cost = computeCost(chart, oface);
 				if (cost < FLT_MAX)
 					chart->candidates.push(cost, oface);
 			}
@@ -5096,76 +6199,62 @@ private:
 	}
 
 	// Returns true if the seed has changed.
-	bool relocateSeed(Chart *chart) {
+	bool relocateSeed(Chart *chart)
+	{
 		// Find the first N triangles that fit the proxy best.
 		const uint32_t faceCount = chart->faces.size();
 		m_bestTriangles.clear();
 		for (uint32_t i = 0; i < faceCount; i++) {
-			const float cost = evaluateProxyFitMetric(chart, chart->faces[i]);
+			const float cost = computeNormalDeviationMetric(chart, chart->faces[i]);
 			m_bestTriangles.push(cost, chart->faces[i]);
 		}
-		// Of those, choose the least central triangle.
-		uint32_t leastCentral = 0;
-		float maxDistance = -1;
+		// Of those, choose the most central triangle.
+		uint32_t mostCentral = 0;
+		float minDistance = FLT_MAX;
 		for (;;) {
 			if (m_bestTriangles.count() == 0)
 				break;
 			const uint32_t face = m_bestTriangles.pop();
-			Vector3 faceCentroid = m_mesh->computeFaceCenter(face);
+			Vector3 faceCentroid = m_data.mesh->computeFaceCenter(face);
 			const float distance = length(chart->centroid - faceCentroid);
-			if (distance > maxDistance) {
-				maxDistance = distance;
-				leastCentral = face;
-			}
-		}
-		XA_DEBUG_ASSERT(maxDistance >= 0);
-		// In order to prevent k-means cyles we record all the previously chosen seeds.
-		for (uint32_t i = 0; i < chart->seeds.size(); i++) {
-			// Treat seeds belong to the same planar region as equal.
-			if (chart->seeds[i] == leastCentral || m_facePlanarRegionId[chart->seeds[i]] == m_facePlanarRegionId[leastCentral]) {
-				// Move new seed to the end of the seed array.
-				uint32_t last = chart->seeds.size() - 1;
-				swap(chart->seeds[i], chart->seeds[last]);
-				return false;
+			if (distance < minDistance) {
+				minDistance = distance;
+				mostCentral = face;
 			}
 		}
-		// Append new seed.
-		chart->seeds.push_back(leastCentral);
+		XA_DEBUG_ASSERT(minDistance < FLT_MAX);
+		if (mostCentral == chart->seed)
+			return false;
+		chart->seed = mostCentral;
 		return true;
 	}
 
-	// Evaluate combined metric.
-	float evaluateCost(Chart *chart, uint32_t face) const {
-		if (dot(m_faceNormals[face], chart->basis.normal) <= 0.26f) // ~75 degrees
-			return FLT_MAX;
+	// Cost is the sum of each metric multiplied by its weight.
+	float computeCost(Chart *chart, uint32_t face) const
+	{
 		// Estimate boundary length and area:
-		float newChartArea = 0.0f, newBoundaryLength = 0.0f;
-		if (m_options.maxChartArea > 0.0f || m_options.roundnessMetricWeight > 0.0f)
-			newChartArea = computeArea(chart, face);
-		if (m_options.maxBoundaryLength > 0.0f || m_options.roundnessMetricWeight > 0.0f)
-			newBoundaryLength = computeBoundaryLength(chart, face);
+		const float newChartArea = computeArea(chart, face);
+		const float newBoundaryLength = computeBoundaryLength(chart, face);
 		// Enforce limits strictly:
-		if (m_options.maxChartArea > 0.0f && newChartArea > m_options.maxChartArea)
+		if (m_data.options.maxChartArea > 0.0f && newChartArea > m_data.options.maxChartArea)
 			return FLT_MAX;
-		if (m_options.maxBoundaryLength > 0.0f && newBoundaryLength > m_options.maxBoundaryLength)
+		if (m_data.options.maxBoundaryLength > 0.0f && newBoundaryLength > m_data.options.maxBoundaryLength)
 			return FLT_MAX;
+		// Compute metrics.
 		float cost = 0.0f;
-		if (m_options.normalSeamMetricWeight > 0.0f) {
-			// Penalize faces that cross seams, reward faces that close seams or reach boundaries.
-			// Make sure normal seams are fully respected:
-			const float N = evaluateNormalSeamMetric(chart, face);
-			if (m_options.normalSeamMetricWeight >= 1000.0f && N > 0.0f)
-				return FLT_MAX;
-			cost += m_options.normalSeamMetricWeight * N;
-		}
-		if (m_options.proxyFitMetricWeight > 0.0f)
-			cost += m_options.proxyFitMetricWeight * evaluateProxyFitMetric(chart, face);
-		if (m_options.roundnessMetricWeight > 0.0f)
-			cost += m_options.roundnessMetricWeight * evaluateRoundnessMetric(chart, newBoundaryLength, newChartArea);
-		if (m_options.straightnessMetricWeight > 0.0f)
-			cost += m_options.straightnessMetricWeight * evaluateStraightnessMetric(chart, face);
-		if (m_options.textureSeamMetricWeight > 0.0f)
-			cost += m_options.textureSeamMetricWeight * evaluateTextureSeamMetric(chart, face);
+		const float normalDeviation = computeNormalDeviationMetric(chart, face);
+		if (normalDeviation >= 0.707f) // ~75 degrees
+			return FLT_MAX;
+		cost += m_data.options.normalDeviationWeight * normalDeviation;
+		// Penalize faces that cross seams, reward faces that close seams or reach boundaries.
+		// Make sure normal seams are fully respected:
+		const float normalSeam = computeNormalSeamMetric(chart, face);
+		if (m_data.options.normalSeamWeight >= 1000.0f && normalSeam > 0.0f)
+			return FLT_MAX;
+		cost += m_data.options.normalSeamWeight * normalSeam;
+		cost += m_data.options.roundnessWeight * computeRoundnessMetric(chart, newBoundaryLength, newChartArea);
+		cost += m_data.options.straightnessWeight * computeStraightnessMetric(chart, face);
+		cost += m_data.options.textureSeamWeight * computeTextureSeamMetric(chart, face);
 		//float R = evaluateCompletenessMetric(chart, face);
 		//float D = evaluateDihedralAngleMetric(chart, face);
 		// @@ Add a metric based on local dihedral angle.
@@ -5176,100 +6265,108 @@ private:
 	}
 
 	// Returns a value in [0-1].
-	float evaluateProxyFitMetric(Chart *chart, uint32_t face) const {
+	// 0 if the face normal is parallel to the chart's best fit normal.
+	// 1 if the face normal is perpendicular to it.
+	float computeNormalDeviationMetric(Chart *chart, uint32_t face) const
+	{
 		// All faces in coplanar regions have the same normal, can use any face.
-		const Vector3 faceNormal = m_faceNormals[face];
+		const Vector3 faceNormal = m_data.faceNormals[face];
 		// Use plane fitting metric for now:
-		return 1 - dot(faceNormal, chart->basis.normal); // @@ normal deviations should be weighted by face area
+		return min(1.0f - dot(faceNormal, chart->basis.normal), 1.0f); // @@ normal deviations should be weighted by face area
 	}
 
-	float evaluateRoundnessMetric(Chart *chart, float newBoundaryLength, float newChartArea) const {
-		const float roundness = square(chart->boundaryLength) / chart->area;
-		const float newBoundaryLengthSq = square(newBoundaryLength);
-		const float newRoundness = newBoundaryLengthSq / newChartArea;
-		if (newRoundness > roundness)
-			return newBoundaryLengthSq / (newChartArea * kPi4);
-		// Offer no impedance to faces that improve roundness.
-		return 0;
+	float computeRoundnessMetric(Chart *chart, float newBoundaryLength, float newChartArea) const
+	{
+		const float oldRoundness = square(chart->boundaryLength) / chart->area;
+		const float newRoundness = square(newBoundaryLength) / newChartArea;
+		return 1.0f - oldRoundness / newRoundness;
 	}
 
-	float evaluateStraightnessMetric(Chart *chart, uint32_t firstFace) const {
-		float l_out = 0.0f, l_in = 0.0f;
-		const uint32_t planarRegionId = m_facePlanarRegionId[firstFace];
+	float computeStraightnessMetric(Chart *chart, uint32_t firstFace) const
+	{
+		float l_out = 0.0f; // Length of firstFace planar region boundary that doesn't border the chart.
+		float l_in = 0.0f; // Length that does border the chart.
+		const uint32_t planarRegionId = m_planarCharts.regionIdFromFace(firstFace);
 		uint32_t face = firstFace;
-		for (;;) {
-			for (Mesh::FaceEdgeIterator it(m_mesh, face); !it.isDone(); it.advance()) {
-				const float l = m_edgeLengths[it.edge()];
+		for (;;) { 
+			for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
+				const float l = m_data.edgeLengths[it.edge()];
 				if (it.isBoundary()) {
 					l_out += l;
-				} else if (m_facePlanarRegionId[it.oppositeFace()] != planarRegionId) {
+				} else if (m_planarCharts.regionIdFromFace(it.oppositeFace()) != planarRegionId) {
 					if (m_faceCharts[it.oppositeFace()] != chart->id)
 						l_out += l;
 					else
 						l_in += l;
 				}
 			}
-			face = m_nextPlanarRegionFace[face];
+			face = m_planarCharts.nextRegionFace(face);
 			if (face == firstFace)
 				break;
 		}
+#if 1
 		XA_DEBUG_ASSERT(l_in != 0.0f); // Candidate face must be adjacent to chart. @@ This is not true if the input mesh has zero-length edges.
 		float ratio = (l_out - l_in) / (l_out + l_in);
 		return min(ratio, 0.0f); // Only use the straightness metric to close gaps.
+#else
+		return 1.0f - l_in / l_out;
+#endif
 	}
 
-	bool isNormalSeam(uint32_t edge) const {
-		const uint32_t oppositeEdge = m_mesh->oppositeEdge(edge);
+	bool isNormalSeam(uint32_t edge) const
+	{
+		const uint32_t oppositeEdge = m_data.mesh->oppositeEdge(edge);
 		if (oppositeEdge == UINT32_MAX)
 			return false; // boundary edge
-		if (m_mesh->flags() & MeshFlags::HasNormals) {
-			const uint32_t v0 = m_mesh->vertexAt(meshEdgeIndex0(edge));
-			const uint32_t v1 = m_mesh->vertexAt(meshEdgeIndex1(edge));
-			const uint32_t ov0 = m_mesh->vertexAt(meshEdgeIndex0(oppositeEdge));
-			const uint32_t ov1 = m_mesh->vertexAt(meshEdgeIndex1(oppositeEdge));
+		if (m_data.mesh->flags() & MeshFlags::HasNormals) {
+			const uint32_t v0 = m_data.mesh->vertexAt(meshEdgeIndex0(edge));
+			const uint32_t v1 = m_data.mesh->vertexAt(meshEdgeIndex1(edge));
+			const uint32_t ov0 = m_data.mesh->vertexAt(meshEdgeIndex0(oppositeEdge));
+			const uint32_t ov1 = m_data.mesh->vertexAt(meshEdgeIndex1(oppositeEdge));
 			if (v0 == ov1 && v1 == ov0)
 				return false;
-			return !equal(m_mesh->normal(v0), m_mesh->normal(ov1), kNormalEpsilon) || !equal(m_mesh->normal(v1), m_mesh->normal(ov0), kNormalEpsilon);
+			return !equal(m_data.mesh->normal(v0), m_data.mesh->normal(ov1), kNormalEpsilon) || !equal(m_data.mesh->normal(v1), m_data.mesh->normal(ov0), kNormalEpsilon);
 		}
 		const uint32_t f0 = meshEdgeFace(edge);
 		const uint32_t f1 = meshEdgeFace(oppositeEdge);
-		if (m_facePlanarRegionId[f0] == m_facePlanarRegionId[f1])
+		if (m_planarCharts.regionIdFromFace(f0) == m_planarCharts.regionIdFromFace(f1))
 			return false;
-		return !equal(m_faceNormals[f0], m_faceNormals[f1], kNormalEpsilon);
+		return !equal(m_data.faceNormals[f0], m_data.faceNormals[f1], kNormalEpsilon);
 	}
 
-	float evaluateNormalSeamMetric(Chart *chart, uint32_t firstFace) const {
+	float computeNormalSeamMetric(Chart *chart, uint32_t firstFace) const
+	{
 		float seamFactor = 0.0f, totalLength = 0.0f;
 		uint32_t face = firstFace;
-		for (;;) {
-			for (Mesh::FaceEdgeIterator it(m_mesh, face); !it.isDone(); it.advance()) {
+		for (;;) { 
+			for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
 				if (it.isBoundary())
 					continue;
 				if (m_faceCharts[it.oppositeFace()] != chart->id)
 					continue;
-				float l = m_edgeLengths[it.edge()];
+				float l = m_data.edgeLengths[it.edge()];
 				totalLength += l;
 				if (!it.isSeam())
 					continue;
 				// Make sure it's a normal seam.
 				if (isNormalSeam(it.edge())) {
 					float d;
-					if (m_mesh->flags() & MeshFlags::HasNormals) {
-						const Vector3 &n0 = m_mesh->normal(it.vertex0());
-						const Vector3 &n1 = m_mesh->normal(it.vertex1());
-						const Vector3 &on0 = m_mesh->normal(m_mesh->vertexAt(meshEdgeIndex0(it.oppositeEdge())));
-						const Vector3 &on1 = m_mesh->normal(m_mesh->vertexAt(meshEdgeIndex1(it.oppositeEdge())));
+					if (m_data.mesh->flags() & MeshFlags::HasNormals) {
+						const Vector3 &n0 = m_data.mesh->normal(it.vertex0());
+						const Vector3 &n1 = m_data.mesh->normal(it.vertex1());
+						const Vector3 &on0 = m_data.mesh->normal(m_data.mesh->vertexAt(meshEdgeIndex0(it.oppositeEdge())));
+						const Vector3 &on1 = m_data.mesh->normal(m_data.mesh->vertexAt(meshEdgeIndex1(it.oppositeEdge())));
 						const float d0 = clamp(dot(n0, on1), 0.0f, 1.0f);
 						const float d1 = clamp(dot(n1, on0), 0.0f, 1.0f);
 						d = (d0 + d1) * 0.5f;
 					} else {
-						d = clamp(dot(m_faceNormals[face], m_faceNormals[meshEdgeFace(it.oppositeEdge())]), 0.0f, 1.0f);
+						d = clamp(dot(m_data.faceNormals[face], m_data.faceNormals[meshEdgeFace(it.oppositeEdge())]), 0.0f, 1.0f);
 					}
 					l *= 1 - d;
 					seamFactor += l;
 				}
 			}
-			face = m_nextPlanarRegionFace[face];
+			face = m_planarCharts.nextRegionFace(face);
 			if (face == firstFace)
 				break;
 		}
@@ -5278,16 +6375,17 @@ private:
 		return seamFactor / totalLength;
 	}
 
-	float evaluateTextureSeamMetric(Chart *chart, uint32_t firstFace) const {
+	float computeTextureSeamMetric(Chart *chart, uint32_t firstFace) const
+	{
 		float seamLength = 0.0f, totalLength = 0.0f;
 		uint32_t face = firstFace;
-		for (;;) {
-			for (Mesh::FaceEdgeIterator it(m_mesh, face); !it.isDone(); it.advance()) {
+		for (;;) { 
+			for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
 				if (it.isBoundary())
 					continue;
 				if (m_faceCharts[it.oppositeFace()] != chart->id)
 					continue;
-				float l = m_edgeLengths[it.edge()];
+				float l = m_data.edgeLengths[it.edge()];
 				totalLength += l;
 				if (!it.isSeam())
 					continue;
@@ -5295,7 +6393,7 @@ private:
 				if (it.isTextureSeam())
 					seamLength += l;
 			}
-			face = m_nextPlanarRegionFace[face];
+			face = m_planarCharts.nextRegionFace(face);
 			if (face == firstFace)
 				break;
 		}
@@ -5304,43 +6402,46 @@ private:
 		return seamLength / totalLength;
 	}
 
-	float computeArea(Chart *chart, uint32_t firstFace) const {
+	float computeArea(Chart *chart, uint32_t firstFace) const
+	{
 		float area = chart->area;
 		uint32_t face = firstFace;
-		for (;;) {
-			area += m_faceAreas[face];
-			face = m_nextPlanarRegionFace[face];
+		for (;;) { 
+			area += m_data.faceAreas[face];
+			face = m_planarCharts.nextRegionFace(face);
 			if (face == firstFace)
 				break;
 		}
 		return area;
 	}
 
-	float computeBoundaryLength(Chart *chart, uint32_t firstFace) const {
+	float computeBoundaryLength(Chart *chart, uint32_t firstFace) const
+	{
 		float boundaryLength = chart->boundaryLength;
 		// Add new edges, subtract edges shared with the chart.
-		const uint32_t planarRegionId = m_facePlanarRegionId[firstFace];
+		const uint32_t planarRegionId = m_planarCharts.regionIdFromFace(firstFace);
 		uint32_t face = firstFace;
-		for (;;) {
-			for (Mesh::FaceEdgeIterator it(m_mesh, face); !it.isDone(); it.advance()) {
-				const float edgeLength = m_edgeLengths[it.edge()];
+		for (;;) { 
+			for (Mesh::FaceEdgeIterator it(m_data.mesh, face); !it.isDone(); it.advance()) {
+				const float edgeLength = m_data.edgeLengths[it.edge()];
 				if (it.isBoundary()) {
 					boundaryLength += edgeLength;
-				} else if (m_facePlanarRegionId[it.oppositeFace()] != planarRegionId) {
+				} else if (m_planarCharts.regionIdFromFace(it.oppositeFace()) != planarRegionId) {
 					if (m_faceCharts[it.oppositeFace()] != chart->id)
 						boundaryLength += edgeLength;
 					else
 						boundaryLength -= edgeLength;
 				}
 			}
-			face = m_nextPlanarRegionFace[face];
+			face = m_planarCharts.nextRegionFace(face);
 			if (face == firstFace)
 				break;
 		}
-		return max(0.0f, boundaryLength); // @@ Hack!
+		return max(0.0f, boundaryLength);  // @@ Hack!
 	}
 
-	bool mergeChart(Chart *owner, Chart *chart, float sharedBoundaryLength) {
+	bool mergeChart(Chart *owner, Chart *chart, float sharedBoundaryLength)
+	{
 		const uint32_t oldOwnerFaceCount = owner->faces.size();
 		const uint32_t chartFaceCount = chart->faces.size();
 		owner->faces.push_back(chart->faces);
@@ -5356,7 +6457,7 @@ private:
 				m_faceCharts[chart->faces[i]] = chart->id;
 			return false;
 		}
-		if (dot(basis.normal, m_faceNormals[owner->faces[0]]) < 0.0f) // Flip normal if oriented in the wrong direction.
+		if (dot(basis.normal, m_data.faceNormals[owner->faces[0]]) < 0.0f) // Flip normal if oriented in the wrong direction.
 			basis.normal = -basis.normal;
 		// Compute orthogonal parameterization and check that it is valid.
 		parameterizeChart(owner);
@@ -5379,19 +6480,14 @@ private:
 		return true;
 	}
 
-	const Mesh *m_mesh;
-	Array<float> m_edgeLengths;
-	Array<float> m_faceAreas;
-	Array<Vector3> m_faceNormals;
+private:
+	AtlasData &m_data;
+	const PlanarCharts &m_planarCharts;
 	Array<Vector2> m_texcoords;
 	uint32_t m_facesLeft;
 	Array<int> m_faceCharts;
 	Array<Chart *> m_charts;
 	CostQueue m_bestTriangles;
-	KISSRng m_rand;
-	ChartOptions m_options;
-	Array<uint32_t> m_nextPlanarRegionFace;
-	Array<uint32_t> m_facePlanarRegionId;
 	Array<Vector3> m_tempPoints;
 	UniformGrid2 m_boundaryGrid;
 #if XA_MERGE_CHARTS
@@ -5400,226 +6496,66 @@ private:
 	Array<float> m_sharedBoundaryLengthsNoSeams;
 	Array<uint32_t> m_sharedBoundaryEdgeCountNoSeams;
 #endif
+	bool m_placingSeeds;
 };
 
-} // namespace segment
-
-namespace param {
+struct Atlas // Exposes the planar charts followed by the clustered charts through one chart index range.
+{
+	Atlas() : m_planarCharts(m_data), m_clusteredCharts(m_data, m_planarCharts) {} // m_data is declared before both members, so it is constructed before it is passed to them.
 
-class JacobiPreconditioner {
-public:
-	JacobiPreconditioner(const sparse::Matrix &M, bool symmetric) :
-			m_inverseDiagonal(M.width()) {
-		XA_ASSERT(M.isSquare());
-		for (uint32_t x = 0; x < M.width(); x++) {
-			float elem = M.getCoefficient(x, x);
-			//XA_DEBUG_ASSERT( elem != 0.0f ); // This can be zero in the presence of zero area triangles.
-			if (symmetric) {
-				m_inverseDiagonal[x] = (elem != 0) ? 1.0f / sqrtf(fabsf(elem)) : 1.0f;
-			} else {
-				m_inverseDiagonal[x] = (elem != 0) ? 1.0f / elem : 1.0f;
-			}
-		}
+	uint32_t chartCount() const // Total number of charts: planar chart count plus clustered chart count.
+	{
+		return m_planarCharts.chartCount() + m_clusteredCharts.chartCount();
 	}
 
-	void apply(const FullVector &x, FullVector &y) const {
-		XA_DEBUG_ASSERT(x.dimension() == m_inverseDiagonal.dimension());
-		XA_DEBUG_ASSERT(y.dimension() == m_inverseDiagonal.dimension());
-		// @@ Wrap vector component-wise product into a separate function.
-		const uint32_t D = x.dimension();
-		for (uint32_t i = 0; i < D; i++) {
-			y[i] = m_inverseDiagonal[i] * x[i];
-		}
+	ConstArrayView<uint32_t> chartFaces(uint32_t chartIndex) const // Faces of one chart; indices below the planar chart count select a planar chart, the rest select a clustered chart.
+	{
+		if (chartIndex < m_planarCharts.chartCount())
+			return m_planarCharts.chartFaces(chartIndex);
+		chartIndex -= m_planarCharts.chartCount(); // Rebase to an index into the clustered charts.
+		return m_clusteredCharts.chartFaces(chartIndex);
 	}
 
-private:
-	FullVector m_inverseDiagonal;
-};
-
-// Linear solvers.
-class Solver {
-public:
-	// Solve the symmetric system: At·A·x = At·b
-	static bool LeastSquaresSolver(const sparse::Matrix &A, const FullVector &b, FullVector &x, float epsilon = 1e-5f) {
-		XA_DEBUG_ASSERT(A.width() == x.dimension());
-		XA_DEBUG_ASSERT(A.height() == b.dimension());
-		XA_DEBUG_ASSERT(A.height() >= A.width()); // @@ If height == width we could solve it directly...
-		const uint32_t D = A.width();
-		sparse::Matrix At(A.height(), A.width());
-		sparse::transpose(A, At);
-		FullVector Atb(D);
-		sparse::mult(At, b, Atb);
-		sparse::Matrix AtA(D);
-		sparse::mult(At, A, AtA);
-		return SymmetricSolver(AtA, Atb, x, epsilon);
-	}
-
-	// See section 10.4.3 in: Mesh Parameterization: Theory and Practice, Siggraph Course Notes, August 2007
-	static bool LeastSquaresSolver(const sparse::Matrix &A, const FullVector &b, FullVector &x, const uint32_t *lockedParameters, uint32_t lockedCount, float epsilon = 1e-5f) {
-		XA_DEBUG_ASSERT(A.width() == x.dimension());
-		XA_DEBUG_ASSERT(A.height() == b.dimension());
-		XA_DEBUG_ASSERT(A.height() >= A.width() - lockedCount);
-		// @@ This is not the most efficient way of building a system with reduced degrees of freedom. It would be faster to do it on the fly.
-		const uint32_t D = A.width() - lockedCount;
-		XA_DEBUG_ASSERT(D > 0);
-		// Compute: b - Al * xl
-		FullVector b_Alxl(b);
-		for (uint32_t y = 0; y < A.height(); y++) {
-			const uint32_t count = A.getRow(y).size();
-			for (uint32_t e = 0; e < count; e++) {
-				uint32_t column = A.getRow(y)[e].x;
-				bool isFree = true;
-				for (uint32_t i = 0; i < lockedCount; i++) {
-					isFree &= (lockedParameters[i] != column);
-				}
-				if (!isFree) {
-					b_Alxl[y] -= x[column] * A.getRow(y)[e].v;
-				}
-			}
-		}
-		// Remove locked columns from A.
-		sparse::Matrix Af(D, A.height());
-		for (uint32_t y = 0; y < A.height(); y++) {
-			const uint32_t count = A.getRow(y).size();
-			for (uint32_t e = 0; e < count; e++) {
-				uint32_t column = A.getRow(y)[e].x;
-				uint32_t ix = column;
-				bool isFree = true;
-				for (uint32_t i = 0; i < lockedCount; i++) {
-					isFree &= (lockedParameters[i] != column);
-					if (column > lockedParameters[i]) ix--; // shift columns
-				}
-				if (isFree) {
-					Af.setCoefficient(ix, y, A.getRow(y)[e].v);
-				}
-			}
-		}
-		// Remove elements from x
-		FullVector xf(D);
-		for (uint32_t i = 0, j = 0; i < A.width(); i++) {
-			bool isFree = true;
-			for (uint32_t l = 0; l < lockedCount; l++) {
-				isFree &= (lockedParameters[l] != i);
-			}
-			if (isFree) {
-				xf[j++] = x[i];
-			}
-		}
-		// Solve reduced system.
-		bool result = LeastSquaresSolver(Af, b_Alxl, xf, epsilon);
-		// Copy results back to x.
-		for (uint32_t i = 0, j = 0; i < A.width(); i++) {
-			bool isFree = true;
-			for (uint32_t l = 0; l < lockedCount; l++) {
-				isFree &= (lockedParameters[l] != i);
-			}
-			if (isFree) {
-				x[i] = xf[j++];
-			}
-		}
-		return result;
+	const Basis &chartBasis(uint32_t chartIndex) const // Basis of one chart; uses the same index layout as chartFaces (planar charts first, then clustered charts).
+	{
+		if (chartIndex < m_planarCharts.chartCount())
+			return m_planarCharts.chartBasis(chartIndex);
+		chartIndex -= m_planarCharts.chartCount(); // Rebase to an index into the clustered charts.
+		return m_clusteredCharts.chartBasis(chartIndex);
 	}
 
-private:
-	/**
-	* Compute the solution of the sparse linear system Ab=x using the Conjugate
-	* Gradient method.
-	*
-	* Solving sparse linear systems:
-	* (1)		A·x = b
-	*
-	* The conjugate gradient algorithm solves (1) only in the case that A is
-	* symmetric and positive definite. It is based on the idea of minimizing the
-	* function
-	*
-	* (2)		f(x) = 1/2·x·A·x - b·x
-	*
-	* This function is minimized when its gradient
-	*
-	* (3)		df = A·x - b
-	*
-	* is zero, which is equivalent to (1). The minimization is carried out by
-	* generating a succession of search directions p.k and improved minimizers x.k.
-	* At each stage a quantity alfa.k is found that minimizes f(x.k + alfa.k·p.k),
-	* and x.k+1 is set equal to the new point x.k + alfa.k·p.k. The p.k and x.k are
-	* built up in such a way that x.k+1 is also the minimizer of f over the whole
-	* vector space of directions already taken, {p.1, p.2, . . . , p.k}. After N
-	* iterations you arrive at the minimizer over the entire vector space, i.e., the
-	* solution to (1).
-	*
-	* For a really good explanation of the method see:
-	*
-	* "An Introduction to the Conjugate Gradient Method Without the Agonizing Pain",
-	* Jonhathan Richard Shewchuk.
-	*
-	**/
-	// Conjugate gradient with preconditioner.
-	static bool ConjugateGradientSolver(const JacobiPreconditioner &preconditioner, const sparse::Matrix &A, const FullVector &b, FullVector &x, float epsilon) {
-		XA_DEBUG_ASSERT(A.isSquare());
-		XA_DEBUG_ASSERT(A.width() == b.dimension());
-		XA_DEBUG_ASSERT(A.width() == x.dimension());
-		int i = 0;
-		const int D = A.width();
-		const int i_max = 4 * D; // Convergence should be linear, but in some cases, it's not.
-		FullVector r(D); // residual
-		FullVector p(D); // search direction
-		FullVector q(D); //
-		FullVector s(D); // preconditioned
-		float delta_0;
-		float delta_old;
-		float delta_new;
-		float alpha;
-		float beta;
-		// r = b - A·x
-		sparse::copy(b, r);
-		sparse::sgemv(-1, A, x, 1, r);
-		// p = M^-1 · r
-		preconditioner.apply(r, p);
-		delta_new = sparse::dot(r, p);
-		delta_0 = delta_new;
-		while (i < i_max && delta_new > epsilon * epsilon * delta_0) {
-			i++;
-			// q = A·p
-			sparse::mult(A, p, q);
-			// alpha = delta_new / p·q
-			const float pdotq = sparse::dot(p, q);
-			if (!isFinite(pdotq) || isNan(pdotq))
-				alpha = 0.0f;
-			else
-				alpha = delta_new / pdotq;
-			// x = alfa·p + x
-			sparse::saxpy(alpha, p, x);
-			if ((i & 31) == 0) { // recompute r after 32 steps
-				// r = b - A·x
-				sparse::copy(b, r);
-				sparse::sgemv(-1, A, x, 1, r);
-			} else {
-				// r = r - alfa·q
-				sparse::saxpy(-alpha, q, r);
-			}
-			// s = M^-1 · r
-			preconditioner.apply(r, s);
-			delta_old = delta_new;
-			delta_new = sparse::dot(r, s);
-			beta = delta_new / delta_old;
-			// p = s + beta·p
-			sparse::scal(beta, p);
-			sparse::saxpy(1, s, p);
-		}
-		return delta_new <= epsilon * epsilon * delta_0;
+	void reset(const Mesh *mesh, const ChartOptions &options) // Stores the options and mesh in m_data and then runs m_data.compute(), all inside the buildAtlasInit profile scope.
+	{
+		XA_PROFILE_START(buildAtlasInit)
+		m_data.options = options;
+		m_data.mesh = mesh;
+		m_data.compute(); // Runs after options and mesh are assigned - presumably it reads both (confirm in AtlasData).
+		XA_PROFILE_END(buildAtlasInit)
 	}
 
-	static bool SymmetricSolver(const sparse::Matrix &A, const FullVector &b, FullVector &x, float epsilon = 1e-5f) {
-		XA_DEBUG_ASSERT(A.height() == A.width());
-		XA_DEBUG_ASSERT(A.height() == b.dimension());
-		XA_DEBUG_ASSERT(b.dimension() == x.dimension());
-		JacobiPreconditioner jacobi(A, true);
-		return ConjugateGradientSolver(jacobi, A, b, x, epsilon);
+	void compute() // Computes the planar charts first, then the clustered charts; each stage has its own profile scope.
+	{
+		XA_PROFILE_START(planarCharts)
+		m_planarCharts.compute();
+		XA_PROFILE_END(planarCharts)
+		XA_PROFILE_START(clusteredCharts)
+		m_clusteredCharts.compute(); // m_clusteredCharts was constructed with m_planarCharts - presumably it depends on the planar result, so keep this order (confirm).
+		XA_PROFILE_END(clusteredCharts)
 	}
+
+private:
+	AtlasData m_data; // Keep first: the constructor passes it to the two members declared below.
+	PlanarCharts m_planarCharts; // Keep before m_clusteredCharts, which is constructed from it.
+	ClusteredCharts m_clusteredCharts; // Constructed from m_data and m_planarCharts.
 };
 
+} // namespace segment
+
+namespace param {
+
 // Fast sweep in 3 directions
-static bool findApproximateDiameterVertices(Mesh *mesh, uint32_t *a, uint32_t *b) {
+static bool findApproximateDiameterVertices(Mesh *mesh, uint32_t *a, uint32_t *b)
+{
 	XA_DEBUG_ASSERT(a != nullptr);
 	XA_DEBUG_ASSERT(b != nullptr);
 	const uint32_t vertexCount = mesh->vertexCount();
@@ -5674,165 +6610,136 @@ static bool findApproximateDiameterVertices(Mesh *mesh, uint32_t *a, uint32_t *b
 	return true;
 }
 
-// Conformal relations from Brecht Van Lommel (based on ABF):
-
-static float vec_angle_cos(const Vector3 &v1, const Vector3 &v2, const Vector3 &v3) {
-	Vector3 d1 = v1 - v2;
-	Vector3 d2 = v3 - v2;
-	return clamp(dot(d1, d2) / (length(d1) * length(d2)), -1.0f, 1.0f);
-}
-
-static float vec_angle(const Vector3 &v1, const Vector3 &v2, const Vector3 &v3) {
-	float dot = vec_angle_cos(v1, v2, v3);
-	return acosf(dot);
-}
-
-static void triangle_angles(const Vector3 &v1, const Vector3 &v2, const Vector3 &v3, float *a1, float *a2, float *a3) {
-	*a1 = vec_angle(v3, v1, v2);
-	*a2 = vec_angle(v1, v2, v3);
-	*a3 = kPi - *a2 - *a1;
-}
-
-static void setup_abf_relations(sparse::Matrix &A, int row, int id0, int id1, int id2, const Vector3 &p0, const Vector3 &p1, const Vector3 &p2) {
-	// @@ IC: Wouldn't it be more accurate to return cos and compute 1-cos^2?
-	// It does indeed seem to be a little bit more robust.
-	// @@ Need to revisit this more carefully!
-	float a0, a1, a2;
-	triangle_angles(p0, p1, p2, &a0, &a1, &a2);
-	float s0 = sinf(a0);
-	float s1 = sinf(a1);
-	float s2 = sinf(a2);
-	if (s1 > s0 && s1 > s2) {
-		swap(s1, s2);
-		swap(s0, s1);
-		swap(a1, a2);
-		swap(a0, a1);
-		swap(id1, id2);
-		swap(id0, id1);
-	} else if (s0 > s1 && s0 > s2) {
-		swap(s0, s2);
-		swap(s0, s1);
-		swap(a0, a2);
-		swap(a0, a1);
-		swap(id0, id2);
-		swap(id0, id1);
-	}
-	float c0 = cosf(a0);
-	float ratio = (s2 == 0.0f) ? 1.0f : s1 / s2;
-	float cosine = c0 * ratio;
-	float sine = s0 * ratio;
-	// Note  : 2*id + 0 --> u
-	//         2*id + 1 --> v
-	int u0_id = 2 * id0 + 0;
-	int v0_id = 2 * id0 + 1;
-	int u1_id = 2 * id1 + 0;
-	int v1_id = 2 * id1 + 1;
-	int u2_id = 2 * id2 + 0;
-	int v2_id = 2 * id2 + 1;
-	// Real part
-	A.setCoefficient(u0_id, 2 * row + 0, cosine - 1.0f);
-	A.setCoefficient(v0_id, 2 * row + 0, -sine);
-	A.setCoefficient(u1_id, 2 * row + 0, -cosine);
-	A.setCoefficient(v1_id, 2 * row + 0, sine);
-	A.setCoefficient(u2_id, 2 * row + 0, 1);
-	// Imaginary part
-	A.setCoefficient(u0_id, 2 * row + 1, sine);
-	A.setCoefficient(v0_id, 2 * row + 1, cosine - 1.0f);
-	A.setCoefficient(u1_id, 2 * row + 1, -sine);
-	A.setCoefficient(v1_id, 2 * row + 1, -cosine);
-	A.setCoefficient(v2_id, 2 * row + 1, 1);
-}
-
-static bool computeLeastSquaresConformalMap(Mesh *mesh) {
-	// For this to work properly, mesh should not have colocals that have the same
-	// attributes, unless you want the vertices to actually have different texcoords.
-	const uint32_t vertexCount = mesh->vertexCount();
-	const uint32_t D = 2 * vertexCount;
-	const uint32_t N = 2 * mesh->faceCount();
-	// N is the number of equations (one per triangle)
-	// D is the number of variables (one per vertex; there are 2 pinned vertices).
-	if (N < D - 4) {
-		return false;
-	}
-	sparse::Matrix A(D, N);
-	FullVector b(N);
-	FullVector x(D);
-	// Fill b:
-	b.fill(0.0f);
-	// Fill x:
-	uint32_t v0, v1;
-	if (!findApproximateDiameterVertices(mesh, &v0, &v1)) {
+// From OpenNL LSCM example.
+// Computes the coordinates of the vertices of a triangle in a local 2D orthonormal basis of the triangle's plane. p0 is the origin and p0 -> p1 is the X axis; results are written to z0, z1, z2; epsilon is forwarded to normalize().
+static void projectTriangle(Vector3 p0, Vector3 p1, Vector3 p2, Vector2 *z0, Vector2 *z1, Vector2 *z2, float epsilon)
+{
+	Vector3 X = normalize(p1 - p0, epsilon); // X axis: direction of edge p0 -> p1.
+	Vector3 Z = normalize(cross(X, p2 - p0), epsilon); // Normal of the triangle's plane.
+	Vector3 Y = cross(Z, X); // Y axis: perpendicular to both Z and X.
+	Vector3 &O = p0; // Origin of the local frame (alias of the by-value parameter p0).
+	*z0 = Vector2(0, 0);
+	*z1 = Vector2(length(p1 - O), 0); // On the X axis, so its y coordinate is exactly 0.
+	*z2 = Vector2(dot(p2 - O, X), dot(p2 - O, Y));
+}
+
+static bool computeLeastSquaresConformalMap(Mesh *mesh) // Builds the LSCM system with OpenNL, solves it and writes the result to the mesh texcoords. Returns false if no vertices to lock are found or the solve fails.
+{
+	uint32_t lockedVertex0, lockedVertex1; // The two vertices whose variables are locked below.
+	if (!findApproximateDiameterVertices(mesh, &lockedVertex0, &lockedVertex1)) {
 		// Mesh has no boundaries.
 		return false;
 	}
-	if (mesh->texcoord(v0) == mesh->texcoord(v1)) {
-		// LSCM expects an existing parameterization.
+	const uint32_t vertexCount = mesh->vertexCount();
+	opennl::NLContext *context = opennl::nlNewContext(); // Released with nlDeleteContext on both exit paths below.
+	opennl::nlSolverParameteri(context, NL_NB_VARIABLES, int(2 * vertexCount)); // Two variables (u, v) per vertex.
+	opennl::nlSolverParameteri(context, NL_MAX_ITERATIONS, int(5 * vertexCount));
+	opennl::nlBegin(context, NL_SYSTEM);
+	const Vector2 *texcoords = mesh->texcoords();
+	for (uint32_t i = 0; i < vertexCount; i++) { // Seed every variable with the current texcoords.
+		opennl::nlSetVariable(context, 2 * i, texcoords[i].x);
+		opennl::nlSetVariable(context, 2 * i + 1, texcoords[i].y);
+		if (i == lockedVertex0 || i == lockedVertex1) { // NOTE(review): the previous implementation returned false when both locked vertices had equal texcoords; that check is gone - confirm callers guarantee distinct values.
+			opennl::nlLockVariable(context, 2 * i);
+			opennl::nlLockVariable(context, 2 * i + 1);
+		} 
+	}
+	opennl::nlBegin(context, NL_MATRIX);
+	const uint32_t faceCount = mesh->faceCount();
+	const Vector3 *positions = mesh->positions();
+	const uint32_t *indices = mesh->indices();
+	for (uint32_t f = 0; f < faceCount; f++) { // Each face contributes two rows: a real part and an imaginary part.
+		const uint32_t v0 = indices[f * 3 + 0];
+		const uint32_t v1 = indices[f * 3 + 1];
+		const uint32_t v2 = indices[f * 3 + 2];
+		Vector2 z0, z1, z2;
+		projectTriangle(positions[v0], positions[v1], positions[v2], &z0, &z1, &z2, mesh->epsilon());
+		double a = z1.x - z0.x;
+		double b = z1.y - z0.y;
+		double c = z2.x - z0.x;
+		double d = z2.y - z0.y;
+		XA_DEBUG_ASSERT(b == 0.0); // projectTriangle sets both z0.y and z1.y to 0.
+		// Note  : 2*id + 0 --> u
+		//         2*id + 1 --> v
+		uint32_t u0_id = 2 * v0;
+		uint32_t v0_id = 2 * v0 + 1;
+		uint32_t u1_id = 2 * v1;
+		uint32_t v1_id = 2 * v1 + 1;
+		uint32_t u2_id = 2 * v2;
+		uint32_t v2_id = 2 * v2 + 1;
+		// Note : b = 0
+		// Real part
+		opennl::nlBegin(context, NL_ROW);
+		opennl::nlCoefficient(context, u0_id, -a+c) ;
+		opennl::nlCoefficient(context, v0_id, b-d) ;
+		opennl::nlCoefficient(context, u1_id, -c) ;
+		opennl::nlCoefficient(context, v1_id, d) ;
+		opennl::nlCoefficient(context, u2_id, a);
+		opennl::nlEnd(context, NL_ROW);
+		// Imaginary part
+		opennl::nlBegin(context, NL_ROW);
+		opennl::nlCoefficient(context, u0_id, -b+d);
+		opennl::nlCoefficient(context, v0_id, -a+c);
+		opennl::nlCoefficient(context, u1_id, -d);
+		opennl::nlCoefficient(context, v1_id, -c);
+		opennl::nlCoefficient(context, v2_id, a);
+		opennl::nlEnd(context, NL_ROW);
+	}
+	opennl::nlEnd(context, NL_MATRIX);
+	opennl::nlEnd(context, NL_SYSTEM);
+	if (!opennl::nlSolve(context)) { // Solve failed: free the context and leave the mesh texcoords untouched.
+		opennl::nlDeleteContext(context);
 		return false;
 	}
-	for (uint32_t v = 0; v < vertexCount; v++) {
-		// Initial solution.
-		x[2 * v + 0] = mesh->texcoord(v).x;
-		x[2 * v + 1] = mesh->texcoord(v).y;
-	}
-	// Fill A:
-	const uint32_t faceCount = mesh->faceCount();
-	for (uint32_t f = 0, t = 0; f < faceCount; f++) {
-		const uint32_t vertex0 = mesh->vertexAt(f * 3 + 0);
-		const uint32_t vertex1 = mesh->vertexAt(f * 3 + 1);
-		const uint32_t vertex2 = mesh->vertexAt(f * 3 + 2);
-		setup_abf_relations(A, t, vertex0, vertex1, vertex2, mesh->position(vertex0), mesh->position(vertex1), mesh->position(vertex2));
-		t++;
-	}
-	const uint32_t lockedParameters[] = {
-		2 * v0 + 0,
-		2 * v0 + 1,
-		2 * v1 + 0,
-		2 * v1 + 1
-	};
-	// Solve
-	Solver::LeastSquaresSolver(A, b, x, lockedParameters, 4, 0.000001f);
-	// Map x back to texcoords:
-	for (uint32_t v = 0; v < vertexCount; v++) {
-		mesh->texcoord(v) = Vector2(x[2 * v + 0], x[2 * v + 1]);
-		XA_DEBUG_ASSERT(!isNan(mesh->texcoord(v).x));
-		XA_DEBUG_ASSERT(!isNan(mesh->texcoord(v).y));
+	for (uint32_t i = 0; i < vertexCount; i++) { // Copy the solved variables back into the mesh texcoords.
+		const double u = opennl::nlGetVariable(context, 2 * i);
+		const double v = opennl::nlGetVariable(context, 2 * i + 1);
+		mesh->texcoord(i) = Vector2((float)u, (float)v); // The solver works in double; texcoords are stored as float.
+		XA_DEBUG_ASSERT(!isNan(mesh->texcoord(i).x));
+		XA_DEBUG_ASSERT(!isNan(mesh->texcoord(i).y));
 	}
+	opennl::nlDeleteContext(context);
 	return true;
 }
 
 #if XA_RECOMPUTE_CHARTS
-struct PiecewiseParam {
-	void reset(const Mesh *mesh, uint32_t faceCount) {
+struct PiecewiseParam
+{
+	void reset(const Mesh *mesh, uint32_t faceCount) // Binds the mesh and face count, and sizes the per-face and per-vertex storage.
+	{
 		m_mesh = mesh;
 		m_faceCount = faceCount;
 		const uint32_t vertexCount = m_mesh->vertexCount();
 		m_texcoords.resize(vertexCount);
 		m_patch.reserve(m_faceCount);
-		m_faceAssigned.resize(m_faceCount);
-		m_faceAssigned.zeroOutMemory();
+		m_candidates.reserve(m_faceCount);
+		m_faceInAnyPatch.resize(m_faceCount);
+		m_faceInAnyPatch.zeroOutMemory(); // The only per-face array zeroed here; the computeChart() code in view zeroes the other three at the start of each patch.
 		m_faceInvalid.resize(m_faceCount);
 		m_faceInPatch.resize(m_faceCount);
 		m_vertexInPatch.resize(vertexCount);
-		m_faceInCandidates.resize(m_faceCount);
+		m_faceToCandidate.resize(m_faceCount); // Only resized here; computeChart() zeroes it at the start of every patch.
 	}
 
 	ConstArrayView<uint32_t> chartFaces() const { return m_patch; }
 	const Vector2 *texcoords() const { return m_texcoords.data(); }
 
-	bool computeChart() {
+	bool computeChart()
+	{
+		// Clear per-patch state.
 		m_patch.clear();
+		m_candidates.clear();
+		m_faceToCandidate.zeroOutMemory();
 		m_faceInvalid.zeroOutMemory();
 		m_faceInPatch.zeroOutMemory();
 		m_vertexInPatch.zeroOutMemory();
 		// Add the seed face (first unassigned face) to the patch.
 		uint32_t seed = UINT32_MAX;
 		for (uint32_t f = 0; f < m_faceCount; f++) {
-			if (m_faceAssigned.get(f))
+			if (m_faceInAnyPatch.get(f))
 				continue;
 			seed = f;
-			m_patch.push_back(seed);
-			m_faceInPatch.set(seed);
-			m_faceAssigned.set(seed);
+			// Add all 3 vertices.
 			Vector2 texcoords[3];
 			orthoProjectFace(seed, texcoords);
 			for (uint32_t i = 0; i < 3; i++) {
@@ -5840,95 +6747,104 @@ struct PiecewiseParam {
 				m_vertexInPatch.set(vertex);
 				m_texcoords[vertex] = texcoords[i];
 			}
+			addFaceToPatch(seed);
+			// Initialize the boundary grid.
+			m_boundaryGrid.reset(m_texcoords.data(), m_mesh->indices());
+			for (Mesh::FaceEdgeIterator it(m_mesh, seed); !it.isDone(); it.advance())
+				m_boundaryGrid.append(it.edge());
 			break;
 		}
 		if (seed == UINT32_MAX)
 			return false;
 		for (;;) {
-			findCandidates();
-			if (m_candidates.isEmpty())
-				break;
-			for (;;) {
-				// Find the candidate with the lowest cost.
-				float lowestCost = FLT_MAX;
-				uint32_t bestCandidate = UINT32_MAX;
-				for (uint32_t i = 0; i < m_candidates.size(); i++) {
-					const Candidate &candidate = m_candidates[i];
-					if (m_faceInvalid.get(candidate.face)) // A candidate face may be invalidated after is was added.
-						continue;
-					if (candidate.maxCost < lowestCost) {
-						lowestCost = candidate.maxCost;
-						bestCandidate = i;
-					}
+			// Find the candidate with the lowest cost.
+			float lowestCost = FLT_MAX;
+			Candidate *bestCandidate = nullptr;
+			for (uint32_t i = 0; i < m_candidates.size(); i++) {
+				Candidate *candidate = m_candidates[i];
+				if (candidate->maxCost < lowestCost) {
+					lowestCost = candidate->maxCost;
+					bestCandidate = candidate;
 				}
-				if (bestCandidate == UINT32_MAX)
+			}
+			if (!bestCandidate)
+				break;
+			XA_DEBUG_ASSERT(!bestCandidate->prev); // Must be head of linked candidates.
+			// Compute the position by averaging linked candidates (candidates that share the same free vertex).
+			Vector2 position(0.0f);
+			uint32_t n = 0;
+			for (CandidateIterator it(bestCandidate); !it.isDone(); it.advance()) {
+				position += it.current()->position;
+				n++;
+			}
+			position *= 1.0f / (float)n;
+			const uint32_t freeVertex = bestCandidate->vertex;
+			XA_DEBUG_ASSERT(!isNan(position.x));
+			XA_DEBUG_ASSERT(!isNan(position.y));
+			m_texcoords[freeVertex] = position;
+			// Check for flipped faces. This is also done when candidates are first added, but the averaged position of the free vertex is different now, so check again.
+			bool invalid = false;
+			for (CandidateIterator it(bestCandidate); !it.isDone(); it.advance()) {
+				const uint32_t vertex0 = m_mesh->vertexAt(meshEdgeIndex0(it.current()->patchEdge));
+				const uint32_t vertex1 = m_mesh->vertexAt(meshEdgeIndex1(it.current()->patchEdge));
+				const float freeVertexOrient = orientToEdge(m_texcoords[vertex0], m_texcoords[vertex1], position);
+				if ((it.current()->patchVertexOrient < 0.0f && freeVertexOrient < 0.0f) || (it.current()->patchVertexOrient > 0.0f && freeVertexOrient > 0.0f)) {
+					invalid = true;
 					break;
-				// Compute the position by averaging linked candidates (candidates that share the same free vertex).
-				Vector2 position(0.0f);
-				uint32_t n = 0;
-				for (CandidateIterator it(m_candidates, bestCandidate); !it.isDone(); it.advance()) {
-					position += it.current().position;
-					n++;
 				}
-				position *= 1.0f / (float)n;
-				const uint32_t freeVertex = m_candidates[bestCandidate].vertex;
-				XA_DEBUG_ASSERT(!isNan(position.x));
-				XA_DEBUG_ASSERT(!isNan(position.y));
-				m_texcoords[freeVertex] = position;
-				// Check for flipped faces. This is also done when candidates are first added, but the averaged position of the free vertex is different now, so check again.
-				bool invalid = false;
-				for (CandidateIterator it(m_candidates, bestCandidate); !it.isDone(); it.advance()) {
-					const uint32_t vertex0 = m_mesh->vertexAt(meshEdgeIndex0(it.current().patchEdge));
-					const uint32_t vertex1 = m_mesh->vertexAt(meshEdgeIndex1(it.current().patchEdge));
-					const float freeVertexOrient = orientToEdge(m_texcoords[vertex0], m_texcoords[vertex1], position);
-					if ((it.current().patchVertexOrient < 0.0f && freeVertexOrient < 0.0f) || (it.current().patchVertexOrient > 0.0f && freeVertexOrient > 0.0f)) {
-						invalid = true;
-						break;
-					}
-				}
-				// Check for boundary intersection.
-				if (!invalid) {
-					m_boundaryGrid.reset(m_texcoords.data(), m_mesh->indices());
-					// Add edges on the patch boundary to the grid.
-					// Temporarily adding candidate faces to the patch makes it simpler to detect which edges are on the boundary.
-					const uint32_t oldPatchSize = m_patch.size();
-					for (CandidateIterator it(m_candidates, bestCandidate); !it.isDone(); it.advance())
-						m_patch.push_back(it.current().face);
-					for (uint32_t i = 0; i < m_patch.size(); i++) {
-						for (Mesh::FaceEdgeIterator it(m_mesh, m_patch[i]); !it.isDone(); it.advance()) {
-							const uint32_t oface = it.oppositeFace();
-							if (oface == UINT32_MAX || oface >= m_faceCount || !m_faceInPatch.get(oface))
-								m_boundaryGrid.append(it.edge());
-						}
+			}
+			// Check for boundary intersection.
+			if (!invalid) {
+				XA_PROFILE_START(parameterizeChartsPiecewiseBoundaryIntersection)
+				// Test candidate edges that would form part of the new patch boundary.
+				// Ignore boundary edges that would become internal if the candidate faces were added to the patch.
+				Array<uint32_t> newBoundaryEdges, ignoreEdges;
+				for (CandidateIterator candidateIt(bestCandidate); !candidateIt.isDone(); candidateIt.advance()) {
+					for (Mesh::FaceEdgeIterator it(m_mesh, candidateIt.current()->face); !it.isDone(); it.advance()) {
+						const uint32_t oface = it.oppositeFace();
+						if (oface == UINT32_MAX || oface >= m_faceCount || !m_faceInPatch.get(oface))
+							newBoundaryEdges.push_back(it.edge());
+						if (oface != UINT32_MAX && oface < m_faceCount && m_faceInPatch.get(oface))
+							ignoreEdges.push_back(it.oppositeEdge());
 					}
-					invalid = m_boundaryGrid.intersectSelf(m_mesh->epsilon());
-					m_patch.resize(oldPatchSize);
-				}
-				if (invalid) {
-					// Mark all faces of linked candidates as invalid.
-					for (CandidateIterator it(m_candidates, bestCandidate); !it.isDone(); it.advance())
-						m_faceInvalid.set(it.current().face);
-					continue;
-				}
-				// Add faces to the patch.
-				for (CandidateIterator it(m_candidates, bestCandidate); !it.isDone(); it.advance()) {
-					m_patch.push_back(it.current().face);
-					m_faceInPatch.set(it.current().face);
-					m_faceAssigned.set(it.current().face);
 				}
+				invalid = m_boundaryGrid.intersect(m_mesh->epsilon(), newBoundaryEdges, ignoreEdges);
+				XA_PROFILE_END(parameterizeChartsPiecewiseBoundaryIntersection)
+			}
+			if (invalid) {
+				// Mark all faces of linked candidates as invalid.
+				for (CandidateIterator it(bestCandidate); !it.isDone(); it.advance())
+					m_faceInvalid.set(it.current()->face);
+				removeLinkedCandidates(bestCandidate);
+			} else {
 				// Add vertex to the patch.
 				m_vertexInPatch.set(freeVertex);
+				// Add faces to the patch.
+				for (CandidateIterator it(bestCandidate); !it.isDone(); it.advance())
+					addFaceToPatch(it.current()->face);
 				// Successfully added candidate face(s) to patch.
-				break;
+				removeLinkedCandidates(bestCandidate);
+				// Reset the grid with all edges on the patch boundary.
+				XA_PROFILE_START(parameterizeChartsPiecewiseBoundaryIntersection)
+				m_boundaryGrid.reset(m_texcoords.data(), m_mesh->indices());
+				for (uint32_t i = 0; i < m_patch.size(); i++) {
+					for (Mesh::FaceEdgeIterator it(m_mesh, m_patch[i]); !it.isDone(); it.advance()) {
+						const uint32_t oface = it.oppositeFace();
+						if (oface == UINT32_MAX || oface >= m_faceCount || !m_faceInPatch.get(oface))
+							m_boundaryGrid.append(it.edge());
+					}
+				}
+				XA_PROFILE_END(parameterizeChartsPiecewiseBoundaryIntersection)
 			}
 		}
 		return true;
 	}
 
 private:
-	struct Candidate {
+	struct Candidate
+	{
 		uint32_t face, vertex;
-		uint32_t next; // The next candidate with the same vertex.
+		Candidate *prev, *next; // The previous/next candidate with the same vertex.
 		Vector2 position;
 		float cost;
 		float maxCost; // Of all linked candidates.
@@ -5936,90 +6852,70 @@ private:
 		float patchVertexOrient;
 	};
 
-	struct CandidateIterator {
-		CandidateIterator(Array<Candidate> &candidates, uint32_t first) :
-				m_candidates(candidates), m_current(first) {}
-		void advance() {
-			if (m_current != UINT32_MAX) m_current = m_candidates[m_current].next;
-		}
-		bool isDone() const { return m_current == UINT32_MAX; }
-		Candidate &current() { return m_candidates[m_current]; }
+	struct CandidateIterator // Walks a chain of candidates through Candidate::next, starting at the given head.
+	{
+		CandidateIterator(Candidate *head) : m_current(head) { XA_DEBUG_ASSERT(!head->prev); } // head is expected to be the first element of its chain (no prev).
+		void advance() { if (m_current != nullptr) { m_current = m_current->next; } } // Does nothing once the end has been reached.
+		bool isDone() const { return !m_current; }
+		Candidate *current() { return m_current; } // Null when isDone() is true.
 
 	private:
-		Array<Candidate> &m_candidates;
-		uint32_t m_current;
+		Candidate *m_current; // Current element, or null after the last one.
 	};
 
 	const Mesh *m_mesh;
 	uint32_t m_faceCount;
 	Array<Vector2> m_texcoords;
-	Array<Candidate> m_candidates;
-	BitArray m_faceInCandidates;
-	Array<uint32_t> m_patch;
-	BitArray m_faceAssigned; // Face is assigned to a previous chart or the current patch.
-	BitArray m_faceInPatch, m_vertexInPatch;
+	BitArray m_faceInAnyPatch; // Face is in a previous chart patch or the current patch.
+	Array<Candidate *> m_candidates; // Candidates for faces adjacent to the current patch; entries are allocated in addCandidateFace.
+	Array<Candidate *> m_faceToCandidate; // Indexed by face: that face's candidate, or null if it has none.
+	Array<uint32_t> m_patch; // Faces of the current chart patch.
+	BitArray m_faceInPatch, m_vertexInPatch; // Face/vertex is in the current patch.
 	BitArray m_faceInvalid; // Face cannot be added to the patch - flipped, cost too high or causes boundary intersection.
 	UniformGrid2 m_boundaryGrid;
 
-	// Find candidate faces on the patch front.
-	void findCandidates() {
-		m_candidates.clear();
-		m_faceInCandidates.zeroOutMemory();
-		for (uint32_t i = 0; i < m_patch.size(); i++) {
-			for (Mesh::FaceEdgeIterator it(m_mesh, m_patch[i]); !it.isDone(); it.advance()) {
-				const uint32_t oface = it.oppositeFace();
-				if (oface == UINT32_MAX || oface >= m_faceCount || m_faceAssigned.get(oface) || m_faceInCandidates.get(oface))
-					continue;
-				// Found an active edge on the patch front.
-				// Find the free vertex (the vertex that isn't on the active edge).
-				// Compute the orientation of the other patch face vertex to the active edge.
-				uint32_t freeVertex = UINT32_MAX;
-				float orient = 0.0f;
-				for (uint32_t j = 0; j < 3; j++) {
-					const uint32_t vertex = m_mesh->vertexAt(oface * 3 + j);
-					if (vertex != it.vertex0() && vertex != it.vertex1()) {
-						freeVertex = vertex;
-						orient = orientToEdge(m_texcoords[it.vertex0()], m_texcoords[it.vertex1()], m_texcoords[m_mesh->vertexAt(m_patch[i] * 3 + j)]);
-						break;
-					}
-				}
-				XA_DEBUG_ASSERT(freeVertex != UINT32_MAX);
-				// If the free vertex is already in the patch, the face is enclosed by the patch. Add the face to the patch - don't need to assign texcoords.
-				if (m_vertexInPatch.get(freeVertex)) {
-					freeVertex = UINT32_MAX;
-					m_patch.push_back(oface);
-					m_faceAssigned.set(oface);
-					continue;
-				}
-				// Check this here rather than above so faces enclosed by the patch are always added.
-				if (m_faceInvalid.get(oface))
-					continue;
-				addCandidateFace(it.edge(), orient, oface, it.oppositeEdge(), freeVertex);
-			}
-		}
-		// Link candidates that share the same vertex.
-		for (uint32_t i = 0; i < m_candidates.size(); i++) {
-			if (m_candidates[i].next != UINT32_MAX)
+	void addFaceToPatch(uint32_t face) // Appends face to the current patch, marks it in both face bit arrays, then offers each eligible neighbouring face as a candidate.
+	{
+		XA_DEBUG_ASSERT(!m_faceInPatch.get(face));
+		XA_DEBUG_ASSERT(!m_faceInAnyPatch.get(face));
+		m_patch.push_back(face);
+		m_faceInPatch.set(face);
+		m_faceInAnyPatch.set(face);
+		// Find new candidate faces on the patch incident to the newly added face.
+		for (Mesh::FaceEdgeIterator it(m_mesh, face); !it.isDone(); it.advance()) {
+			const uint32_t oface = it.oppositeFace();
+			if (oface == UINT32_MAX || oface >= m_faceCount || m_faceInAnyPatch.get(oface) || m_faceToCandidate[oface]) // Skip edges with no usable neighbour: none, out of range, already in a patch, or already a candidate.
 				continue;
-			uint32_t current = i;
-			for (uint32_t j = i + 1; j < m_candidates.size(); j++) {
-				if (m_candidates[j].vertex == m_candidates[current].vertex) {
-					m_candidates[current].next = j;
-					current = j;
+			// Found an active edge on the patch front.
+			// Find the free vertex (the vertex of the opposite face that isn't on the active edge).
+			// Compute the orientation of the other patch face vertex to the active edge.
+			uint32_t freeVertex = UINT32_MAX;
+			float orient = 0.0f;
+			for (uint32_t j = 0; j < 3; j++) {
+				const uint32_t vertex = m_mesh->vertexAt(oface * 3 + j);
+				if (vertex != it.vertex0() && vertex != it.vertex1()) {
+					freeVertex = vertex;
+					orient = orientToEdge(m_texcoords[it.vertex0()], m_texcoords[it.vertex1()], m_texcoords[m_mesh->vertexAt(face * 3 + j)]); // NOTE(review): j is the free vertex's corner slot in oface; confirm the same slot in face is the patch vertex opposite the active edge.
+					break;
 				}
 			}
-		}
-		// Set max cost for linked candidates.
-		for (uint32_t i = 0; i < m_candidates.size(); i++) {
-			float maxCost = 0.0f;
-			for (CandidateIterator it(m_candidates, i); !it.isDone(); it.advance())
-				maxCost = max(maxCost, it.current().cost);
-			for (CandidateIterator it(m_candidates, i); !it.isDone(); it.advance())
-				it.current().maxCost = maxCost;
+			XA_DEBUG_ASSERT(freeVertex != UINT32_MAX);
+			// If the free vertex is already in the patch, the face is enclosed by the patch. Add the face to the patch - don't need to assign texcoords. (Disabled: the code below is commented out, and its two-argument call does not match addFaceToPatch(uint32_t).)
+			/*if (m_vertexInPatch.get(freeVertex)) {
+				freeVertex = UINT32_MAX;
+				addFaceToPatch(oface, false);
+				continue;
+			}*/
+			// Check this here rather than above so faces enclosed by the patch are always added. NOTE(review): with the enclosed-face branch disabled this ordering currently has no effect.
+			if (m_faceInvalid.get(oface))
+				continue;
+			addCandidateFace(it.edge(), orient, oface, it.oppositeEdge(), freeVertex);
 		}
 	}
 
-	void addCandidateFace(uint32_t patchEdge, float patchVertexOrient, uint32_t face, uint32_t edge, uint32_t freeVertex) {
+	void addCandidateFace(uint32_t patchEdge, float patchVertexOrient, uint32_t face, uint32_t edge, uint32_t freeVertex)
+	{
+		XA_DEBUG_ASSERT(!m_faceToCandidate[face]);
 		Vector2 texcoords[3];
 		orthoProjectFace(face, texcoords);
 		// Find corresponding vertices between the patch edge and candidate edge.
@@ -6040,8 +6936,9 @@ private:
 		const Vector2 localEdgeVec = texcoords[localVertex1] - texcoords[localVertex0];
 		const float len1 = length(patchEdgeVec);
 		const float len2 = length(localEdgeVec);
+		if (len1 <= 0.0f || len2 <= 0.0f)
+			return; // Zero length edge.
 		const float scale = len1 / len2;
-		XA_ASSERT(scale > 0.0f);
 		for (uint32_t i = 0; i < 3; i++)
 			texcoords[i] *= scale;
 		// Translate to the first vertex on the patch edge.
@@ -6063,6 +6960,8 @@ private:
 			uv.x = x + texcoords[localVertex0].x;
 			uv.y = y + texcoords[localVertex0].y;
 		}
+		if (isNan(texcoords[localFreeVertex].x) || isNan(texcoords[localFreeVertex].y))
+			return;
 		// Check for local overlap (flipped triangle).
 		// The patch face vertex that isn't on the active edge and the free vertex should be oriented on opposite sides to the active edge.
 		const float freeVertexOrient = orientToEdge(m_texcoords[vertex0], m_texcoords[vertex1], texcoords[localFreeVertex]);
@@ -6083,19 +6982,72 @@ private:
 		}
 #endif
 		// Add the candidate.
-		Candidate candidate;
-		candidate.face = face;
-		candidate.vertex = freeVertex;
-		candidate.position = texcoords[localFreeVertex];
-		candidate.next = UINT32_MAX;
-		candidate.cost = cost;
-		candidate.patchEdge = patchEdge;
-		candidate.patchVertexOrient = patchVertexOrient;
+		Candidate *candidate = XA_ALLOC(MemTag::Default, Candidate);
+		candidate->face = face;
+		candidate->vertex = freeVertex;
+		candidate->position = texcoords[localFreeVertex];
+		candidate->prev = candidate->next = nullptr;
+		candidate->cost = candidate->maxCost = cost;
+		candidate->patchEdge = patchEdge;
+		candidate->patchVertexOrient = patchVertexOrient;
 		m_candidates.push_back(candidate);
-		m_faceInCandidates.set(face);
+		m_faceToCandidate[face] = candidate;
+		// Link with candidates that share the same vertex. Append to tail.
+		for (uint32_t i = 0; i < m_candidates.size() - 1; i++) {
+			if (m_candidates[i]->vertex == candidate->vertex) {
+				Candidate *tail = m_candidates[i];
+				for (;;) {
+					if (tail->next)
+						tail = tail->next;
+					else
+						break;
+				}
+				candidate->prev = tail;
+				candidate->next = nullptr;
+				tail->next = candidate;
+				break;
+			}
+		}
+		// Set max cost for linked candidates.
+		Candidate *head = linkedCandidateHead(candidate);
+		float maxCost = 0.0f;
+		for (CandidateIterator it(head); !it.isDone(); it.advance())
+			maxCost = max(maxCost, it.current()->cost);
+		for (CandidateIterator it(head); !it.isDone(); it.advance())
+			it.current()->maxCost = maxCost;
+	}
+
+	Candidate *linkedCandidateHead(Candidate *candidate)
+	{
+		Candidate *current = candidate;
+		for (;;) {
+			if (!current->prev)
+				break;
+			current = current->prev;
+		}
+		return current;
 	}
 
-	void orthoProjectFace(uint32_t face, Vector2 *texcoords) const {
+	void removeLinkedCandidates(Candidate *head)
+	{
+		XA_DEBUG_ASSERT(!head->prev);
+		Candidate *current = head;
+		while (current) {
+			Candidate *next = current->next;
+			m_faceToCandidate[current->face] = nullptr;
+			for (uint32_t i = 0; i < m_candidates.size(); i++) {
+				if (m_candidates[i] == current) {
+					m_candidates.removeAt(i);
+					break;
+				}
+			}
+			XA_FREE(current);
+			current = next;
+		}
+	}
+
+	void orthoProjectFace(uint32_t face, Vector2 *texcoords) const
+	{
 		const Vector3 normal = m_mesh->computeFaceNormal(face);
 		const Vector3 tangent = normalize(m_mesh->position(m_mesh->vertexAt(face * 3 + 1)) - m_mesh->position(m_mesh->vertexAt(face * 3 + 0)), kEpsilon);
 		const Vector3 bitangent = cross(normal, tangent);
@@ -6105,14 +7057,16 @@ private:
 		}
 	}
 
-	float parametricArea(const Vector2 *texcoords) const {
+	float parametricArea(const Vector2 *texcoords) const
+	{
 		const Vector2 &v1 = texcoords[0];
 		const Vector2 &v2 = texcoords[1];
 		const Vector2 &v3 = texcoords[2];
 		return ((v2.x - v1.x) * (v3.y - v1.y) - (v3.x - v1.x) * (v2.y - v1.y)) * 0.5f;
 	}
 
-	float computeStretch(Vector3 p1, Vector3 p2, Vector3 p3, Vector2 t1, Vector2 t2, Vector2 t3) const {
+	float computeStretch(Vector3 p1, Vector3 p2, Vector3 p3, Vector2 t1, Vector2 t2, Vector2 t3) const
+	{
 		float parametricArea = ((t2.y - t1.y) * (t3.x - t1.x) - (t3.y - t1.y) * (t2.x - t1.x)) * 0.5f;
 		if (isZero(parametricArea, kAreaEpsilon))
 			return FLT_MAX;
@@ -6126,14 +7080,16 @@ private:
 	}
 
 	// Return value is positive if the point is one side of the edge, negative if on the other side.
-	float orientToEdge(Vector2 edgeVertex0, Vector2 edgeVertex1, Vector2 point) const {
+	float orientToEdge(Vector2 edgeVertex0, Vector2 edgeVertex1, Vector2 point) const
+	{
 		return (edgeVertex0.x - point.x) * (edgeVertex1.y - point.y) - (edgeVertex0.y - point.y) * (edgeVertex1.x - point.x);
 	}
 };
 #endif
 
 // Estimate quality of existing parameterization.
-struct Quality {
+struct Quality
+{
 	// computeBoundaryIntersection
 	bool boundaryIntersection = false;
 
@@ -6150,13 +7106,14 @@ struct Quality {
 	float conformalMetric = 0.0f;
 	float authalicMetric = 0.0f;
 
-	void computeBoundaryIntersection(const Mesh *mesh, UniformGrid2 &boundaryGrid) {
+	void computeBoundaryIntersection(const Mesh *mesh, UniformGrid2 &boundaryGrid)
+	{
 		const Array<uint32_t> &boundaryEdges = mesh->boundaryEdges();
 		const uint32_t boundaryEdgeCount = boundaryEdges.size();
 		boundaryGrid.reset(mesh->texcoords(), mesh->indices(), boundaryEdgeCount);
 		for (uint32_t i = 0; i < boundaryEdgeCount; i++)
 			boundaryGrid.append(boundaryEdges[i]);
-		boundaryIntersection = boundaryGrid.intersectSelf(mesh->epsilon());
+		boundaryIntersection = boundaryGrid.intersect(mesh->epsilon());
 #if XA_DEBUG_EXPORT_BOUNDARY_GRID
 		static int exportIndex = 0;
 		char filename[256];
@@ -6166,7 +7123,8 @@ struct Quality {
 #endif
 	}
 
-	void computeFlippedFaces(const Mesh *mesh, uint32_t faceCount, Array<uint32_t> *flippedFaces) {
+	void computeFlippedFaces(const Mesh *mesh, uint32_t faceCount, Array<uint32_t> *flippedFaces)
+	{
 		totalTriangleCount = flippedTriangleCount = zeroAreaTriangleCount = 0;
 		if (flippedFaces)
 			flippedFaces->clear();
@@ -6201,7 +7159,8 @@ struct Quality {
 				flippedFaces->clear();
 			flippedTriangleCount = 0;
 		}
-		if (flippedTriangleCount > totalTriangleCount / 2) {
+		if (flippedTriangleCount > totalTriangleCount / 2)
+		{
 			// If more than half the triangles are flipped, reverse the flipped / not flipped classification.
 			flippedTriangleCount = totalTriangleCount - flippedTriangleCount;
 			if (flippedFaces) {
@@ -6223,7 +7182,8 @@ struct Quality {
 		}
 	}
 
-	void computeMetrics(const Mesh *mesh, uint32_t faceCount) {
+	void computeMetrics(const Mesh *mesh, uint32_t faceCount)
+	{
 		totalGeometricArea = totalParametricArea = 0.0f;
 		stretchMetric = maxStretchMetric = conformalMetric = authalicMetric = 0.0f;
 		for (uint32_t f = 0; f < faceCount; f++) {
@@ -6254,7 +7214,7 @@ struct Quality {
 			const float a = dot(Ss, Ss); // E
 			const float b = dot(Ss, St); // F
 			const float c = dot(St, St); // G
-					// Compute eigen-values of the first fundamental form:
+			// Compute eigen-values of the first fundamental form:
 			const float sigma1 = sqrtf(0.5f * max(0.0f, a + c - sqrtf(square(a - c) + 4 * square(b)))); // gamma uppercase, min eigenvalue.
 			const float sigma2 = sqrtf(0.5f * max(0.0f, a + c + sqrtf(square(a - c) + 4 * square(b)))); // gamma lowercase, max eigenvalue.
 			XA_ASSERT(sigma2 > sigma1 || equal(sigma1, sigma2, kEpsilon));
@@ -6285,62 +7245,59 @@ struct Quality {
 		if (totalGeometricArea > 0.0f) {
 			const float normFactor = sqrtf(totalParametricArea / totalGeometricArea);
 			stretchMetric = sqrtf(stretchMetric / totalGeometricArea) * normFactor;
-			maxStretchMetric *= normFactor;
+			maxStretchMetric  *= normFactor;
 			conformalMetric = sqrtf(conformalMetric / totalGeometricArea);
 			authalicMetric = sqrtf(authalicMetric / totalGeometricArea);
 		}
 	}
 };
 
-struct ChartWarningFlags {
-	enum Enum {
-		CloseHolesFailed = 1 << 1,
-		FixTJunctionsDuplicatedEdge = 1 << 2,
-		FixTJunctionsFailed = 1 << 3,
-		TriangulateDuplicatedEdge = 1 << 4,
+struct ChartWarningFlags
+{
+	enum Enum
+	{
+		CloseHolesFailed = 1<<1,
+		FixTJunctionsDuplicatedEdge = 1<<2,
+		FixTJunctionsFailed = 1<<3,
+		TriangulateDuplicatedEdge = 1<<4,
 	};
 };
 
-struct ChartCtorBuffers {
+struct ChartCtorBuffers
+{
 	Array<uint32_t> chartMeshIndices;
 	Array<uint32_t> unifiedMeshIndices;
 	Array<uint32_t> boundaryLoops;
 };
 
-/// A chart is a connected set of faces with a certain topology (usually a disk).
-class Chart {
+class Chart
+{
 public:
-	Chart(ChartCtorBuffers &buffers, const Basis &basis, ConstArrayView<uint32_t> faces, const Mesh *originalMesh, uint32_t meshId, uint32_t chartGroupId, uint32_t chartId) :
-			m_basis(basis), m_mesh(nullptr), m_unifiedMesh(nullptr), m_unmodifiedUnifiedMesh(nullptr), m_type(ChartType::LSCM), m_warningFlags(0), m_closedHolesCount(0), m_fixedTJunctionsCount(0) {
-		XA_UNUSED(meshId);
+	Chart(ChartCtorBuffers &buffers, const ParameterizeOptions &options, const Basis &basis, ConstArrayView<uint32_t> faces, const Mesh *sourceMesh, uint32_t chartGroupId, uint32_t chartId) : m_basis(basis), m_mesh(nullptr), m_unifiedMesh(nullptr), m_unmodifiedUnifiedMesh(nullptr), m_type(ChartType::LSCM), m_warningFlags(0), m_closedHolesCount(0), m_fixedTJunctionsCount(0), m_isInvalid(false)
+	{
 		XA_UNUSED(chartGroupId);
 		XA_UNUSED(chartId);
-		m_faceArray.copyFrom(faces.data, faces.length);
-		// Copy face indices.
-		m_mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, originalMesh->epsilon(), m_faceArray.size() * 3, m_faceArray.size());
-		m_unifiedMesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, originalMesh->epsilon(), m_faceArray.size() * 3, m_faceArray.size());
-		Array<uint32_t> &chartMeshIndices = buffers.chartMeshIndices;
-		chartMeshIndices.resize(originalMesh->vertexCount());
-		chartMeshIndices.setAll(UINT32_MAX);
-		Array<uint32_t> &unifiedMeshIndices = buffers.unifiedMeshIndices;
-		unifiedMeshIndices.resize(originalMesh->vertexCount());
-		unifiedMeshIndices.setAll(UINT32_MAX);
+		m_faceToSourceFaceMap.copyFrom(faces.data, faces.length);
+		const uint32_t approxVertexCount = min(faces.length * 3, sourceMesh->vertexCount());
+		m_mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, sourceMesh->epsilon(), approxVertexCount, faces.length);
+		m_unifiedMesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, sourceMesh->epsilon(), approxVertexCount, faces.length);
+		HashMap<uint32_t, PassthroughHash<uint32_t>> sourceVertexToUnifiedVertexMap(MemTag::Mesh, approxVertexCount), sourceVertexToChartVertexMap(MemTag::Mesh, approxVertexCount);
 		// Add vertices.
-		const uint32_t faceCount = m_initialFaceCount = m_faceArray.size();
+		const uint32_t faceCount = m_initialFaceCount = faces.length;
 		for (uint32_t f = 0; f < faceCount; f++) {
 			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t vertex = originalMesh->vertexAt(m_faceArray[f] * 3 + i);
-				const uint32_t unifiedVertex = originalMesh->firstColocal(vertex);
-				if (unifiedMeshIndices[unifiedVertex] == (uint32_t)~0) {
-					unifiedMeshIndices[unifiedVertex] = m_unifiedMesh->vertexCount();
-					XA_DEBUG_ASSERT(equal(originalMesh->position(vertex), originalMesh->position(unifiedVertex), originalMesh->epsilon()));
-					m_unifiedMesh->addVertex(originalMesh->position(vertex));
+				const uint32_t sourceVertex = sourceMesh->vertexAt(m_faceToSourceFaceMap[f] * 3 + i);
+				const uint32_t sourceUnifiedVertex = sourceMesh->firstColocal(sourceVertex);
+				uint32_t unifiedVertex = sourceVertexToUnifiedVertexMap.get(sourceUnifiedVertex);
+				if (unifiedVertex == UINT32_MAX) {
+					unifiedVertex = sourceVertexToUnifiedVertexMap.add(sourceUnifiedVertex);
+					m_unifiedMesh->addVertex(sourceMesh->position(sourceVertex));
 				}
-				if (chartMeshIndices[vertex] == (uint32_t)~0) {
-					chartMeshIndices[vertex] = m_mesh->vertexCount();
-					m_chartToOriginalMap.push_back(vertex);
-					m_chartToUnifiedMap.push_back(unifiedMeshIndices[unifiedVertex]);
-					m_mesh->addVertex(originalMesh->position(vertex), Vector3(0.0f), originalMesh->texcoord(vertex));
+				if (sourceVertexToChartVertexMap.get(sourceVertex) == UINT32_MAX) {
+					sourceVertexToChartVertexMap.add(sourceVertex);
+					m_vertexToSourceVertexMap.push_back(sourceVertex);
+					m_chartVertexToUnifiedVertexMap.push_back(unifiedVertex);
+					m_mesh->addVertex(sourceMesh->position(sourceVertex), Vector3(0.0f), sourceMesh->texcoord(sourceVertex));
 				}
 			}
 		}
@@ -6348,9 +7305,12 @@ public:
 		for (uint32_t f = 0; f < faceCount; f++) {
 			uint32_t indices[3], unifiedIndices[3];
 			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t vertex = originalMesh->vertexAt(m_faceArray[f] * 3 + i);
-				indices[i] = chartMeshIndices[vertex];
-				unifiedIndices[i] = unifiedMeshIndices[originalMesh->firstColocal(vertex)];
+				const uint32_t sourceVertex = sourceMesh->vertexAt(m_faceToSourceFaceMap[f] * 3 + i);
+				const uint32_t sourceUnifiedVertex = sourceMesh->firstColocal(sourceVertex);
+				indices[i] = sourceVertexToChartVertexMap.get(sourceVertex);
+				XA_DEBUG_ASSERT(indices[i] != UINT32_MAX);
+				unifiedIndices[i] = sourceVertexToUnifiedVertexMap.get(sourceUnifiedVertex);
+				XA_DEBUG_ASSERT(unifiedIndices[i] != UINT32_MAX);
 			}
 			Mesh::AddFaceResult::Enum result = m_mesh->addFace(indices);
 			XA_UNUSED(result);
@@ -6368,15 +7328,18 @@ public:
 			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
 		}
 		m_mesh->createBoundaries(); // For AtlasPacker::computeBoundingBox
+		m_mesh->destroyEdgeMap(); // Only needed it for createBoundaries.
 		m_unifiedMesh->createBoundaries();
-		if (meshIsPlanar(*m_unifiedMesh))
+		if (meshIsPlanar(*m_unifiedMesh)) {
 			m_type = ChartType::Planar;
-		else {
-			m_unifiedMesh->linkBoundaries();
+			return;
+		}
+		m_unifiedMesh->linkBoundaries();
 #if XA_DEBUG_EXPORT_OBJ_BEFORE_FIX_TJUNCTION
-			m_unifiedMesh->writeObjFile("debug_before_fix_tjunction.obj");
+		m_unifiedMesh->writeObjFile("debug_before_fix_tjunction.obj");
 #endif
-			bool duplicatedEdge = false, failed = false;
+		bool duplicatedEdge = false, failed = false;
+		if (options.fixTJunctions) {
 			XA_PROFILE_START(fixChartMeshTJunctions)
 			Mesh *fixedUnifiedMesh = meshFixTJunctions(*m_unifiedMesh, &duplicatedEdge, &failed, &m_fixedTJunctionsCount);
 			XA_PROFILE_END(fixChartMeshTJunctions)
@@ -6391,6 +7354,8 @@ public:
 				m_unifiedMesh->linkBoundaries();
 				m_initialFaceCount = m_unifiedMesh->faceCount(); // Fixing t-junctions rewrites faces.
 			}
+		}
+		if (options.closeHoles) {
 			// See if there are any holes that need closing.
 			Array<uint32_t> &boundaryLoops = buffers.boundaryLoops;
 			meshGetBoundaryLoops(*m_unifiedMesh, boundaryLoops);
@@ -6421,7 +7386,7 @@ public:
 #if XA_DEBUG_EXPORT_OBJ_CLOSE_HOLES_ERROR
 				if (m_warningFlags & ChartWarningFlags::CloseHolesFailed) {
 					char filename[256];
-					XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_chart_%03u_close_holes_error.obj", meshId, chartGroupId, chartId);
+					XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_chart_%03u_close_holes_error.obj", sourceMesh->id(), chartGroupId, chartId);
 					FILE *file;
 					XA_FOPEN(file, filename, "w");
 					if (file) {
@@ -6450,88 +7415,57 @@ public:
 	}
 
 #if XA_RECOMPUTE_CHARTS
-	Chart(ChartCtorBuffers &buffers, const Chart *parent, const Mesh *parentMesh, ConstArrayView<uint32_t> faces, const Vector2 *texcoords, const Mesh *originalMesh, uint32_t meshId, uint32_t chartGroupId, uint32_t chartId) :
-			m_mesh(nullptr), m_unifiedMesh(nullptr), m_unmodifiedUnifiedMesh(nullptr), m_type(ChartType::Piecewise), m_warningFlags(0), m_closedHolesCount(0), m_fixedTJunctionsCount(0) {
-		XA_UNUSED(meshId);
-		XA_UNUSED(chartGroupId);
-		XA_UNUSED(chartId);
+	Chart(ChartCtorBuffers &buffers, const Chart *parent, const Mesh *parentMesh, ConstArrayView<uint32_t> faces, const Vector2 *texcoords, const Mesh *sourceMesh) : m_mesh(nullptr), m_unifiedMesh(nullptr), m_unmodifiedUnifiedMesh(nullptr), m_type(ChartType::Piecewise), m_warningFlags(0), m_closedHolesCount(0), m_fixedTJunctionsCount(0), m_isInvalid(false)
+	{
 		const uint32_t faceCount = m_initialFaceCount = faces.length;
-		m_faceArray.resize(faceCount);
+		m_faceToSourceFaceMap.resize(faceCount);
 		for (uint32_t i = 0; i < faceCount; i++)
-			m_faceArray[i] = parent->m_faceArray[faces[i]]; // Map faces to parent chart original mesh.
+			m_faceToSourceFaceMap[i] = parent->m_faceToSourceFaceMap[faces[i]]; // Map faces to parent chart source mesh.
 		// Copy face indices.
-		m_mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, originalMesh->epsilon(), m_faceArray.size() * 3, m_faceArray.size());
-		m_unifiedMesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, originalMesh->epsilon(), m_faceArray.size() * 3, m_faceArray.size());
+		m_mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, sourceMesh->epsilon(), m_faceToSourceFaceMap.size() * 3, m_faceToSourceFaceMap.size());
 		Array<uint32_t> &chartMeshIndices = buffers.chartMeshIndices;
-		chartMeshIndices.resize(originalMesh->vertexCount());
-		chartMeshIndices.setAll(UINT32_MAX);
-		Array<uint32_t> &unifiedMeshIndices = buffers.unifiedMeshIndices;
-		unifiedMeshIndices.resize(originalMesh->vertexCount());
-		unifiedMeshIndices.setAll(UINT32_MAX);
+		chartMeshIndices.resize(sourceMesh->vertexCount());
+		chartMeshIndices.fillBytes(0xff);
 		// Add vertices.
 		for (uint32_t f = 0; f < faceCount; f++) {
 			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t vertex = originalMesh->vertexAt(m_faceArray[f] * 3 + i);
-				const uint32_t unifiedVertex = originalMesh->firstColocal(vertex);
+				const uint32_t vertex = sourceMesh->vertexAt(m_faceToSourceFaceMap[f] * 3 + i);
 				const uint32_t parentVertex = parentMesh->vertexAt(faces[f] * 3 + i);
-				if (unifiedMeshIndices[unifiedVertex] == (uint32_t)~0) {
-					unifiedMeshIndices[unifiedVertex] = m_unifiedMesh->vertexCount();
-					XA_DEBUG_ASSERT(equal(originalMesh->position(vertex), originalMesh->position(unifiedVertex), originalMesh->epsilon()));
-					m_unifiedMesh->addVertex(originalMesh->position(vertex), Vector3(0.0f), texcoords[parentVertex]);
-				}
 				if (chartMeshIndices[vertex] == (uint32_t)~0) {
 					chartMeshIndices[vertex] = m_mesh->vertexCount();
-					m_chartToOriginalMap.push_back(vertex);
-					m_chartToUnifiedMap.push_back(unifiedMeshIndices[unifiedVertex]);
-					m_mesh->addVertex(originalMesh->position(vertex), Vector3(0.0f), texcoords[parentVertex]);
+					m_vertexToSourceVertexMap.push_back(vertex);
+					m_mesh->addVertex(sourceMesh->position(vertex), Vector3(0.0f), texcoords[parentVertex]);
 				}
 			}
 		}
 		// Add faces.
 		for (uint32_t f = 0; f < faceCount; f++) {
-			uint32_t indices[3], unifiedIndices[3];
+			uint32_t indices[3];
 			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t vertex = originalMesh->vertexAt(m_faceArray[f] * 3 + i);
+				const uint32_t vertex = sourceMesh->vertexAt(m_faceToSourceFaceMap[f] * 3 + i);
 				indices[i] = chartMeshIndices[vertex];
-				unifiedIndices[i] = unifiedMeshIndices[originalMesh->firstColocal(vertex)];
 			}
 			Mesh::AddFaceResult::Enum result = m_mesh->addFace(indices);
 			XA_UNUSED(result);
 			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
-#if XA_DEBUG
-			// Unifying colocals may create degenerate edges. e.g. if two triangle vertices are colocal.
-			for (int i = 0; i < 3; i++) {
-				const uint32_t index1 = unifiedIndices[i];
-				const uint32_t index2 = unifiedIndices[(i + 1) % 3];
-				XA_DEBUG_ASSERT(index1 != index2);
-			}
-#endif
-			result = m_unifiedMesh->addFace(unifiedIndices);
-			XA_UNUSED(result);
-			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
 		}
 		m_mesh->createBoundaries(); // For AtlasPacker::computeBoundingBox
-		m_unifiedMesh->createBoundaries();
-		m_unifiedMesh->linkBoundaries();
+		m_mesh->destroyEdgeMap(); // Only needed it for createBoundaries.
+		// Need to store texcoords for backup/restore so packing can be run multiple times.
+		backupTexcoords();
 	}
 #endif
 
-	~Chart() {
+	~Chart()
+	{
 		if (m_mesh) {
 			m_mesh->~Mesh();
 			XA_FREE(m_mesh);
 		}
-		if (m_unifiedMesh) {
-			m_unifiedMesh->~Mesh();
-			XA_FREE(m_unifiedMesh);
-		}
-		if (m_unmodifiedUnifiedMesh) {
-			m_unmodifiedUnifiedMesh->~Mesh();
-			XA_FREE(m_unmodifiedUnifiedMesh);
-		}
+		destroyUnifiedMesh();
 	}
 
-	const Basis &basis() const { return m_basis; }
+	bool isInvalid() const { return m_isInvalid; }
 	ChartType::Enum type() const { return m_type; }
 	uint32_t warningFlags() const { return m_warningFlags; }
 	uint32_t closedHolesCount() const { return m_closedHolesCount; }
@@ -6541,45 +7475,71 @@ public:
 #if XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION
 	const Array<uint32_t> &paramFlippedFaces() const { return m_paramFlippedFaces; }
 #endif
-	uint32_t mapFaceToSourceFace(uint32_t i) const { return m_faceArray[i]; }
+	uint32_t mapFaceToSourceFace(uint32_t i) const { return m_faceToSourceFaceMap[i]; }
+	uint32_t mapChartVertexToSourceVertex(uint32_t i) const { return m_vertexToSourceVertexMap[i]; }
 	const Mesh *mesh() const { return m_mesh; }
 	Mesh *mesh() { return m_mesh; }
 	const Mesh *unifiedMesh() const { return m_unifiedMesh; }
-	Mesh *unifiedMesh() { return m_unifiedMesh; }
 	const Mesh *unmodifiedUnifiedMesh() const { return m_unmodifiedUnifiedMesh; }
-	uint32_t mapChartVertexToOriginalVertex(uint32_t i) const { return m_chartToOriginalMap[i]; }
-
-	void evaluateOrthoQuality(UniformGrid2 &boundaryGrid) {
-		XA_PROFILE_START(parameterizeChartsEvaluateQuality)
-		m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
-		m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, nullptr);
-		m_quality.computeMetrics(m_unifiedMesh, m_initialFaceCount);
-		XA_PROFILE_END(parameterizeChartsEvaluateQuality)
-		// Use orthogonal parameterization if quality is acceptable.
-		if (!m_quality.boundaryIntersection && m_quality.totalGeometricArea > 0.0f && m_quality.stretchMetric <= 1.1f && m_quality.maxStretchMetric <= 1.25f)
-			m_type = ChartType::Ortho;
-	}
-
-	void evaluateQuality(UniformGrid2 &boundaryGrid) {
-		XA_PROFILE_START(parameterizeChartsEvaluateQuality)
-		m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
+
+	void parameterize(const ParameterizeOptions &options, UniformGrid2 &boundaryGrid)
+	{
+		XA_PROFILE_START(parameterizeChartsOrthogonal)
+		{
+			// Project vertices to plane.
+			const uint32_t vertexCount = m_unifiedMesh->vertexCount();
+			for (uint32_t i = 0; i < vertexCount; i++)
+				m_unifiedMesh->texcoord(i) = Vector2(dot(m_basis.tangent, m_unifiedMesh->position(i)), dot(m_basis.bitangent, m_unifiedMesh->position(i)));
+		}
+		XA_PROFILE_END(parameterizeChartsOrthogonal)
+		// Computing charts checks for flipped triangles and boundary intersection. Don't need to do that again here if chart is planar.
+		if (m_type != ChartType::Planar) {
+			XA_PROFILE_START(parameterizeChartsEvaluateQuality)
+			m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
+			m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, nullptr);
+			m_quality.computeMetrics(m_unifiedMesh, m_initialFaceCount);
+			XA_PROFILE_END(parameterizeChartsEvaluateQuality)
+			// Use orthogonal parameterization if quality is acceptable.
+			if (!m_quality.boundaryIntersection && m_quality.flippedTriangleCount == 0 && m_quality.totalGeometricArea > 0.0f && m_quality.stretchMetric <= 1.1f && m_quality.maxStretchMetric <= 1.25f)
+				m_type = ChartType::Ortho;
+		}
+		if (m_type == ChartType::LSCM) {
+			XA_PROFILE_START(parameterizeChartsLSCM)
+			if (options.func) {
+				options.func(&m_unifiedMesh->position(0).x, &m_unifiedMesh->texcoord(0).x, m_unifiedMesh->vertexCount(), m_unifiedMesh->indices(), m_unifiedMesh->indexCount());
+			}
+			else
+				computeLeastSquaresConformalMap(m_unifiedMesh);
+			XA_PROFILE_END(parameterizeChartsLSCM)
+			XA_PROFILE_START(parameterizeChartsEvaluateQuality)
+			m_quality.computeBoundaryIntersection(m_unifiedMesh, boundaryGrid);
 #if XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION
-		m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, &m_paramFlippedFaces);
+			m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, &m_paramFlippedFaces);
 #else
-		m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, nullptr);
+			m_quality.computeFlippedFaces(m_unifiedMesh, m_initialFaceCount, nullptr);
 #endif
-		// Don't need to call computeMetrics here, that's only used in evaluateOrthoQuality to determine if quality is acceptable enough to use ortho projection.
-		XA_PROFILE_END(parameterizeChartsEvaluateQuality)
-	}
-
-	// Transfer parameterization from unified mesh to chart mesh.
-	void transferParameterization() {
+			// Don't need to call computeMetrics here, that's only used by the orthogonal quality check earlier in this function to determine if quality is acceptable enough to use ortho projection.
+			if (m_quality.boundaryIntersection || m_quality.flippedTriangleCount > 0)
+				m_isInvalid = true;
+			XA_PROFILE_END(parameterizeChartsEvaluateQuality)
+		}
+#if XA_DEBUG_ALL_CHARTS_INVALID
+		m_isInvalid = true;
+#endif
+		// Transfer parameterization from unified mesh to chart mesh.
 		const uint32_t vertexCount = m_mesh->vertexCount();
 		for (uint32_t v = 0; v < vertexCount; v++)
-			m_mesh->texcoord(v) = m_unifiedMesh->texcoord(m_chartToUnifiedMap[v]);
+			m_mesh->texcoord(v) = m_unifiedMesh->texcoord(m_chartVertexToUnifiedVertexMap[v]);
+		// Can destroy unified mesh now.
+		// But not if the parameterization is invalid, the unified mesh will be needed for PiecewiseParameterization.
+		if (!m_isInvalid)
+			destroyUnifiedMesh();
+		// Need to store texcoords for backup/restore so packing can be run multiple times.
+		backupTexcoords();
 	}
 
-	Vector2 computeParametricBounds() const {
+	Vector2 computeParametricBounds() const
+	{
 		Vector2 minCorner(FLT_MAX, FLT_MAX);
 		Vector2 maxCorner(-FLT_MAX, -FLT_MAX);
 		const uint32_t vertexCount = m_mesh->vertexCount();
@@ -6590,7 +7550,34 @@ public:
 		return (maxCorner - minCorner) * 0.5f;
 	}
 
+	void restoreTexcoords()
+	{
+		memcpy(m_mesh->texcoords(), m_backupTexcoords.data(), m_mesh->vertexCount() * sizeof(Vector2));
+	}
+
 private:
+	void backupTexcoords()
+	{
+		m_backupTexcoords.resize(m_mesh->vertexCount());
+		memcpy(m_backupTexcoords.data(), m_mesh->texcoords(), m_mesh->vertexCount() * sizeof(Vector2));
+	}
+
+	void destroyUnifiedMesh()
+	{
+		if (m_unifiedMesh) {
+			m_unifiedMesh->~Mesh();
+			XA_FREE(m_unifiedMesh);
+			m_unifiedMesh = nullptr;
+		}
+		if (m_unmodifiedUnifiedMesh) {
+			m_unmodifiedUnifiedMesh->~Mesh();
+			XA_FREE(m_unmodifiedUnifiedMesh);
+			m_unmodifiedUnifiedMesh = nullptr;
+		}
+		// Don't need this when unified meshes are destroyed.
+		m_chartVertexToUnifiedVertexMap.destroy();
+	}
+
 	Basis m_basis;
 	Mesh *m_mesh;
 	Mesh *m_unifiedMesh;
@@ -6600,467 +7587,511 @@ private:
 	uint32_t m_initialFaceCount; // Before fixing T-junctions and/or closing holes.
 	uint32_t m_closedHolesCount, m_fixedTJunctionsCount;
 
-	// List of faces of the original mesh that belong to this chart.
-	Array<uint32_t> m_faceArray;
+	// List of faces of the source mesh that belong to this chart.
+	Array<uint32_t> m_faceToSourceFaceMap;
 
-	// Map vertices of the chart mesh to vertices of the original mesh.
-	Array<uint32_t> m_chartToOriginalMap;
+	// Map vertices of the chart mesh to vertices of the source mesh.
+	Array<uint32_t> m_vertexToSourceVertexMap;
 
-	Array<uint32_t> m_chartToUnifiedMap;
+	Array<uint32_t> m_chartVertexToUnifiedVertexMap;
+
+	Array<Vector2> m_backupTexcoords;
 
 	Quality m_quality;
 #if XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION
 	Array<uint32_t> m_paramFlippedFaces;
 #endif
+	bool m_isInvalid;
 };
 
-struct CreateChartTaskArgs {
-	const Mesh *mesh;
+struct CreateAndParameterizeChartTaskArgs
+{
 	const Basis *basis;
+	ThreadLocal<UniformGrid2> *boundaryGrid;
+	Chart *chart; // output
+	Array<Chart *> charts; // output (if more than one chart)
+	ThreadLocal<ChartCtorBuffers> *chartBuffers;
+	const Mesh *mesh;
+	const ParameterizeOptions *options;
+#if XA_RECOMPUTE_CHARTS
+	ThreadLocal<PiecewiseParam> *pp;
+#endif
 	ConstArrayView<uint32_t> faces;
-	uint32_t meshId;
 	uint32_t chartGroupId;
 	uint32_t chartId;
-	ThreadLocal<ChartCtorBuffers> *chartBuffers;
-	Chart **chart;
 };
 
-static void runCreateChartTask(void *userData) {
-	XA_PROFILE_START(createChartMeshesThread)
-	auto args = (CreateChartTaskArgs *)userData;
-	*(args->chart) = XA_NEW_ARGS(MemTag::Default, Chart, args->chartBuffers->get(), *(args->basis), args->faces, args->mesh, args->meshId, args->chartGroupId, args->chartId);
-	XA_PROFILE_END(createChartMeshesThread)
-}
-
-struct ParameterizeChartTaskArgs {
-	Chart *chart;
-	ParameterizeFunc func;
-	ThreadLocal<UniformGrid2> *boundaryGrid;
-};
-
-static void runParameterizeChartTask(void *userData) {
-	auto args = (ParameterizeChartTaskArgs *)userData;
-	Mesh *mesh = args->chart->unifiedMesh();
-	XA_PROFILE_START(parameterizeChartsOrthogonal) {
-		// Project vertices to plane.
-		const uint32_t vertexCount = mesh->vertexCount();
-		const Basis &basis = args->chart->basis();
-		for (uint32_t i = 0; i < vertexCount; i++)
-			mesh->texcoord(i) = Vector2(dot(basis.tangent, mesh->position(i)), dot(basis.bitangent, mesh->position(i)));
-	}
-	XA_PROFILE_END(parameterizeChartsOrthogonal)
-	// Computing charts checks for flipped triangles and boundary intersection. Don't need to do that again here if chart is planar.
-	if (args->chart->type() != ChartType::Planar)
-		args->chart->evaluateOrthoQuality(args->boundaryGrid->get());
-	if (args->chart->type() == ChartType::LSCM) {
-		XA_PROFILE_START(parameterizeChartsLSCM)
-		if (args->func)
-			args->func(&mesh->position(0).x, &mesh->texcoord(0).x, mesh->vertexCount(), mesh->indices(), mesh->indexCount());
-		else
-			computeLeastSquaresConformalMap(mesh);
-		XA_PROFILE_END(parameterizeChartsLSCM)
-		args->chart->evaluateQuality(args->boundaryGrid->get());
+static void runCreateAndParameterizeChartTask(void *userData)
+{
+	auto args = (CreateAndParameterizeChartTaskArgs *)userData;
+	XA_PROFILE_START(createChartMesh)
+	args->chart = XA_NEW_ARGS(MemTag::Default, Chart, args->chartBuffers->get(), *args->options, *args->basis, args->faces, args->mesh, args->chartGroupId, args->chartId);
+	XA_PROFILE_END(createChartMesh)
+	args->chart->parameterize(*args->options, args->boundaryGrid->get());
+#if XA_RECOMPUTE_CHARTS
+	if (!args->chart->isInvalid())
+		return;
+	// Recompute charts with invalid parameterizations.
+	XA_PROFILE_START(parameterizeChartsRecompute)
+	Chart *invalidChart = args->chart;
+	// Fixing t-junctions rewrites unified mesh faces, and we need to map faces back to input mesh. So use the unmodified unified mesh.
+	const Mesh *invalidMesh = invalidChart->unmodifiedUnifiedMesh();
+	uint32_t faceCount = 0;
+	if (invalidMesh) {
+		faceCount = invalidMesh->faceCount();
+	} else {
+		invalidMesh = invalidChart->unifiedMesh();
+		faceCount = invalidChart->initialFaceCount(); // Not invalidMesh->faceCount(). Don't want faces added by hole closing.
 	}
-	// Transfer parameterization from unified mesh to chart mesh.
-	args->chart->transferParameterization();
-}
-
-// Set of charts corresponding to mesh faces in the same face group.
-class ChartGroup {
-public:
-	ChartGroup(uint32_t id, const Mesh *sourceMesh, uint16_t faceGroup) :
-			m_sourceId(sourceMesh->id()), m_id(id), m_isVertexMap(faceGroup == Mesh::kInvalidFaceGroup), m_paramAddedChartsCount(0), m_paramDeletedChartsCount(0) {
-		// Create new mesh from the source mesh, using faces that belong to this group.
-		const uint32_t sourceFaceCount = sourceMesh->faceCount();
-		if (!m_isVertexMap) {
-			m_faceToSourceFaceMap.reserve(sourceMesh->faceGroupFaceCount(faceGroup));
-			for (Mesh::GroupFaceIterator it(sourceMesh, faceGroup); !it.isDone(); it.advance())
-				m_faceToSourceFaceMap.push_back(it.face());
-		} else {
-			for (uint32_t f = 0; f < sourceFaceCount; f++) {
-				if (sourceMesh->faceGroupAt(f) == faceGroup)
-					m_faceToSourceFaceMap.push_back(f);
+	PiecewiseParam &pp = args->pp->get();
+	pp.reset(invalidMesh, faceCount);
+#if XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS
+	char filename[256];
+	XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_chart_%03u_recomputed.obj", args->mesh->id(), args->chartGroupId, args->chartId);
+	FILE *file;
+	XA_FOPEN(file, filename, "w");
+	uint32_t subChartIndex = 0;
+#endif
+	for (;;) {
+		XA_PROFILE_START(parameterizeChartsPiecewise)
+		const bool facesRemaining = pp.computeChart();
+		XA_PROFILE_END(parameterizeChartsPiecewise)
+		if (!facesRemaining)
+			break;
+		Chart *chart = XA_NEW_ARGS(MemTag::Default, Chart, args->chartBuffers->get(), invalidChart, invalidMesh, pp.chartFaces(), pp.texcoords(), args->mesh);
+		args->charts.push_back(chart);
+#if XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS
+		if (file) {
+			for (uint32_t j = 0; j < invalidMesh->vertexCount(); j++) {
+				fprintf(file, "v %g %g %g\n", invalidMesh->position(j).x, invalidMesh->position(j).y, invalidMesh->position(j).z);
+				fprintf(file, "vt %g %g\n", pp.texcoords()[j].x, pp.texcoords()[j].y);
 			}
-		}
-		// Only initial meshes have face groups and ignored faces. The only flag we care about is HasNormals.
-		const uint32_t faceCount = m_faceToSourceFaceMap.size();
-		XA_DEBUG_ASSERT(faceCount > 0);
-		const uint32_t approxVertexCount = faceCount * 3;
-		m_mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, sourceMesh->epsilon(), approxVertexCount, faceCount, sourceMesh->flags() & MeshFlags::HasNormals);
-		m_vertexToSourceVertexMap.reserve(approxVertexCount);
-		HashMap<uint32_t> sourceVertexToVertexMap(MemTag::Mesh, approxVertexCount);
-		for (uint32_t f = 0; f < faceCount; f++) {
-			const uint32_t face = m_faceToSourceFaceMap[f];
-			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t vertex = sourceMesh->vertexAt(face * 3 + i);
-				if (sourceVertexToVertexMap.get(vertex) == UINT32_MAX) {
-					sourceVertexToVertexMap.add(vertex);
-					m_vertexToSourceVertexMap.push_back(vertex);
-					Vector3 normal(0.0f);
-					if (sourceMesh->flags() & MeshFlags::HasNormals)
-						normal = sourceMesh->normal(vertex);
-					m_mesh->addVertex(sourceMesh->position(vertex), normal, sourceMesh->texcoord(vertex));
+			fprintf(file, "o chart%03u\n", subChartIndex);
+			fprintf(file, "s off\n");
+			for (uint32_t f = 0; f < pp.chartFaces().length; f++) {
+				fprintf(file, "f ");
+				const uint32_t face = pp.chartFaces()[f];
+				for (uint32_t j = 0; j < 3; j++) {
+					const uint32_t index = invalidMesh->vertexCount() * subChartIndex + invalidMesh->vertexAt(face * 3 + j) + 1; // 1-indexed
+					fprintf(file, "%d/%d/%c", index, index, j == 2 ? '\n' : ' ');
 				}
 			}
 		}
-		// Add faces.
-		for (uint32_t f = 0; f < faceCount; f++) {
-			const uint32_t face = m_faceToSourceFaceMap[f];
-			uint32_t indices[3];
-			for (uint32_t i = 0; i < 3; i++) {
-				const uint32_t vertex = sourceMesh->vertexAt(face * 3 + i);
-				indices[i] = sourceVertexToVertexMap.get(vertex);
-				XA_DEBUG_ASSERT(indices[i] != UINT32_MAX);
-			}
-			// Don't copy flags, it doesn't matter if a face is ignored after this point. All ignored faces get their own vertex map (m_isVertexMap) ChartGroup.
-			// Don't hash edges if m_isVertexMap, they may be degenerate.
-			Mesh::AddFaceResult::Enum result = m_mesh->addFace(indices, false, !m_isVertexMap);
-			XA_UNUSED(result);
-			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
-		}
-		if (!m_isVertexMap) {
-			m_mesh->createColocals();
-			m_mesh->createBoundaries();
-		}
-#if XA_DEBUG_EXPORT_OBJ_CHART_GROUPS
-		char filename[256];
-		XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u.obj", m_sourceId, m_id);
-		m_mesh->writeObjFile(filename);
-#else
-		XA_UNUSED(m_id);
+		subChartIndex++;
+#endif
+	}
+#if XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS
+	if (file)
+		fclose(file);
 #endif
+	XA_PROFILE_END(parameterizeChartsRecompute)
+#endif // XA_RECOMPUTE_CHARTS
+}
+
+// Set of charts corresponding to mesh faces in the same face group.
+class ChartGroup
+{
+public:
+	ChartGroup(uint32_t id, const Mesh *sourceMesh, const MeshFaceGroups *sourceMeshFaceGroups, MeshFaceGroups::Handle faceGroup) : m_id(id), m_sourceMesh(sourceMesh), m_sourceMeshFaceGroups(sourceMeshFaceGroups), m_faceGroup(faceGroup), m_faceCount(0), m_paramAddedChartsCount(0), m_paramDeletedChartsCount(0)
+	{
 	}
 
-	~ChartGroup() {
-		m_mesh->~Mesh();
-		XA_FREE(m_mesh);
+	~ChartGroup()
+	{
 		for (uint32_t i = 0; i < m_charts.size(); i++) {
 			m_charts[i]->~Chart();
 			XA_FREE(m_charts[i]);
 		}
 	}
 
+	uint32_t segmentChartCount() const { return m_chartBasis.size(); }
 	uint32_t chartCount() const { return m_charts.size(); }
 	Chart *chartAt(uint32_t i) const { return m_charts[i]; }
+	uint32_t faceCount() const { return m_faceCount; }
 	uint32_t paramAddedChartsCount() const { return m_paramAddedChartsCount; }
 	uint32_t paramDeletedChartsCount() const { return m_paramDeletedChartsCount; }
-	bool isVertexMap() const { return m_isVertexMap; }
-	uint32_t mapFaceToSourceFace(uint32_t face) const { return m_faceToSourceFaceMap[face]; }
-	uint32_t mapVertexToSourceVertex(uint32_t i) const { return m_vertexToSourceVertexMap[i]; }
-	const Mesh *mesh() const { return m_mesh; }
 
-	/*
-	Compute charts using a simple segmentation algorithm.
-
-	LSCM:
-	- identify sharp features using local dihedral angles.
-	- identify seed faces farthest from sharp features.
-	- grow charts from these seeds.
-
-	MCGIM:
-	- phase 1: chart growth
-	  - grow all charts simultaneously using dijkstra search on the dual graph of the mesh.
-	  - graph edges are weighted based on planarity metric.
-	  - metric uses distance to global chart normal.
-	  - terminate when all faces have been assigned.
-	- phase 2: seed computation:
-	  - place new seed of the chart at the most interior face.
-	  - most interior is evaluated using distance metric only.
-
-	- method repeates the two phases, until the location of the seeds does not change.
-	  - cycles are detected by recording all the previous seeds and chartification terminates.
-
-	D-Charts:
-
-	- Uniaxial conic metric:
-	  - N_c = axis of the generalized cone that best fits the chart. (cone can a be cylinder or a plane).
-	  - omega_c = angle between the face normals and the axis.
-	  - Fitting error between chart C and tringle t: F(c,t) = (N_c*n_t - cos(omega_c))^2
-
-	- Compactness metrics:
-	  - Roundness:
-		- C(c,t) = pi * D(S_c,t)^2 / A_c
-		- S_c = chart seed.
-		- D(S_c,t) = length of the shortest path inside the chart betwen S_c and t.
-		- A_c = chart area.
-	  - Straightness:
-		- P(c,t) = l_out(c,t) / l_in(c,t)
-		- l_out(c,t) = lenght of the edges not shared between C and t.
-		- l_in(c,t) = lenght of the edges shared between C and t.
-
-	- Combined metric:
-	  - Cost(c,t) = F(c,t)^alpha + C(c,t)^beta + P(c,t)^gamma
-	  - alpha = 1, beta = 0.7, gamma = 0.5
-
-	Our basic approach:
-	- Just one iteration of k-means?
-	- Avoid dijkstra by greedily growing charts until a threshold is met. Increase threshold and repeat until no faces left.
-	- If distortion metric is too high, split chart, add two seeds.
-	- If chart size is low, try removing chart.
-
-	Postprocess:
-	- If topology is not disk:
-	  - Fill holes, if new faces fit proxy.
-	  - Find best cut, otherwise.
-	- After parameterization:
-	  - If boundary self-intersects:
-		- cut chart along the closest two diametral boundary vertices, repeat parametrization.
-		- what if the overlap is on an appendix? How do we find that out and cut appropiately?
-		  - emphasize roundness metrics to prevent those cases.
-	  - If interior self-overlaps: preserve boundary parameterization and use mean-value map.
-	*/
-	void computeCharts(TaskScheduler *taskScheduler, const ChartOptions &options, segment::Atlas &atlas, ThreadLocal<ChartCtorBuffers> *chartBuffers) {
-		m_chartOptions = options;
-		// This function may be called multiple times, so destroy existing charts.
-		for (uint32_t i = 0; i < m_charts.size(); i++) {
-			m_charts[i]->~Chart();
-			XA_FREE(m_charts[i]);
-		}
-		m_charts.clear();
+	void computeChartFaces(const ChartOptions &options, segment::Atlas &atlas)
+	{
+		// Create mesh from source mesh, using only the faces in this face group.
+		XA_PROFILE_START(createChartGroupMesh)
+		Mesh *mesh = createMesh();
+		XA_PROFILE_END(createChartGroupMesh)
+		// Segment mesh into charts (arrays of faces).
 #if XA_DEBUG_SINGLE_CHART
-		Array<uint32_t> chartFaces;
-		chartFaces.resize(m_mesh->faceCount());
-		for (uint32_t i = 0; i < chartFaces.size(); i++)
-			chartFaces[i] = i;
-		Chart *chart = XA_NEW_ARGS(MemTag::Default, Chart, m_mesh, chartFaces, m_sourceId, m_id, 0);
-		m_charts.push_back(chart);
+		m_chartBasis.resize(1); // Debug path: the whole chart group becomes a single chart.
+		Fit::computeBasis(&mesh->position(0), mesh->vertexCount(), &m_chartBasis[0]);
+		m_chartFaces.resize(1 + mesh->faceCount()); // NOTE(review): mesh is only destroyed in the #else branch below, so this debug path never frees it.
+		m_chartFaces[0] = mesh->faceCount(); // Encoding: <face count> followed by the faces.
+		for (uint32_t i = 0; i < mesh->faceCount(); i++) // Slot 0 holds the count; looping to m_chartFaces.size() wrote one element past the end.
+			m_chartFaces[i + 1] = m_faceToSourceFaceMap[i]; // Store source mesh faces, same as the non-debug path.
 #else
 		XA_PROFILE_START(buildAtlas)
-		atlas.reset(m_sourceId, m_id, m_mesh, options);
-		buildAtlas(atlas, options);
+		atlas.reset(mesh, options);
+		atlas.compute();
 		XA_PROFILE_END(buildAtlas)
-		const uint32_t chartCount = atlas.chartCount();
-		m_charts.resize(chartCount);
-		Array<CreateChartTaskArgs> taskArgs;
-		taskArgs.resize(chartCount);
-		for (uint32_t i = 0; i < chartCount; i++) {
-			CreateChartTaskArgs &args = taskArgs[i];
-			args.basis = &atlas.chartBasis(i);
-			args.faces = atlas.chartFaces(i);
-			args.mesh = m_mesh;
-			args.meshId = m_sourceId;
-			args.chartGroupId = m_id;
-			args.chartId = i;
-			args.chartBuffers = chartBuffers;
-			args.chart = &m_charts[i];
-		}
-		XA_PROFILE_START(createChartMeshesReal)
-		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartCount);
-		for (uint32_t i = 0; i < chartCount; i++) {
-			Task task;
-			task.userData = &taskArgs[i];
-			task.func = runCreateChartTask;
-			taskScheduler->run(taskGroup, task);
-		}
-		taskScheduler->wait(&taskGroup);
-		XA_PROFILE_END(createChartMeshesReal)
-#endif
 #if XA_DEBUG_EXPORT_OBJ_CHARTS
 		char filename[256];
-		XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_charts.obj", m_sourceId, m_id);
+		XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_charts.obj", m_sourceMesh->id(), m_id);
 		FILE *file;
 		XA_FOPEN(file, filename, "w");
 		if (file) {
-			m_mesh->writeObjVertices(file);
-			for (uint32_t i = 0; i < chartCount; i++) {
+			mesh->writeObjVertices(file);
+			for (uint32_t i = 0; i < atlas.chartCount(); i++) {
 				fprintf(file, "o chart_%04d\n", i);
 				fprintf(file, "s off\n");
-				const Array<uint32_t> &faces = builder.chartFaces(i);
-				for (uint32_t f = 0; f < faces.size(); f++)
-					m_mesh->writeObjFace(file, faces[f]);
+				ConstArrayView<uint32_t> faces = atlas.chartFaces(i);
+				for (uint32_t f = 0; f < faces.length; f++)
+					mesh->writeObjFace(file, faces[f]);
 			}
-			m_mesh->writeObjBoundaryEges(file);
-			m_mesh->writeObjLinkedBoundaries(file);
+			mesh->writeObjBoundaryEges(file);
+			mesh->writeObjLinkedBoundaries(file);
 			fclose(file);
 		}
 #endif
+		// Destroy mesh.
+		const uint32_t faceCount = mesh->faceCount();
+		mesh->~Mesh();
+		XA_FREE(mesh);
+		XA_PROFILE_START(copyChartFaces)
+		// Copy basis.
+		const uint32_t chartCount = atlas.chartCount();
+		m_chartBasis.resize(chartCount);
+		for (uint32_t i = 0; i < chartCount; i++)
+			m_chartBasis[i] = atlas.chartBasis(i);
+		// Copy faces from segment::Atlas to m_chartFaces array with <chart 0 face count> <face 0> <face n> <chart 1 face count> etc. encoding.
+		// segment::Atlas faces refer to the chart group mesh. Map them to the input mesh instead.
+		m_chartFaces.resize(chartCount + faceCount);
+		uint32_t offset = 0;
+		for (uint32_t i = 0; i < chartCount; i++) {
+			ConstArrayView<uint32_t> faces = atlas.chartFaces(i);
+			m_chartFaces[offset++] = faces.length;
+			for (uint32_t j = 0; j < faces.length; j++)
+				m_chartFaces[offset++] = m_faceToSourceFaceMap[faces[j]];
+		}
+		XA_PROFILE_END(copyChartFaces)
+#endif
 	}
 
 #if XA_RECOMPUTE_CHARTS
-	void parameterizeCharts(TaskScheduler *taskScheduler, ParameterizeFunc func, ThreadLocal<UniformGrid2> *boundaryGrid, ThreadLocal<ChartCtorBuffers> *chartBuffers, ThreadLocal<PiecewiseParam> *piecewiseParam)
+	void parameterizeCharts(TaskScheduler *taskScheduler, const ParameterizeOptions &options, ThreadLocal<UniformGrid2> *boundaryGrid, ThreadLocal<ChartCtorBuffers> *chartBuffers, ThreadLocal<PiecewiseParam> *piecewiseParam)
 #else
-	void parameterizeCharts(TaskScheduler *taskScheduler, ParameterizeFunc func, ThreadLocal<UniformGrid2> *boundaryGrid, ThreadLocal<ChartCtorBuffers> * /*chartBuffers*/)
+	void parameterizeCharts(TaskScheduler* taskScheduler, const ParameterizeOptions &options, ThreadLocal<UniformGrid2>* boundaryGrid, ThreadLocal<ChartCtorBuffers>* chartBuffers)
 #endif
 	{
+		// This function may be called multiple times, so destroy existing charts.
+		for (uint32_t i = 0; i < m_charts.size(); i++) {
+			m_charts[i]->~Chart();
+			XA_FREE(m_charts[i]);
+		}
 		m_paramAddedChartsCount = 0;
-		const uint32_t chartCount = m_charts.size();
-		Array<ParameterizeChartTaskArgs> taskArgs;
+		const uint32_t chartCount = m_chartBasis.size();
+		Array<CreateAndParameterizeChartTaskArgs> taskArgs;
 		taskArgs.resize(chartCount);
+		taskArgs.runCtors(); // Has Array member.
 		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartCount);
+		uint32_t offset = 0;
 		for (uint32_t i = 0; i < chartCount; i++) {
-			ParameterizeChartTaskArgs &args = taskArgs[i];
-			args.chart = m_charts[i];
-			args.func = func;
+			CreateAndParameterizeChartTaskArgs &args = taskArgs[i];
+			args.basis = &m_chartBasis[i];
 			args.boundaryGrid = boundaryGrid;
+			args.chart = nullptr;
+			args.chartGroupId = m_id;
+			args.chartId = i;
+			args.chartBuffers = chartBuffers;
+			const uint32_t faceCount = m_chartFaces[offset++];
+			args.faces = ConstArrayView<uint32_t>(&m_chartFaces[offset], faceCount);
+			offset += faceCount;
+			args.mesh = m_sourceMesh;
+			args.options = &options;
+#if XA_RECOMPUTE_CHARTS
+			args.pp = piecewiseParam;
+#endif
 			Task task;
 			task.userData = &args;
-			task.func = runParameterizeChartTask;
+			task.func = runCreateAndParameterizeChartTask;
 			taskScheduler->run(taskGroup, task);
 		}
 		taskScheduler->wait(&taskGroup);
 #if XA_RECOMPUTE_CHARTS
-		// Find charts with invalid parameterizations.
-		Array<Chart *> invalidCharts;
+		// Count charts. Skip invalid ones and include new ones added by recomputing.
+		uint32_t newChartCount = 0;
 		for (uint32_t i = 0; i < chartCount; i++) {
-			Chart *chart = m_charts[i];
-			const Quality &quality = chart->quality();
-			if (quality.boundaryIntersection || quality.flippedTriangleCount > 0)
-				invalidCharts.push_back(chart);
+			if (taskArgs[i].chart->isInvalid())
+				newChartCount += taskArgs[i].charts.size();
+			else
+				newChartCount++;
 		}
-		if (invalidCharts.isEmpty())
-			return;
-		// Recompute charts with invalid parameterizations.
-		PiecewiseParam &pp = piecewiseParam->get();
-		for (uint32_t i = 0; i < invalidCharts.size(); i++) {
-			Chart *invalidChart = invalidCharts[i];
-			// Fixing t-junctions rewrites unified mesh faces, and we need to map faces back to input mesh. So use the unmodified unified mesh.
-			const Mesh *invalidMesh = invalidChart->unmodifiedUnifiedMesh();
-			uint32_t faceCount = 0;
-			if (invalidMesh) {
-				faceCount = invalidMesh->faceCount();
-			} else {
-				invalidMesh = invalidChart->unifiedMesh();
-				faceCount = invalidChart->initialFaceCount(); // Not invalidMesh->faceCount(). Don't want faces added by hole closing.
+		m_charts.resize(newChartCount);
+		// Add valid charts first. Destroy invalid ones.
+		uint32_t current = 0;
+		for (uint32_t i = 0; i < chartCount; i++) {
+			Chart *chart = taskArgs[i].chart;
+			if (chart->isInvalid()) {
+				chart->~Chart();
+				XA_FREE(chart);
+				m_paramDeletedChartsCount++;
+				continue;
 			}
-			pp.reset(invalidMesh, faceCount);
-#if XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS
-			char filename[256];
-			XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u_recomputed_chart_%03u.obj", m_sourceId, m_id, m_paramAddedChartsCount);
-			FILE *file;
-			XA_FOPEN(file, filename, "w");
-			uint32_t subChartIndex = 0;
-#endif
-			for (;;) {
-				if (!pp.computeChart())
-					break;
-				Chart *chart = XA_NEW_ARGS(MemTag::Default, Chart, chartBuffers->get(), invalidChart, invalidMesh, pp.chartFaces(), pp.texcoords(), m_mesh, m_sourceId, m_id, m_charts.size());
-				m_charts.push_back(chart);
-#if XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS
-				if (file) {
-					for (uint32_t j = 0; j < invalidMesh->vertexCount(); j++) {
-						fprintf(file, "v %g %g %g\n", invalidMesh->position(j).x, invalidMesh->position(j).y, invalidMesh->position(j).z);
-						fprintf(file, "vt %g %g\n", pp.texcoords()[j].x, pp.texcoords()[j].y);
-					}
-					fprintf(file, "o chart%03u\n", subChartIndex);
-					fprintf(file, "s off\n");
-					for (uint32_t f = 0; f < pp.chartFaces().length; f++) {
-						fprintf(file, "f ");
-						const uint32_t face = pp.chartFaces()[f];
-						for (uint32_t j = 0; j < 3; j++) {
-							const uint32_t index = invalidMesh->vertexCount() * subChartIndex + invalidMesh->vertexAt(face * 3 + j) + 1; // 1-indexed
-							fprintf(file, "%d/%d/%c", index, index, j == 2 ? '\n' : ' ');
-						}
-					}
-				}
-				subChartIndex++;
-#endif
+			m_charts[current++] = chart;
+		}
+		// Now add new charts.
+		for (uint32_t i = 0; i < chartCount; i++) {
+			CreateAndParameterizeChartTaskArgs &args = taskArgs[i];
+			for (uint32_t j = 0; j < args.charts.size(); j++) {
+				m_charts[current++] = args.charts[j];
 				m_paramAddedChartsCount++;
 			}
-#if XA_DEBUG_EXPORT_OBJ_RECOMPUTED_CHARTS
-			if (file)
-				fclose(file);
-#endif
-		}
-		// Remove and delete the invalid charts.
-		for (uint32_t i = 0; i < invalidCharts.size(); i++) {
-			Chart *chart = invalidCharts[i];
-			removeChart(chart);
-			chart->~Chart();
-			XA_FREE(chart);
-			m_paramDeletedChartsCount++;
 		}
+#else // XA_RECOMPUTE_CHARTS
+		m_charts.resize(chartCount);
+		for (uint32_t i = 0; i < chartCount; i++)
+			m_charts[i] = taskArgs[i].chart;
 #endif // XA_RECOMPUTE_CHARTS
+		taskArgs.runDtors(); // Has Array member.
 	}
 
 private:
-	void buildAtlas(segment::Atlas &atlas, const ChartOptions &options) {
-		if (atlas.facesLeft() == 0)
-			return;
-		// Create initial charts greedely.
-		atlas.placeSeeds(options.maxThreshold * 0.5f);
-		if (options.maxIterations == 0) {
-			XA_DEBUG_ASSERT(atlas.facesLeft() == 0);
-			return;
-		}
-		atlas.relocateSeeds();
-		atlas.resetCharts();
-		// Restart process growing charts in parallel.
-		uint32_t iteration = 0;
-		for (;;) {
-			atlas.growCharts(options.maxThreshold);
-			// When charts cannot grow more: fill holes, merge charts, relocate seeds and start new iteration.
-			atlas.fillHoles(options.maxThreshold * 0.5f);
-#if XA_MERGE_CHARTS
-			atlas.mergeCharts();
-#endif
-			if (++iteration == options.maxIterations)
-				break;
-			if (!atlas.relocateSeeds())
-				break;
-			atlas.resetCharts();
+	Mesh *createMesh()
+	{
+		XA_DEBUG_ASSERT(m_faceGroup != MeshFaceGroups::kInvalid);
+		// Create new mesh from the source mesh, using faces that belong to this group.
+		m_faceToSourceFaceMap.reserve(m_sourceMeshFaceGroups->faceCount(m_faceGroup));
+		for (MeshFaceGroups::Iterator it(m_sourceMeshFaceGroups, m_faceGroup); !it.isDone(); it.advance())
+			m_faceToSourceFaceMap.push_back(it.face());
+		// Only initial meshes have ignored faces. The only flag we care about is HasNormals.
+		const uint32_t faceCount = m_faceCount = m_faceToSourceFaceMap.size();
+		XA_DEBUG_ASSERT(faceCount > 0);
+		const uint32_t approxVertexCount = min(faceCount * 3, m_sourceMesh->vertexCount());
+		Mesh *mesh = XA_NEW_ARGS(MemTag::Mesh, Mesh, m_sourceMesh->epsilon(), approxVertexCount, faceCount, m_sourceMesh->flags() & MeshFlags::HasNormals);
+		HashMap<uint32_t, PassthroughHash<uint32_t>> sourceVertexToVertexMap(MemTag::Mesh, approxVertexCount);
+		for (uint32_t f = 0; f < faceCount; f++) {
+			const uint32_t face = m_faceToSourceFaceMap[f];
+			for (uint32_t i = 0; i < 3; i++) {
+				const uint32_t vertex = m_sourceMesh->vertexAt(face * 3 + i);
+				if (sourceVertexToVertexMap.get(vertex) == UINT32_MAX) {
+					sourceVertexToVertexMap.add(vertex);
+					Vector3 normal(0.0f);
+					if (m_sourceMesh->flags() & MeshFlags::HasNormals)
+						normal = m_sourceMesh->normal(vertex);
+					mesh->addVertex(m_sourceMesh->position(vertex), normal, m_sourceMesh->texcoord(vertex));
+				}
+			}
 		}
-		// Make sure no holes are left!
-		XA_DEBUG_ASSERT(atlas.facesLeft() == 0);
-	}
-
-	void removeChart(const Chart *chart) {
-		for (uint32_t i = 0; i < m_charts.size(); i++) {
-			if (m_charts[i] == chart) {
-				m_charts.removeAt(i);
-				return;
+		// Add faces.
+		for (uint32_t f = 0; f < faceCount; f++) {
+			const uint32_t face = m_faceToSourceFaceMap[f];
+			XA_DEBUG_ASSERT(!m_sourceMesh->isFaceIgnored(face));
+			uint32_t indices[3];
+			for (uint32_t i = 0; i < 3; i++) {
+				const uint32_t vertex = m_sourceMesh->vertexAt(face * 3 + i);
+				indices[i] = sourceVertexToVertexMap.get(vertex);
+				XA_DEBUG_ASSERT(indices[i] != UINT32_MAX);
 			}
+			// Don't copy flags - ignored faces aren't used by chart groups, they are handled by InvalidMeshGeometry.
+			Mesh::AddFaceResult::Enum result = mesh->addFace(indices);
+			XA_UNUSED(result);
+			XA_DEBUG_ASSERT(result == Mesh::AddFaceResult::OK);
 		}
+		XA_PROFILE_START(createChartGroupMeshColocals)
+		mesh->createColocals();
+		XA_PROFILE_END(createChartGroupMeshColocals)
+		XA_PROFILE_START(createChartGroupMeshBoundaries)
+		mesh->createBoundaries();
+		mesh->destroyEdgeMap(); // Only needed it for createBoundaries.
+		XA_PROFILE_END(createChartGroupMeshBoundaries)
+#if XA_DEBUG_EXPORT_OBJ_CHART_GROUPS
+		char filename[256];
+		XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u_chartgroup_%03u.obj", m_sourceMesh->id(), m_id);
+		mesh->writeObjFile(filename);
+#endif
+		return mesh;
 	}
 
-	uint32_t m_sourceId, m_id;
-	bool m_isVertexMap;
-	Mesh *m_mesh;
+	const uint32_t m_id;
+	const Mesh * const m_sourceMesh;
+	const MeshFaceGroups * const m_sourceMeshFaceGroups;
+	const MeshFaceGroups::Handle m_faceGroup;
 	Array<uint32_t> m_faceToSourceFaceMap; // List of faces of the source mesh that belong to this chart group.
-	Array<uint32_t> m_vertexToSourceVertexMap; // Map vertices of the mesh to vertices of the source mesh.
+	Array<Basis> m_chartBasis; // Copied from segment::Atlas.
+	Array<uint32_t> m_chartFaces; // Copied from segment::Atlas. Encoding: <chart 0 face count> <face 0> <face n> <chart 1 face count> etc.
 	Array<Chart *> m_charts;
-	ChartOptions m_chartOptions;
+	uint32_t m_faceCount; // Set by createMesh(). Used for sorting.
 	uint32_t m_paramAddedChartsCount; // Number of new charts added by recomputing charts with invalid parameterizations.
 	uint32_t m_paramDeletedChartsCount; // Number of charts with invalid parameterizations that were deleted, after charts were recomputed.
 };
 
-struct CreateChartGroupTaskArgs {
-	uint16_t faceGroup;
-	uint32_t groupId;
-	const Mesh *mesh;
-	ChartGroup **chartGroup;
+// References invalid faces and vertices in a mesh.
+struct InvalidMeshGeometry
+{
+	// Invalid faces have the face group MeshFaceGroups::kInvalid.
+	void extract(const Mesh *mesh, const MeshFaceGroups *meshFaceGroups)
+	{
+		// Copy invalid faces.
+		m_faces.clear();
+		const uint32_t meshFaceCount = mesh->faceCount();
+		for (uint32_t f = 0; f < meshFaceCount; f++) {
+			if (meshFaceGroups->groupAt(f) == MeshFaceGroups::kInvalid)
+				m_faces.push_back(f);
+		}
+		// Create *unique* list of vertices of invalid faces.
+		const uint32_t faceCount = m_faces.size();
+		m_indices.resize(faceCount * 3);
+		const uint32_t approxVertexCount = min(faceCount * 3, mesh->vertexCount());
+		m_vertexToSourceVertexMap.clear();
+		m_vertexToSourceVertexMap.reserve(approxVertexCount);
+		HashMap<uint32_t, PassthroughHash<uint32_t>> sourceVertexToVertexMap(MemTag::Mesh, approxVertexCount);
+		for (uint32_t f = 0; f < faceCount; f++) {
+			const uint32_t face = m_faces[f];
+			for (uint32_t i = 0; i < 3; i++) {
+				const uint32_t vertex = mesh->vertexAt(face * 3 + i);
+				uint32_t newVertex = sourceVertexToVertexMap.get(vertex);
+				if (newVertex == UINT32_MAX) {
+					newVertex = sourceVertexToVertexMap.add(vertex);
+					m_vertexToSourceVertexMap.push_back(vertex);
+				}
+				m_indices[f * 3 + i] = newVertex;
+			}
+		}
+	}
+
+	ConstArrayView<uint32_t> faces() const { return m_faces; }
+	ConstArrayView<uint32_t> indices() const { return m_indices; }
+	ConstArrayView<uint32_t> vertices() const { return m_vertexToSourceVertexMap; }
+
+private:
+	Array<uint32_t> m_faces, m_indices;
+	Array<uint32_t> m_vertexToSourceVertexMap; // Map face vertices to vertices of the source mesh.
 };
 
-static void runCreateChartGroupTask(void *userData) {
-	XA_PROFILE_START(addMeshCreateChartGroupsThread)
-	auto args = (CreateChartGroupTaskArgs *)userData;
-	*(args->chartGroup) = XA_NEW_ARGS(MemTag::Default, ChartGroup, args->groupId, args->mesh, args->faceGroup);
-	XA_PROFILE_END(addMeshCreateChartGroupsThread)
+struct ChartGroupComputeChartFacesTaskArgs
+{
+	ThreadLocal<segment::Atlas> *atlas;
+	ChartGroup *chartGroup;
+	const ChartOptions *options;
+	Progress *progress;
+};
+
+static void runChartGroupComputeChartFacesJob(void *userData)
+{
+	auto args = (ChartGroupComputeChartFacesTaskArgs *)userData;
+	if (args->progress->cancel)
+		return;
+	XA_PROFILE_START(chartGroupComputeChartsThread)
+	args->chartGroup->computeChartFaces(*args->options, args->atlas->get());
+	XA_PROFILE_END(chartGroupComputeChartsThread)
 }
 
-struct ComputeChartsTaskArgs {
-	TaskScheduler *taskScheduler;
-	ChartGroup *chartGroup;
+struct MeshComputeChartFacesTaskArgs
+{
+	Array<ChartGroup *> *chartGroups; // output
+	InvalidMeshGeometry *invalidMeshGeometry; // output
 	ThreadLocal<segment::Atlas> *atlas;
-	ThreadLocal<ChartCtorBuffers> *chartBuffers;
 	const ChartOptions *options;
 	Progress *progress;
+	const Mesh *sourceMesh;
+	TaskScheduler *taskScheduler;
 };
 
-static void runComputeChartsJob(void *userData) {
-	auto args = (ComputeChartsTaskArgs *)userData;
+#if XA_DEBUG_EXPORT_OBJ_FACE_GROUPS
+static uint32_t s_faceGroupsCurrentVertex = 0;
+#endif
+
+static void runMeshComputeChartFacesJob(void *userData)
+{
+	auto args = (MeshComputeChartFacesTaskArgs *)userData;
 	if (args->progress->cancel)
 		return;
 	XA_PROFILE_START(computeChartsThread)
-	args->chartGroup->computeCharts(args->taskScheduler, *args->options, args->atlas->get(), args->chartBuffers);
+	// Create face groups.
+	XA_PROFILE_START(createFaceGroups)
+	MeshFaceGroups *meshFaceGroups = XA_NEW_ARGS(MemTag::Mesh, MeshFaceGroups, args->sourceMesh);
+	meshFaceGroups->compute();
+	const uint32_t chartGroupCount = meshFaceGroups->groupCount();
+	XA_PROFILE_END(createFaceGroups)
+	if (args->progress->cancel)
+		goto cleanup;
+#if XA_DEBUG_EXPORT_OBJ_FACE_GROUPS
+	{
+		static std::mutex s_mutex;
+		std::lock_guard<std::mutex> lock(s_mutex);
+		char filename[256];
+		XA_SPRINTF(filename, sizeof(filename), "debug_face_groups.obj");
+		FILE *file;
+		XA_FOPEN(file, filename, s_faceGroupsCurrentVertex == 0 ? "w" : "a");
+		if (file) {
+			const Mesh *mesh = args->sourceMesh;
+			mesh->writeObjVertices(file);
+			// groups
+			uint32_t numGroups = 0;
+			for (uint32_t i = 0; i < mesh->faceCount(); i++) {
+				if (meshFaceGroups->groupAt(i) != MeshFaceGroups::kInvalid)
+					numGroups = max(numGroups, meshFaceGroups->groupAt(i) + 1);
+			}
+			for (uint32_t i = 0; i < numGroups; i++) {
+				fprintf(file, "o mesh_%03u_group_%04d\n", mesh->id(), i);
+				fprintf(file, "s off\n");
+				for (uint32_t f = 0; f < mesh->faceCount(); f++) {
+					if (meshFaceGroups->groupAt(f) == i)
+						mesh->writeObjFace(file, f, s_faceGroupsCurrentVertex);
+				}
+			}
+			fprintf(file, "o mesh_%03u_group_ignored\n", mesh->id());
+			fprintf(file, "s off\n");
+			for (uint32_t f = 0; f < mesh->faceCount(); f++) {
+				if (meshFaceGroups->groupAt(f) == MeshFaceGroups::kInvalid)
+					mesh->writeObjFace(file, f, s_faceGroupsCurrentVertex);
+			}
+			mesh->writeObjBoundaryEges(file);
+			s_faceGroupsCurrentVertex += mesh->vertexCount();
+			fclose(file);
+		}
+	}
+#endif
+	// Create a chart group for each face group.
+	args->chartGroups->resize(chartGroupCount);
+	for (uint32_t i = 0; i < chartGroupCount; i++)
+		(*args->chartGroups)[i] = XA_NEW_ARGS(MemTag::Default, ChartGroup, i, args->sourceMesh, meshFaceGroups, MeshFaceGroups::Handle(i));
+	// Extract invalid geometry via the invalid face group (MeshFaceGroups::kInvalid).
+	XA_PROFILE_START(extractInvalidMeshGeometry)
+	args->invalidMeshGeometry->extract(args->sourceMesh, meshFaceGroups);
+	XA_PROFILE_END(extractInvalidMeshGeometry)
+	// One task for each chart group - compute chart faces.
+	{
+		XA_PROFILE_START(chartGroupComputeChartsReal)
+		Array<ChartGroupComputeChartFacesTaskArgs> taskArgs;
+		taskArgs.resize(chartGroupCount);
+		for (uint32_t i = 0; i < chartGroupCount; i++) {
+			taskArgs[i].atlas = args->atlas;
+			taskArgs[i].chartGroup = (*args->chartGroups)[i];
+			taskArgs[i].options = args->options;
+			taskArgs[i].progress = args->progress;
+		}
+		TaskGroupHandle taskGroup = args->taskScheduler->createTaskGroup(chartGroupCount);
+		for (uint32_t i = 0; i < chartGroupCount; i++) {
+			Task task;
+			task.userData = &taskArgs[i];
+			task.func = runChartGroupComputeChartFacesJob;
+			args->taskScheduler->run(taskGroup, task);
+		}
+		args->taskScheduler->wait(&taskGroup);
+		XA_PROFILE_END(chartGroupComputeChartsReal)
+	}
 	XA_PROFILE_END(computeChartsThread)
 	args->progress->value++;
 	args->progress->update();
+cleanup:
+	if (meshFaceGroups) {
+		meshFaceGroups->~MeshFaceGroups();
+		XA_FREE(meshFaceGroups);
+	}
 }
 
-struct ParameterizeChartsTaskArgs {
+struct ParameterizeChartsTaskArgs
+{
 	TaskScheduler *taskScheduler;
 	ChartGroup *chartGroup;
-	ParameterizeFunc func;
+	const ParameterizeOptions *options;
 	ThreadLocal<UniformGrid2> *boundaryGrid;
 	ThreadLocal<ChartCtorBuffers> *chartBuffers;
 #if XA_RECOMPUTE_CHARTS
@@ -7069,15 +8100,16 @@ struct ParameterizeChartsTaskArgs {
 	Progress *progress;
 };
 
-static void runParameterizeChartsJob(void *userData) {
+static void runParameterizeChartsJob(void *userData)
+{
 	auto args = (ParameterizeChartsTaskArgs *)userData;
 	if (args->progress->cancel)
 		return;
 	XA_PROFILE_START(parameterizeChartsThread)
 #if XA_RECOMPUTE_CHARTS
-	args->chartGroup->parameterizeCharts(args->taskScheduler, args->func, args->boundaryGrid, args->chartBuffers, args->piecewiseParam);
+	args->chartGroup->parameterizeCharts(args->taskScheduler, *args->options, args->boundaryGrid, args->chartBuffers, args->piecewiseParam);
 #else
-	args->chartGroup->parameterizeCharts(args->taskScheduler, args->func, args->boundaryGrid, args->chartBuffers);
+	args->chartGroup->parameterizeCharts(args->taskScheduler, *args->options, args->boundaryGrid, args->chartBuffers);
 #endif
 	XA_PROFILE_END(parameterizeChartsThread)
 	args->progress->value++;
@@ -7085,137 +8117,86 @@ static void runParameterizeChartsJob(void *userData) {
 }
 
 /// An atlas is a set of chart groups.
-class Atlas {
+class Atlas
+{
 public:
-	Atlas() :
-			m_meshCount(0), m_chartsComputed(false), m_chartsParameterized(false) {}
+	Atlas() : m_chartsComputed(false), m_chartsParameterized(false) {}
 
-	~Atlas() {
-		for (uint32_t i = 0; i < m_chartGroups.size(); i++) {
-			m_chartGroups[i]->~ChartGroup();
-			XA_FREE(m_chartGroups[i]);
+	~Atlas()
+	{
+		for (uint32_t i = 0; i < m_meshChartGroups.size(); i++) {
+			for (uint32_t j = 0; j < m_meshChartGroups[i].size(); j++) {
+				m_meshChartGroups[i][j]->~ChartGroup();
+				XA_FREE(m_meshChartGroups[i][j]);
+			}
 		}
+		m_meshChartGroups.runDtors();
+		m_invalidMeshGeometry.runDtors();
 	}
 
+	uint32_t meshCount() const { return m_meshes.size(); } // Number of meshes registered through addMesh().
+	const InvalidMeshGeometry &invalidMeshGeometry(uint32_t meshIndex) const { return m_invalidMeshGeometry[meshIndex]; } // Per-mesh entry; the array is sized to the mesh count in computeCharts().
 	bool chartsComputed() const { return m_chartsComputed; }
 	bool chartsParameterized() const { return m_chartsParameterized; }
-	uint32_t chartGroupCount() const { return m_chartGroups.size(); }
-	const ChartGroup *chartGroupAt(uint32_t index) const { return m_chartGroups[index]; }
-
-	uint32_t chartGroupCount(uint32_t mesh) const {
-		uint32_t count = 0;
-		for (uint32_t i = 0; i < m_chartGroups.size(); i++) {
-			if (m_chartGroupSourceMeshes[i] == mesh)
-				count++;
-		}
-		return count;
-	}
-
-	const ChartGroup *chartGroupAt(uint32_t mesh, uint32_t group) const {
-		for (uint32_t c = 0; c < m_chartGroups.size(); c++) {
-			if (m_chartGroupSourceMeshes[c] != mesh)
-				continue;
-			if (group == 0)
-				return m_chartGroups[c];
-			group--;
-		}
-		return nullptr;
-	}
+	uint32_t chartGroupCount(uint32_t mesh) const { return m_meshChartGroups[mesh].size(); } // Number of chart groups stored for mesh index `mesh`.
+	const ChartGroup *chartGroupAt(uint32_t mesh, uint32_t group) const { return m_meshChartGroups[mesh][group]; } // Indexed [mesh][group within that mesh].
 
-	// This function is thread safe.
-	void addMesh(TaskScheduler *taskScheduler, const Mesh *mesh) {
-		// Create one chart group per face group.
-		// If there's any ignored faces in the mesh, create an extra face group for that (vertex map).
-		// Chart group creation is slow since it copies a chunk of the source mesh, so use tasks.
-		Array<ChartGroup *> chartGroups;
-		chartGroups.resize(mesh->faceGroupCount() + (mesh->ignoredFaceCount() > 0 ? 1 : 0));
-		Array<CreateChartGroupTaskArgs> taskArgs;
-		taskArgs.resize(chartGroups.size());
-		for (uint32_t g = 0; g < chartGroups.size(); g++) {
-			CreateChartGroupTaskArgs &args = taskArgs[g];
-			args.chartGroup = &chartGroups[g];
-			args.faceGroup = uint16_t(g < mesh->faceGroupCount() ? g : Mesh::kInvalidFaceGroup);
-			args.groupId = g;
-			args.mesh = mesh;
-		}
-		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartGroups.size());
-		for (uint32_t g = 0; g < chartGroups.size(); g++) {
-			Task task;
-			task.userData = &taskArgs[g];
-			task.func = runCreateChartGroupTask;
-			taskScheduler->run(taskGroup, task);
-		}
-		taskScheduler->wait(&taskGroup);
-		// Thread-safe append.
-		m_addMeshMutex.lock();
-		for (uint32_t g = 0; g < chartGroups.size(); g++) {
-			m_chartGroups.push_back(chartGroups[g]);
-			m_chartGroupSourceMeshes.push_back(mesh->id());
-		}
-		m_meshCount++;
-		m_addMeshMutex.unlock();
-	}
-
-	// Chart id/index is determined by depth-first hierarchy of mesh -> chart group -> chart.
-	// For chart index to be consistent here, chart groups needs to sorted by mesh index. Since addMesh is called by multithreaded tasks, order is indeterminate, so chart groups need to be explicitly sorted after all meshes are added.
-	void sortChartGroups() {
-		Array<ChartGroup *> oldChartGroups;
-		oldChartGroups.resize(m_chartGroups.size());
-		memcpy(oldChartGroups.data(), m_chartGroups.data(), sizeof(ChartGroup *) * m_chartGroups.size());
-		Array<uint32_t> oldChartGroupSourceMeshes;
-		oldChartGroupSourceMeshes.resize(m_chartGroupSourceMeshes.size());
-		memcpy(oldChartGroupSourceMeshes.data(), m_chartGroupSourceMeshes.data(), sizeof(uint32_t) * m_chartGroupSourceMeshes.size());
-		uint32_t current = 0;
-		for (uint32_t i = 0; i < m_meshCount; i++) {
-			for (uint32_t j = 0; j < oldChartGroups.size(); j++) {
-				if (oldChartGroupSourceMeshes[j] == i) {
-					m_chartGroups[current] = oldChartGroups[j];
-					m_chartGroupSourceMeshes[current] = oldChartGroupSourceMeshes[j];
-					current++;
-				}
-			}
-		}
+	void addMesh(const Mesh *mesh) // Registers a source mesh; its index is its position in m_meshes.
+	{
+		m_meshes.push_back(mesh); // Only the pointer is stored; no locking or per-mesh work is done here.
 	}
 
-	bool computeCharts(TaskScheduler *taskScheduler, const ChartOptions &options, ProgressFunc progressFunc, void *progressUserData) {
+	bool computeCharts(TaskScheduler *taskScheduler, const ChartOptions &options, ProgressFunc progressFunc, void *progressUserData)
+	{
+#if XA_DEBUG_EXPORT_OBJ_PLANAR_REGIONS
+		segment::s_planarRegionsCurrentRegion = segment::s_planarRegionsCurrentVertex = 0;
+#endif
 		m_chartsComputed = false;
 		m_chartsParameterized = false;
-		// Ignore vertex maps.
-		uint32_t chartGroupCount = 0;
-		for (uint32_t i = 0; i < m_chartGroups.size(); i++) {
-			if (!m_chartGroups[i]->isVertexMap())
-				chartGroupCount++;
+		// Clear chart groups, since this function may be called multiple times.
+		if (!m_meshChartGroups.isEmpty()) {
+			for (uint32_t i = 0; i < m_meshChartGroups.size(); i++) {
+				for (uint32_t j = 0; j < m_meshChartGroups[i].size(); j++) {
+					m_meshChartGroups[i][j]->~ChartGroup();
+					XA_FREE(m_meshChartGroups[i][j]);
+				}
+				m_meshChartGroups[i].clear();
+			}
+			XA_ASSERT(m_meshChartGroups.size() == m_meshes.size()); // The number of meshes shouldn't have changed.
 		}
-		Progress progress(ProgressCategory::ComputeCharts, progressFunc, progressUserData, chartGroupCount);
+		m_meshChartGroups.resize(m_meshes.size());
+		m_meshChartGroups.runCtors();
+		m_invalidMeshGeometry.resize(m_meshes.size());
+		m_invalidMeshGeometry.runCtors();
+		// One task per mesh.
+		const uint32_t meshCount = m_meshes.size();
+		Progress progress(ProgressCategory::ComputeCharts, progressFunc, progressUserData, meshCount);
 		ThreadLocal<segment::Atlas> atlas;
-		ThreadLocal<ChartCtorBuffers> chartBuffers;
-		Array<ComputeChartsTaskArgs> taskArgs;
-		taskArgs.reserve(chartGroupCount);
-		for (uint32_t i = 0; i < m_chartGroups.size(); i++) {
-			if (!m_chartGroups[i]->isVertexMap()) {
-				ComputeChartsTaskArgs args;
-				args.taskScheduler = taskScheduler;
-				args.chartGroup = m_chartGroups[i];
-				args.atlas = &atlas;
-				args.chartBuffers = &chartBuffers;
-				args.options = &options;
-				args.progress = &progress;
-				taskArgs.push_back(args);
-			}
-		}
-		// Sort chart groups by mesh indexCount.
-		m_chartGroupsRadix = RadixSort();
-		Array<float> chartGroupSortData;
-		chartGroupSortData.resize(chartGroupCount);
-		for (uint32_t i = 0; i < chartGroupCount; i++)
-			chartGroupSortData[i] = (float)taskArgs[i].chartGroup->mesh()->indexCount();
-		m_chartGroupsRadix.sort(chartGroupSortData);
-		// Larger chart group meshes are added first to reduce the chance of thread starvation.
-		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartGroupCount);
-		for (uint32_t i = 0; i < chartGroupCount; i++) {
+		Array<MeshComputeChartFacesTaskArgs> taskArgs;
+		taskArgs.resize(meshCount);
+		for (uint32_t i = 0; i < meshCount; i++) {
+			MeshComputeChartFacesTaskArgs &args = taskArgs[i];
+			args.atlas = &atlas;
+			args.chartGroups = &m_meshChartGroups[i];
+			args.invalidMeshGeometry = &m_invalidMeshGeometry[i];
+			args.options = &options;
+			args.progress = &progress;
+			args.sourceMesh = m_meshes[i];
+			args.taskScheduler = taskScheduler;
+		}
+		// Sort meshes by indexCount.
+		Array<float> meshSortData;
+		meshSortData.resize(meshCount);
+		for (uint32_t i = 0; i < meshCount; i++)
+			meshSortData[i] = (float)m_meshes[i]->indexCount();
+		RadixSort meshSort;
+		meshSort.sort(meshSortData);
+		// Larger meshes are added first to reduce the chance of thread starvation.
+		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(meshCount);
+		for (uint32_t i = 0; i < meshCount; i++) {
 			Task task;
-			task.userData = &taskArgs[m_chartGroupsRadix.ranks()[chartGroupCount - i - 1]];
-			task.func = runComputeChartsJob;
+			task.userData = &taskArgs[meshSort.ranks()[meshCount - i - 1]];
+			task.func = runMeshComputeChartFacesJob;
 			taskScheduler->run(taskGroup, task);
 		}
 		taskScheduler->wait(&taskGroup);
@@ -7225,14 +8206,12 @@ public:
 		return true;
 	}
 
-	bool parameterizeCharts(TaskScheduler *taskScheduler, ParameterizeFunc func, ProgressFunc progressFunc, void *progressUserData) {
+	bool parameterizeCharts(TaskScheduler *taskScheduler, const ParameterizeOptions &options, ProgressFunc progressFunc, void *progressUserData)
+	{
 		m_chartsParameterized = false;
-		// Ignore vertex maps.
 		uint32_t chartGroupCount = 0;
-		for (uint32_t i = 0; i < m_chartGroups.size(); i++) {
-			if (!m_chartGroups[i]->isVertexMap())
-				chartGroupCount++;
-		}
+		for (uint32_t i = 0; i < m_meshChartGroups.size(); i++)
+			chartGroupCount += m_meshChartGroups[i].size();
 		Progress progress(ProgressCategory::ParameterizeCharts, progressFunc, progressUserData, chartGroupCount);
 		ThreadLocal<UniformGrid2> boundaryGrid; // For Quality boundary intersection.
 		ThreadLocal<ChartCtorBuffers> chartBuffers;
@@ -7240,27 +8219,45 @@ public:
 		ThreadLocal<PiecewiseParam> piecewiseParam;
 #endif
 		Array<ParameterizeChartsTaskArgs> taskArgs;
-		taskArgs.reserve(chartGroupCount);
-		for (uint32_t i = 0; i < m_chartGroups.size(); i++) {
-			if (!m_chartGroups[i]->isVertexMap()) {
-				ParameterizeChartsTaskArgs args;
-				args.taskScheduler = taskScheduler;
-				args.chartGroup = m_chartGroups[i];
-				args.func = func;
-				args.boundaryGrid = &boundaryGrid;
-				args.chartBuffers = &chartBuffers;
+		taskArgs.resize(chartGroupCount);
+		{
+			uint32_t k = 0;
+			for (uint32_t i = 0; i < m_meshChartGroups.size(); i++) {
+				const uint32_t count = m_meshChartGroups[i].size();
+				for (uint32_t j = 0; j < count; j++) {
+					ParameterizeChartsTaskArgs &args = taskArgs[k];
+					args.taskScheduler = taskScheduler;
+					args.chartGroup = m_meshChartGroups[i][j];
+					args.options = &options;
+					args.boundaryGrid = &boundaryGrid;
+					args.chartBuffers = &chartBuffers;
 #if XA_RECOMPUTE_CHARTS
-				args.piecewiseParam = &piecewiseParam;
+					args.piecewiseParam = &piecewiseParam;
 #endif
-				args.progress = &progress;
-				taskArgs.push_back(args);
+					args.progress = &progress;
+					k++;
+				}
 			}
 		}
-		// Larger chart group meshes are added first to reduce the chance of thread starvation.
+		// Sort chart groups by face count.
+		Array<float> chartGroupSortData;
+		chartGroupSortData.resize(chartGroupCount);
+		{
+			uint32_t k = 0;
+			for (uint32_t i = 0; i < m_meshChartGroups.size(); i++) {
+				const uint32_t count = m_meshChartGroups[i].size();
+				for (uint32_t j = 0; j < count; j++) {
+					chartGroupSortData[k++] = (float)m_meshChartGroups[i][j]->faceCount();
+				}
+			}
+		}
+		RadixSort chartGroupSort;
+		chartGroupSort.sort(chartGroupSortData);
+		// Larger chart groups are added first to reduce the chance of thread starvation.
 		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartGroupCount);
 		for (uint32_t i = 0; i < chartGroupCount; i++) {
 			Task task;
-			task.userData = &taskArgs[m_chartGroupsRadix.ranks()[chartGroupCount - i - 1]];
+			task.userData = &taskArgs[chartGroupSort.ranks()[chartGroupCount - i - 1]];
 			task.func = runParameterizeChartsJob;
 			taskScheduler->run(taskGroup, task);
 		}
@@ -7272,28 +8269,28 @@ public:
 	}
 
 private:
-	std::mutex m_addMeshMutex;
-	uint32_t m_meshCount;
+	Array<const Mesh *> m_meshes; // Appended by addMesh(); the pointers are not freed by ~Atlas().
+	Array<InvalidMeshGeometry> m_invalidMeshGeometry; // 1 per mesh.
+	Array<Array<ChartGroup *> > m_meshChartGroups; // Indexed [mesh][chart group]; the chart groups are destroyed and freed by ~Atlas().
 	bool m_chartsComputed;
 	bool m_chartsParameterized;
-	Array<ChartGroup *> m_chartGroups;
-	RadixSort m_chartGroupsRadix; // By mesh indexCount.
-	Array<uint32_t> m_chartGroupSourceMeshes;
 };
 
 } // namespace param
 
 namespace pack {
 
-class AtlasImage {
+class AtlasImage
+{
 public:
-	AtlasImage(uint32_t width, uint32_t height) :
-			m_width(width), m_height(height) {
+	AtlasImage(uint32_t width, uint32_t height) : m_width(width), m_height(height)
+	{
 		m_data.resize(m_width * m_height);
 		memset(m_data.data(), 0, sizeof(uint32_t) * m_data.size());
 	}
 
-	void resize(uint32_t width, uint32_t height) {
+	void resize(uint32_t width, uint32_t height)
+	{
 		Array<uint32_t> data;
 		data.resize(width * height);
 		memset(data.data(), 0, sizeof(uint32_t) * data.size());
@@ -7304,7 +8301,8 @@ public:
 		data.moveTo(m_data);
 	}
 
-	void addChart(uint32_t chartIndex, const BitImage *image, const BitImage *imageBilinear, const BitImage *imagePadding, int atlas_w, int atlas_h, int offset_x, int offset_y) {
+	void addChart(uint32_t chartIndex, const BitImage *image, const BitImage *imageBilinear, const BitImage *imagePadding, int atlas_w, int atlas_h, int offset_x, int offset_y)
+	{
 		const int w = image->width();
 		const int h = image->height();
 		for (int y = 0; y < h; y++) {
@@ -7330,13 +8328,15 @@ public:
 		}
 	}
 
-	void copyTo(uint32_t *dest, uint32_t destWidth, uint32_t destHeight, int padding) const {
+	void copyTo(uint32_t *dest, uint32_t destWidth, uint32_t destHeight, int padding) const
+	{
 		for (uint32_t y = 0; y < destHeight; y++)
 			memcpy(&dest[y * destWidth], &m_data[padding + (y + padding) * m_width], destWidth * sizeof(uint32_t));
 	}
 
 #if XA_DEBUG_EXPORT_ATLAS_IMAGES
-	void writeTga(const char *filename, uint32_t width, uint32_t height) const {
+	void writeTga(const char *filename, uint32_t width, uint32_t height) const
+	{
 		Array<uint8_t> image;
 		image.resize(width * height * 3);
 		for (uint32_t y = 0; y < height; y++) {
@@ -7378,7 +8378,8 @@ private:
 	Array<uint32_t> m_data;
 };
 
-struct Chart {
+struct Chart
+{
 	int32_t atlasIndex;
 	uint32_t material;
 	uint32_t indexCount;
@@ -7400,18 +8401,20 @@ struct Chart {
 	uint32_t uniqueVertexCount() const { return uniqueVertices.isEmpty() ? vertexCount : uniqueVertices.size(); }
 };
 
-struct AddChartTaskArgs {
+struct AddChartTaskArgs
+{
 	ThreadLocal<BoundingBox2D> *boundingBox;
 	param::Chart *paramChart;
 	Chart *chart; // out
 };
 
-static void runAddChartTask(void *userData) {
+static void runAddChartTask(void *userData)
+{
 	XA_PROFILE_START(packChartsAddChartsThread)
 	auto args = (AddChartTaskArgs *)userData;
 	param::Chart *paramChart = args->paramChart;
 	XA_PROFILE_START(packChartsAddChartsRestoreTexcoords)
-	paramChart->transferParameterization();
+	paramChart->restoreTexcoords();
 	XA_PROFILE_END(packChartsAddChartsRestoreTexcoords)
 	Mesh *mesh = paramChart->mesh();
 	Chart *chart = args->chart = XA_NEW(MemTag::Default, Chart);
@@ -7445,8 +8448,10 @@ static void runAddChartTask(void *userData) {
 	XA_PROFILE_END(packChartsAddChartsThread)
 }
 
-struct Atlas {
-	~Atlas() {
+struct Atlas
+{
+	~Atlas()
+	{
 		for (uint32_t i = 0; i < m_atlasImages.size(); i++) {
 			m_atlasImages[i]->~AtlasImage();
 			XA_FREE(m_atlasImages[i]);
@@ -7470,15 +8475,16 @@ struct Atlas {
 	const Array<AtlasImage *> &getImages() const { return m_atlasImages; }
 	float getUtilization(uint32_t atlas) const { return m_utilization[atlas]; }
 
-	void addCharts(TaskScheduler *taskScheduler, param::Atlas *paramAtlas) {
+	void addCharts(TaskScheduler *taskScheduler, param::Atlas *paramAtlas)
+	{
 		// Count charts.
 		uint32_t chartCount = 0;
-		const uint32_t chartGroupsCount = paramAtlas->chartGroupCount();
-		for (uint32_t i = 0; i < chartGroupsCount; i++) {
-			const param::ChartGroup *chartGroup = paramAtlas->chartGroupAt(i);
-			if (chartGroup->isVertexMap())
-				continue;
-			chartCount += chartGroup->chartCount();
+		for (uint32_t i = 0; i < paramAtlas->meshCount(); i++) {
+			const uint32_t chartGroupsCount = paramAtlas->chartGroupCount(i);
+			for (uint32_t j = 0; j < chartGroupsCount; j++) {
+				const param::ChartGroup *chartGroup = paramAtlas->chartGroupAt(i, j);
+				chartCount += chartGroup->chartCount();
+			}
 		}
 		if (chartCount == 0)
 			return;
@@ -7488,20 +8494,21 @@ struct Atlas {
 		TaskGroupHandle taskGroup = taskScheduler->createTaskGroup(chartCount);
 		uint32_t chartIndex = 0;
 		ThreadLocal<BoundingBox2D> boundingBox;
-		for (uint32_t i = 0; i < chartGroupsCount; i++) {
-			const param::ChartGroup *chartGroup = paramAtlas->chartGroupAt(i);
-			if (chartGroup->isVertexMap())
-				continue;
-			const uint32_t count = chartGroup->chartCount();
-			for (uint32_t j = 0; j < count; j++) {
-				AddChartTaskArgs &args = taskArgs[chartIndex];
-				args.boundingBox = &boundingBox;
-				args.paramChart = chartGroup->chartAt(j);
-				Task task;
-				task.userData = &taskArgs[chartIndex];
-				task.func = runAddChartTask;
-				taskScheduler->run(taskGroup, task);
-				chartIndex++;
+		for (uint32_t i = 0; i < paramAtlas->meshCount(); i++) {
+			const uint32_t chartGroupsCount = paramAtlas->chartGroupCount(i);
+			for (uint32_t j = 0; j < chartGroupsCount; j++) {
+				const param::ChartGroup *chartGroup = paramAtlas->chartGroupAt(i, j);
+				const uint32_t count = chartGroup->chartCount();
+				for (uint32_t k = 0; k < count; k++) {
+					AddChartTaskArgs &args = taskArgs[chartIndex];
+					args.boundingBox = &boundingBox;
+					args.paramChart = chartGroup->chartAt(k);
+					Task task;
+					task.userData = &taskArgs[chartIndex];
+					task.func = runAddChartTask;
+					taskScheduler->run(taskGroup, task);
+					chartIndex++;
+				}
 			}
 		}
 		taskScheduler->wait(&taskGroup);
@@ -7511,7 +8518,8 @@ struct Atlas {
 			m_charts[i] = taskArgs[i].chart;
 	}
 
-	void addUvMeshCharts(UvMeshInstance *mesh) {
+	void addUvMeshCharts(UvMeshInstance *mesh)
+	{
 		BitArray vertexUsed(mesh->texcoords.size());
 		BoundingBox2D boundingBox;
 		for (uint32_t c = 0; c < mesh->mesh->charts.size(); c++) {
@@ -7572,7 +8580,8 @@ struct Atlas {
 	}
 
 	// Pack charts in the smallest possible rectangle.
-	bool packCharts(const PackOptions &options, ProgressFunc progressFunc, void *progressUserData) {
+	bool packCharts(const PackOptions &options, ProgressFunc progressFunc, void *progressUserData)
+	{
 		if (progressFunc) {
 			if (!progressFunc(ProgressCategory::PackCharts, 0, progressUserData))
 				return false;
@@ -7699,7 +8708,6 @@ struct Atlas {
 			maxChartPerimeter = max(maxChartPerimeter, chartOrderArray[c]);
 		}
 		// Sort charts by perimeter.
-		m_radix = RadixSort();
 		m_radix.sort(chartOrderArray);
 		const uint32_t *ranks = m_radix.ranks();
 		// Divide chart perimeter range into buckets.
@@ -7807,7 +8815,8 @@ struct Atlas {
 			int best_x = 0, best_y = 0;
 			int best_cw = 0, best_ch = 0;
 			int best_r = 0;
-			for (;;) {
+			for (;;)
+			{
 				bool firstChartInBitImage = false;
 				XA_UNUSED(firstChartInBitImage);
 				if (currentAtlas + 1 > m_bitImages.size()) {
@@ -7840,7 +8849,8 @@ struct Atlas {
 				if (best_x + best_cw > atlasSizes[currentAtlas].x || best_y + best_ch > atlasSizes[currentAtlas].y) {
 					for (uint32_t j = 0; j < chartStartPositions.size(); j++)
 						chartStartPositions[j] = Vector2i(0, 0);
-				} else {
+				}
+				else {
 					chartStartPositions[currentAtlas] = Vector2i(best_x, best_y);
 				}
 			}
@@ -7892,10 +8902,8 @@ struct Atlas {
 				}
 				texcoord.x = best_x + t.x;
 				texcoord.y = best_y + t.y;
-				if (!options.blockAlign) {
-					texcoord.x -= (float)options.padding;
-					texcoord.y -= (float)options.padding;
-				}
+				texcoord.x -= (float)options.padding;
+				texcoord.y -= (float)options.padding;
 				XA_ASSERT(texcoord.x >= 0 && texcoord.y >= 0);
 				XA_ASSERT(isFinite(texcoord.x) && isFinite(texcoord.y));
 			}
@@ -7908,21 +8916,12 @@ struct Atlas {
 				}
 			}
 		}
-		if (options.blockAlign) {
-			if (maxResolution == 0) {
-				m_width = max(0, atlasSizes[0].x);
-				m_height = max(0, atlasSizes[0].y);
-			} else {
-				m_width = m_height = maxResolution;
-			}
+		// Remove padding from outer edges.
+		if (maxResolution == 0) {
+			m_width = max(0, atlasSizes[0].x - (int)options.padding * 2);
+			m_height = max(0, atlasSizes[0].y - (int)options.padding * 2);
 		} else {
-			// Remove padding from outer edges.
-			if (maxResolution == 0) {
-				m_width = max(0, atlasSizes[0].x - (int)options.padding * 2);
-				m_height = max(0, atlasSizes[0].y - (int)options.padding * 2);
-			} else {
-				m_width = m_height = maxResolution - (int)options.padding * 2;
-			}
+			m_width = m_height = maxResolution - (int)options.padding * 2;
 		}
 		XA_PRINT("   %dx%d resolution\n", m_width, m_height);
 		m_utilization.resize(m_bitImages.size());
@@ -7939,7 +8938,8 @@ struct Atlas {
 			}
 			if (m_utilization.size() > 1) {
 				XA_PRINT("   %u: %f%% utilization\n", i, m_utilization[i] * 100.0f);
-			} else {
+			}
+			else {
 				XA_PRINT("   %f%% utilization\n", m_utilization[i] * 100.0f);
 			}
 		}
@@ -7962,14 +8962,16 @@ private:
 	// is occupied at this point. At the end we have many small charts and a large atlas with sparse holes. Finding those holes randomly is slow. A better approach would be to
 	// start stacking large charts as if they were tetris pieces. Once charts get small try to place them randomly. It may be interesting to try a intermediate strategy, first try
 	// along one axis and then try exhaustively along that axis.
-	bool findChartLocation(const Vector2i &startPosition, bool bruteForce, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, bool blockAligned, uint32_t maxResolution, bool allowRotate) {
+	bool findChartLocation(const Vector2i &startPosition, bool bruteForce, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, bool blockAligned, uint32_t maxResolution, bool allowRotate)
+	{
 		const int attempts = 4096;
 		if (bruteForce || attempts >= w * h)
 			return findChartLocation_bruteForce(startPosition, atlasBitImage, chartBitImage, chartBitImageRotated, w, h, best_x, best_y, best_w, best_h, best_r, blockAligned, maxResolution, allowRotate);
 		return findChartLocation_random(atlasBitImage, chartBitImage, chartBitImageRotated, w, h, best_x, best_y, best_w, best_h, best_r, attempts, blockAligned, maxResolution, allowRotate);
 	}
 
-	bool findChartLocation_bruteForce(const Vector2i &startPosition, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, bool blockAligned, uint32_t maxResolution, bool allowRotate) {
+	bool findChartLocation_bruteForce(const Vector2i &startPosition, const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, bool blockAligned, uint32_t maxResolution, bool allowRotate)
+	{
 		const int stepSize = blockAligned ? 4 : 1;
 		int best_metric = INT_MAX;
 		// Try two different orientations.
@@ -8014,7 +9016,8 @@ private:
 		return best_metric != INT_MAX;
 	}
 
-	bool findChartLocation_random(const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, int minTrialCount, bool blockAligned, uint32_t maxResolution, bool allowRotate) {
+	bool findChartLocation_random(const BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int w, int h, int *best_x, int *best_y, int *best_w, int *best_h, int *best_r, int minTrialCount, bool blockAligned, uint32_t maxResolution, bool allowRotate)
+	{
 		bool result = false;
 		const int BLOCK_SIZE = 4;
 		int best_metric = INT_MAX;
@@ -8069,7 +9072,8 @@ private:
 		return result;
 	}
 
-	void addChart(BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int atlas_w, int atlas_h, int offset_x, int offset_y, int r) {
+	void addChart(BitImage *atlasBitImage, const BitImage *chartBitImage, const BitImage *chartBitImageRotated, int atlas_w, int atlas_h, int offset_x, int offset_y, int r)
+	{
 		XA_DEBUG_ASSERT(r == 0 || r == 1);
 		const BitImage *image = r == 0 ? chartBitImage : chartBitImageRotated;
 		const int w = image->width();
@@ -8092,7 +9096,8 @@ private:
 		}
 	}
 
-	void bilinearExpand(const Chart *chart, BitImage *source, BitImage *dest, BitImage *destRotated, UniformGrid2 &boundaryEdgeGrid) const {
+	void bilinearExpand(const Chart *chart, BitImage *source, BitImage *dest, BitImage *destRotated, UniformGrid2 &boundaryEdgeGrid) const
+	{
 		boundaryEdgeGrid.reset(chart->vertices, chart->indices);
 		if (chart->boundaryEdges) {
 			const uint32_t edgeCount = chart->boundaryEdges->size();
@@ -8147,11 +9152,13 @@ private:
 		}
 	}
 
-	struct DrawTriangleCallbackArgs {
+	struct DrawTriangleCallbackArgs
+	{
 		BitImage *chartBitImage, *chartBitImageRotated;
 	};
 
-	static bool drawTriangleCallback(void *param, int x, int y) {
+	static bool drawTriangleCallback(void *param, int x, int y)
+	{
 		auto args = (DrawTriangleCallbackArgs *)param;
 		args->chartBitImage->set(x, y);
 		if (args->chartBitImageRotated)
@@ -8173,48 +9180,52 @@ private:
 } // namespace pack
 } // namespace internal
 
-struct Context {
+struct Context
+{
 	Atlas atlas;
-	uint32_t meshCount = 0;
 	internal::Progress *addMeshProgress = nullptr;
 	internal::TaskGroupHandle addMeshTaskGroup;
 	internal::param::Atlas paramAtlas;
 	ProgressFunc progressFunc = nullptr;
 	void *progressUserData = nullptr;
 	internal::TaskScheduler *taskScheduler;
+	internal::Array<internal::Mesh *> meshes;
 	internal::Array<internal::UvMesh *> uvMeshes;
 	internal::Array<internal::UvMeshInstance *> uvMeshInstances;
 };
 
-Atlas *Create() {
+Atlas *Create()
+{
 	Context *ctx = XA_NEW(internal::MemTag::Default, Context);
 	memset(&ctx->atlas, 0, sizeof(Atlas));
 	ctx->taskScheduler = XA_NEW(internal::MemTag::Default, internal::TaskScheduler);
 	return &ctx->atlas;
 }
 
-static void DestroyOutputMeshes(Context *ctx) {
+static void DestroyOutputMeshes(Context *ctx)
+{
 	if (!ctx->atlas.meshes)
 		return;
 	for (int i = 0; i < (int)ctx->atlas.meshCount; i++) {
 		Mesh &mesh = ctx->atlas.meshes[i];
-		for (uint32_t j = 0; j < mesh.chartCount; j++) {
-			if (mesh.chartArray[j].faceArray)
-				XA_FREE(mesh.chartArray[j].faceArray);
-		}
-		if (mesh.chartArray)
+		if (mesh.chartArray) {
+			for (uint32_t j = 0; j < mesh.chartCount; j++) {
+				if (mesh.chartArray[j].faceArray)
+					XA_FREE(mesh.chartArray[j].faceArray);
+			}
 			XA_FREE(mesh.chartArray);
+		}
 		if (mesh.vertexArray)
 			XA_FREE(mesh.vertexArray);
 		if (mesh.indexArray)
 			XA_FREE(mesh.indexArray);
 	}
-	if (ctx->atlas.meshes)
-		XA_FREE(ctx->atlas.meshes);
+	XA_FREE(ctx->atlas.meshes);
 	ctx->atlas.meshes = nullptr;
 }
 
-void Destroy(Atlas *atlas) {
+void Destroy(Atlas *atlas)
+{
 	XA_DEBUG_ASSERT(atlas);
 	Context *ctx = (Context *)atlas;
 	if (atlas->utilization)
@@ -8228,6 +9239,11 @@ void Destroy(Atlas *atlas) {
 	}
 	ctx->taskScheduler->~TaskScheduler();
 	XA_FREE(ctx->taskScheduler);
+	for (uint32_t i = 0; i < ctx->meshes.size(); i++) {
+		internal::Mesh *mesh = ctx->meshes[i];
+		mesh->~Mesh();
+		XA_FREE(mesh);
+	}
 	for (uint32_t i = 0; i < ctx->uvMeshes.size(); i++) {
 		internal::UvMesh *mesh = ctx->uvMeshes[i];
 		for (uint32_t j = 0; j < mesh->charts.size(); j++) {
@@ -8249,12 +9265,14 @@ void Destroy(Atlas *atlas) {
 #endif
 }
 
-struct AddMeshTaskArgs {
+struct AddMeshTaskArgs
+{
 	Context *ctx;
 	internal::Mesh *mesh;
 };
 
-static void runAddMeshTask(void *userData) {
+static void runAddMeshTask(void *userData)
+{
 	XA_PROFILE_START(addMeshThread)
 	auto args = (AddMeshTaskArgs *)userData; // Responsible for freeing this.
 	internal::Mesh *mesh = args->mesh;
@@ -8268,87 +9286,45 @@ static void runAddMeshTask(void *userData) {
 	}
 	if (progress->cancel)
 		goto cleanup;
-	{
-		XA_PROFILE_START(addMeshCreateFaceGroups)
-		mesh->createFaceGroups();
-		XA_PROFILE_END(addMeshCreateFaceGroups)
-	}
-	if (progress->cancel)
-		goto cleanup;
-#if XA_DEBUG_EXPORT_OBJ_SOURCE_MESHES
-	char filename[256];
-	XA_SPRINTF(filename, sizeof(filename), "debug_mesh_%03u.obj", mesh->id());
-	FILE *file;
-	XA_FOPEN(file, filename, "w");
-	if (file) {
-		mesh->writeObjVertices(file);
-		// groups
-		uint32_t numGroups = 0;
-		for (uint32_t i = 0; i < mesh->faceCount(); i++) {
-			if (mesh->faceGroupAt(i) != Mesh::kInvalidFaceGroup)
-				numGroups = internal::max(numGroups, mesh->faceGroupAt(i) + 1);
-		}
-		for (uint32_t i = 0; i < numGroups; i++) {
-			fprintf(file, "o group_%04d\n", i);
-			fprintf(file, "s off\n");
-			for (uint32_t f = 0; f < mesh->faceCount(); f++) {
-				if (mesh->faceGroupAt(f) == i)
-					mesh->writeObjFace(file, f);
-			}
-		}
-		fprintf(file, "o group_ignored\n");
-		fprintf(file, "s off\n");
-		for (uint32_t f = 0; f < mesh->faceCount(); f++) {
-			if (mesh->faceGroupAt(f) == Mesh::kInvalidFaceGroup)
-				mesh->writeObjFace(file, f);
-		}
-		mesh->writeObjBoundaryEges(file);
-		fclose(file);
-	}
-#endif
-	{
-		XA_PROFILE_START(addMeshCreateChartGroupsReal)
-		args->ctx->paramAtlas.addMesh(args->ctx->taskScheduler, mesh); // addMesh is thread safe
-		XA_PROFILE_END(addMeshCreateChartGroupsReal)
-	}
-	if (progress->cancel)
-		goto cleanup;
 	progress->value++;
 	progress->update();
 cleanup:
-	mesh->~Mesh();
-	XA_FREE(mesh);
 	args->~AddMeshTaskArgs();
 	XA_FREE(args);
 	XA_PROFILE_END(addMeshThread)
 }
 
-static internal::Vector3 DecodePosition(const MeshDecl &meshDecl, uint32_t index) {
+static internal::Vector3 DecodePosition(const MeshDecl &meshDecl, uint32_t index)
+{
 	XA_DEBUG_ASSERT(meshDecl.vertexPositionData);
 	XA_DEBUG_ASSERT(meshDecl.vertexPositionStride > 0);
 	return *((const internal::Vector3 *)&((const uint8_t *)meshDecl.vertexPositionData)[meshDecl.vertexPositionStride * index]);
 }
 
-static internal::Vector3 DecodeNormal(const MeshDecl &meshDecl, uint32_t index) {
+static internal::Vector3 DecodeNormal(const MeshDecl &meshDecl, uint32_t index)
+{
 	XA_DEBUG_ASSERT(meshDecl.vertexNormalData);
 	XA_DEBUG_ASSERT(meshDecl.vertexNormalStride > 0);
 	return *((const internal::Vector3 *)&((const uint8_t *)meshDecl.vertexNormalData)[meshDecl.vertexNormalStride * index]);
 }
 
-static internal::Vector2 DecodeUv(const MeshDecl &meshDecl, uint32_t index) {
+static internal::Vector2 DecodeUv(const MeshDecl &meshDecl, uint32_t index)
+{
 	XA_DEBUG_ASSERT(meshDecl.vertexUvData);
 	XA_DEBUG_ASSERT(meshDecl.vertexUvStride > 0);
 	return *((const internal::Vector2 *)&((const uint8_t *)meshDecl.vertexUvData)[meshDecl.vertexUvStride * index]);
 }
 
-static uint32_t DecodeIndex(IndexFormat::Enum format, const void *indexData, int32_t offset, uint32_t i) {
+static uint32_t DecodeIndex(IndexFormat::Enum format, const void *indexData, int32_t offset, uint32_t i)
+{
 	XA_DEBUG_ASSERT(indexData);
 	if (format == IndexFormat::UInt16)
 		return uint16_t((int32_t)((const uint16_t *)indexData)[i] + offset);
 	return uint32_t((int32_t)((const uint32_t *)indexData)[i] + offset);
 }
 
-AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t meshCountHint) {
+AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t meshCountHint)
+{
 	XA_DEBUG_ASSERT(atlas);
 	if (!atlas) {
 		XA_PRINT_WARNING("AddMesh: atlas is null.\n");
@@ -8360,19 +9336,20 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 		return AddMeshError::Error;
 	}
 #if XA_PROFILE
-	if (ctx->meshCount == 0)
+	if (ctx->meshes.isEmpty())
 		internal::s_profile.addMeshReal = clock();
 #endif
 	// Don't know how many times AddMesh will be called, so progress needs to adjusted each time.
 	if (!ctx->addMeshProgress) {
 		ctx->addMeshProgress = XA_NEW_ARGS(internal::MemTag::Default, internal::Progress, ProgressCategory::AddMesh, ctx->progressFunc, ctx->progressUserData, 1);
-	} else {
-		ctx->addMeshProgress->setMaxValue(internal::max(ctx->meshCount + 1, meshCountHint));
+	}
+	else {
+		ctx->addMeshProgress->setMaxValue(internal::max(ctx->meshes.size() + 1, meshCountHint));
 	}
 	XA_PROFILE_START(addMeshCopyData)
 	const bool hasIndices = meshDecl.indexCount > 0;
 	const uint32_t indexCount = hasIndices ? meshDecl.indexCount : meshDecl.vertexCount;
-	XA_PRINT("Adding mesh %d: %u vertices, %u triangles\n", ctx->meshCount, meshDecl.vertexCount, indexCount / 3);
+	XA_PRINT("Adding mesh %d: %u vertices, %u triangles\n", ctx->meshes.size(), meshDecl.vertexCount, indexCount / 3);
 	// Expecting triangle faces.
 	if ((indexCount % 3) != 0)
 		return AddMeshError::InvalidIndexCount;
@@ -8384,10 +9361,10 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 				return AddMeshError::IndexOutOfRange;
 		}
 	}
-	uint32_t meshFlags = internal::MeshFlags::HasFaceGroups | internal::MeshFlags::HasIgnoredFaces;
+	uint32_t meshFlags = internal::MeshFlags::HasIgnoredFaces;
 	if (meshDecl.vertexNormalData)
 		meshFlags |= internal::MeshFlags::HasNormals;
-	internal::Mesh *mesh = XA_NEW_ARGS(internal::MemTag::Mesh, internal::Mesh, meshDecl.epsilon, meshDecl.vertexCount, indexCount / 3, meshFlags, ctx->meshCount);
+	internal::Mesh *mesh = XA_NEW_ARGS(internal::MemTag::Mesh, internal::Mesh, meshDecl.epsilon, meshDecl.vertexCount, indexCount / 3, meshFlags, ctx->meshes.size());
 	for (uint32_t i = 0; i < meshDecl.vertexCount; i++) {
 		internal::Vector3 normal(0.0f);
 		internal::Vector2 texcoord(0.0f);
@@ -8397,6 +9374,8 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 			texcoord = DecodeUv(meshDecl, i);
 		mesh->addVertex(DecodePosition(meshDecl, i), normal, texcoord);
 	}
+	const uint32_t kMaxWarnings = 50;
+	uint32_t warningCount = 0;
 	for (uint32_t i = 0; i < indexCount / 3; i++) {
 		uint32_t tri[3];
 		for (int j = 0; j < 3; j++)
@@ -8408,14 +9387,16 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 			const uint32_t index2 = tri[(j + 1) % 3];
 			if (index1 == index2) {
 				ignore = true;
-				XA_PRINT("   Degenerate edge: index %d, index %d\n", index1, index2);
+				if (++warningCount <= kMaxWarnings)
+					XA_PRINT("   Degenerate edge: index %d, index %d\n", index1, index2);
 				break;
 			}
 			const internal::Vector3 &pos1 = mesh->position(index1);
 			const internal::Vector3 &pos2 = mesh->position(index2);
 			if (internal::length(pos2 - pos1) <= 0.0f) {
 				ignore = true;
-				XA_PRINT("   Zero length edge: index %d position (%g %g %g), index %d position (%g %g %g)\n", index1, pos1.x, pos1.y, pos1.z, index2, pos2.x, pos2.y, pos2.z);
+				if (++warningCount <= kMaxWarnings)
+					XA_PRINT("   Zero length edge: index %d position (%g %g %g), index %d position (%g %g %g)\n", index1, pos1.x, pos1.y, pos1.z, index2, pos2.x, pos2.y, pos2.z);
 				break;
 			}
 		}
@@ -8424,14 +9405,16 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 			for (int j = 0; j < 3; j++) {
 				const internal::Vector3 &pos = mesh->position(tri[j]);
 				if (internal::isNan(pos.x) || internal::isNan(pos.y) || internal::isNan(pos.z)) {
-					XA_PRINT("   NAN position in face: %d\n", i);
+					if (++warningCount <= kMaxWarnings)
+						XA_PRINT("   NAN position in face: %d\n", i);
 					ignore = true;
 					break;
 				}
 				if (meshDecl.vertexNormalData) {
 					const internal::Vector3 &normal = mesh->normal(tri[j]);
 					if (internal::isNan(normal.x) || internal::isNan(normal.y) || internal::isNan(normal.z)) {
-						XA_PRINT("   NAN normal in face: %d\n", i);
+						if (++warningCount <= kMaxWarnings)
+							XA_PRINT("   NAN normal in face: %d\n", i);
 						ignore = true;
 						break;
 					}
@@ -8439,7 +9422,8 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 				if (meshDecl.vertexUvData) {
 					const internal::Vector2 &uv = mesh->texcoord(tri[j]);
 					if (internal::isNan(uv.x) || internal::isNan(uv.y)) {
-						XA_PRINT("   NAN texture coordinate in face: %d\n", i);
+						if (++warningCount <= kMaxWarnings)
+							XA_PRINT("   NAN texture coordinate in face: %d\n", i);
 						ignore = true;
 						break;
 					}
@@ -8455,20 +9439,26 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 			area = internal::length(internal::cross(b - a, c - a)) * 0.5f;
 			if (area <= internal::kAreaEpsilon) {
 				ignore = true;
-				XA_PRINT("   Zero area face: %d, indices (%d %d %d), area is %f\n", i, tri[0], tri[1], tri[2], area);
+				if (++warningCount <= kMaxWarnings)
+					XA_PRINT("   Zero area face: %d, indices (%d %d %d), area is %f\n", i, tri[0], tri[1], tri[2], area);
 			}
 		}
 		if (!ignore) {
 			if (internal::equal(a, b, meshDecl.epsilon) || internal::equal(a, c, meshDecl.epsilon) || internal::equal(b, c, meshDecl.epsilon)) {
 				ignore = true;
-				XA_PRINT("   Degenerate face: %d, area is %f\n", i, area);
+				if (++warningCount <= kMaxWarnings)
+					XA_PRINT("   Degenerate face: %d, area is %f\n", i, area);
 			}
 		}
 		if (meshDecl.faceIgnoreData && meshDecl.faceIgnoreData[i])
 			ignore = true;
 		mesh->addFace(tri[0], tri[1], tri[2], ignore);
 	}
+	if (warningCount > kMaxWarnings)
+		XA_PRINT("   %u additional warnings truncated\n", warningCount - kMaxWarnings);
 	XA_PROFILE_END(addMeshCopyData)
+	ctx->meshes.push_back(mesh);
+	ctx->paramAtlas.addMesh(mesh);
 	if (ctx->addMeshTaskGroup.value == UINT32_MAX)
 		ctx->addMeshTaskGroup = ctx->taskScheduler->createTaskGroup();
 	AddMeshTaskArgs *taskArgs = XA_NEW(internal::MemTag::Default, AddMeshTaskArgs); // The task frees this.
@@ -8478,11 +9468,11 @@ AddMeshError::Enum AddMesh(Atlas *atlas, const MeshDecl &meshDecl, uint32_t mesh
 	task.userData = taskArgs;
 	task.func = runAddMeshTask;
 	ctx->taskScheduler->run(ctx->addMeshTaskGroup, task);
-	ctx->meshCount++;
 	return AddMeshError::Success;
 }
 
-void AddMeshJoin(Atlas *atlas) {
+void AddMeshJoin(Atlas *atlas)
+{
 	XA_DEBUG_ASSERT(atlas);
 	if (!atlas) {
 		XA_PRINT_WARNING("AddMeshJoin: atlas is null.\n");
@@ -8495,41 +9485,43 @@ void AddMeshJoin(Atlas *atlas) {
 	ctx->addMeshProgress->~Progress();
 	XA_FREE(ctx->addMeshProgress);
 	ctx->addMeshProgress = nullptr;
-	ctx->paramAtlas.sortChartGroups();
 #if XA_PROFILE
-	XA_PRINT("Added %u meshes\n", ctx->meshCount);
+	XA_PRINT("Added %u meshes\n", ctx->meshes.size());
 	internal::s_profile.addMeshReal = clock() - internal::s_profile.addMeshReal;
 #endif
 	XA_PROFILE_PRINT_AND_RESET("   Total (real): ", addMeshReal)
 	XA_PROFILE_PRINT_AND_RESET("      Copy data: ", addMeshCopyData)
 	XA_PROFILE_PRINT_AND_RESET("   Total (thread): ", addMeshThread)
 	XA_PROFILE_PRINT_AND_RESET("      Create colocals: ", addMeshCreateColocals)
-	XA_PROFILE_PRINT_AND_RESET("      Create face groups: ", addMeshCreateFaceGroups)
-	XA_PROFILE_PRINT_AND_RESET("      Create chart groups (real): ", addMeshCreateChartGroupsReal)
-	XA_PROFILE_PRINT_AND_RESET("      Create chart groups (thread): ", addMeshCreateChartGroupsThread)
+#if XA_PROFILE_ALLOC
+	XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
+#endif
 	XA_PRINT_MEM_USAGE
+#if XA_DEBUG_EXPORT_OBJ_FACE_GROUPS
+	internal::param::s_faceGroupsCurrentVertex = 0;
+#endif
 }
 
-struct EdgeKey {
+struct EdgeKey
+{
 	EdgeKey() {}
-	EdgeKey(const EdgeKey &k) :
-			v0(k.v0), v1(k.v1) {}
-	EdgeKey(uint32_t v0, uint32_t v1) :
-			v0(v0), v1(v1) {}
+	EdgeKey(const EdgeKey &k) : v0(k.v0), v1(k.v1) {}
+	EdgeKey(uint32_t v0, uint32_t v1) : v0(v0), v1(v1) {}
 	bool operator==(const EdgeKey &k) const { return v0 == k.v0 && v1 == k.v1; }
 
 	uint32_t v0;
 	uint32_t v1;
 };
 
-AddMeshError::Enum AddUvMesh(Atlas *atlas, const UvMeshDecl &decl) {
+AddMeshError::Enum AddUvMesh(Atlas *atlas, const UvMeshDecl &decl)
+{
 	XA_DEBUG_ASSERT(atlas);
 	if (!atlas) {
 		XA_PRINT_WARNING("AddUvMesh: atlas is null.\n");
 		return AddMeshError::Error;
 	}
 	Context *ctx = (Context *)atlas;
-	if (ctx->meshCount > 0) {
+	if (!ctx->meshes.isEmpty()) {
 		XA_PRINT_WARNING("AddUvMesh: Meshes and UV meshes cannot be added to the same atlas.\n");
 		return AddMeshError::Error;
 	}
@@ -8632,7 +9624,8 @@ AddMeshError::Enum AddUvMesh(Atlas *atlas, const UvMeshDecl &decl) {
 	return AddMeshError::Success;
 }
 
-void ComputeCharts(Atlas *atlas, ChartOptions chartOptions) {
+void ComputeCharts(Atlas *atlas, ChartOptions options)
+{
 	if (!atlas) {
 		XA_PRINT_WARNING("ComputeCharts: atlas is null.\n");
 		return;
@@ -8643,69 +9636,66 @@ void ComputeCharts(Atlas *atlas, ChartOptions chartOptions) {
 		return;
 	}
 	AddMeshJoin(atlas);
-	if (ctx->meshCount == 0) {
+	if (ctx->meshes.isEmpty()) {
 		XA_PRINT_WARNING("ComputeCharts: No meshes. Call AddMesh first.\n");
 		return;
 	}
 	XA_PRINT("Computing charts\n");
-	uint32_t chartCount = 0, chartsWithHolesCount = 0, holesCount = 0, chartsWithTJunctionsCount = 0, tJunctionsCount = 0;
 	XA_PROFILE_START(computeChartsReal)
-	if (!ctx->paramAtlas.computeCharts(ctx->taskScheduler, chartOptions, ctx->progressFunc, ctx->progressUserData)) {
+	if (!ctx->paramAtlas.computeCharts(ctx->taskScheduler, options, ctx->progressFunc, ctx->progressUserData)) {
 		XA_PRINT("   Cancelled by user\n");
 		return;
 	}
 	XA_PROFILE_END(computeChartsReal)
-	// Count charts and print warnings.
-	for (uint32_t i = 0; i < ctx->meshCount; i++) {
+	// Count charts.
+	uint32_t chartCount = 0;
+	const uint32_t meshCount = ctx->meshes.size();
+	for (uint32_t i = 0; i < meshCount; i++) {
 		for (uint32_t j = 0; j < ctx->paramAtlas.chartGroupCount(i); j++) {
 			const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, j);
-			if (chartGroup->isVertexMap())
-				continue;
-			for (uint32_t k = 0; k < chartGroup->chartCount(); k++) {
-				const internal::param::Chart *chart = chartGroup->chartAt(k);
-#if XA_PRINT_CHART_WARNINGS
-				if (chart->warningFlags() & internal::param::ChartWarningFlags::CloseHolesFailed)
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): failed to close holes\n", chartCount, i, j, k);
-				if (chart->warningFlags() & internal::param::ChartWarningFlags::FixTJunctionsDuplicatedEdge)
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): fixing t-junctions created non-manifold geometry\n", chartCount, i, j, k);
-				if (chart->warningFlags() & internal::param::ChartWarningFlags::FixTJunctionsFailed)
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): fixing t-junctions failed\n", chartCount, i, j, k);
-				if (chart->warningFlags() & internal::param::ChartWarningFlags::TriangulateDuplicatedEdge)
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): triangulation created non-manifold geometry\n", chartCount, i, j, k);
-#endif
-				holesCount += chart->closedHolesCount();
-				if (chart->closedHolesCount() > 0)
-					chartsWithHolesCount++;
-				tJunctionsCount += chart->fixedTJunctionsCount();
-				if (chart->fixedTJunctionsCount() > 0)
-					chartsWithTJunctionsCount++;
-				chartCount++;
-			}
+			chartCount += chartGroup->segmentChartCount();
 		}
 	}
-	if (holesCount > 0)
-		XA_PRINT("   Closed %u holes in %u charts\n", holesCount, chartsWithHolesCount);
-	if (tJunctionsCount > 0)
-		XA_PRINT("   Fixed %u t-junctions in %u charts\n", tJunctionsCount, chartsWithTJunctionsCount);
 	XA_PRINT("   %u charts\n", chartCount);
+#if XA_PROFILE
+	XA_PRINT("   Chart groups\n");
+	uint32_t chartGroupCount = 0;
+	for (uint32_t i = 0; i < meshCount; i++) {
+		XA_PRINT("      Mesh %u: %u chart groups\n", i, ctx->paramAtlas.chartGroupCount(i));
+		chartGroupCount += ctx->paramAtlas.chartGroupCount(i);
+	}
+	XA_PRINT("      %u total\n", chartGroupCount);
+#endif
 	XA_PROFILE_PRINT_AND_RESET("   Total (real): ", computeChartsReal)
 	XA_PROFILE_PRINT_AND_RESET("   Total (thread): ", computeChartsThread)
-	XA_PROFILE_PRINT_AND_RESET("      Build atlas: ", buildAtlas)
-	XA_PROFILE_PRINT_AND_RESET("         Init: ", buildAtlasInit)
-	XA_PROFILE_PRINT_AND_RESET("         Place seeds: ", buildAtlasPlaceSeeds)
-	XA_PROFILE_PRINT_AND_RESET("         Relocate seeds: ", buildAtlasRelocateSeeds)
-	XA_PROFILE_PRINT_AND_RESET("         Reset charts: ", buildAtlasResetCharts)
-	XA_PROFILE_PRINT_AND_RESET("         Grow charts: ", buildAtlasGrowCharts)
-	XA_PROFILE_PRINT_AND_RESET("         Merge charts: ", buildAtlasMergeCharts)
-	XA_PROFILE_PRINT_AND_RESET("         Fill holes: ", buildAtlasFillHoles)
-	XA_PROFILE_PRINT_AND_RESET("      Create chart meshes (real): ", createChartMeshesReal)
-	XA_PROFILE_PRINT_AND_RESET("      Create chart meshes (thread): ", createChartMeshesThread)
-	XA_PROFILE_PRINT_AND_RESET("         Fix t-junctions: ", fixChartMeshTJunctions)
-	XA_PROFILE_PRINT_AND_RESET("         Close holes: ", closeChartMeshHoles)
+	XA_PROFILE_PRINT_AND_RESET("      Create face groups: ", createFaceGroups)
+	XA_PROFILE_PRINT_AND_RESET("      Extract invalid mesh geometry: ", extractInvalidMeshGeometry)
+	XA_PROFILE_PRINT_AND_RESET("      Chart group compute charts (real): ", chartGroupComputeChartsReal)
+	XA_PROFILE_PRINT_AND_RESET("      Chart group compute charts (thread): ", chartGroupComputeChartsThread)
+	XA_PROFILE_PRINT_AND_RESET("         Create chart group mesh: ", createChartGroupMesh)
+	XA_PROFILE_PRINT_AND_RESET("            Create colocals: ", createChartGroupMeshColocals)
+	XA_PROFILE_PRINT_AND_RESET("            Create boundaries: ", createChartGroupMeshBoundaries)
+	XA_PROFILE_PRINT_AND_RESET("         Build atlas: ", buildAtlas)
+	XA_PROFILE_PRINT_AND_RESET("            Init: ", buildAtlasInit)
+	XA_PROFILE_PRINT_AND_RESET("            Planar charts: ", planarCharts)
+	XA_PROFILE_PRINT_AND_RESET("            Clustered charts: ", clusteredCharts)
+	XA_PROFILE_PRINT_AND_RESET("               Place seeds: ", clusteredChartsPlaceSeeds)
+	XA_PROFILE_PRINT_AND_RESET("                  Boundary intersection: ", clusteredChartsPlaceSeedsBoundaryIntersection)
+	XA_PROFILE_PRINT_AND_RESET("               Relocate seeds: ", clusteredChartsRelocateSeeds)
+	XA_PROFILE_PRINT_AND_RESET("               Reset: ", clusteredChartsReset)
+	XA_PROFILE_PRINT_AND_RESET("               Grow: ", clusteredChartsGrow)
+	XA_PROFILE_PRINT_AND_RESET("                  Boundary intersection: ", clusteredChartsGrowBoundaryIntersection)
+	XA_PROFILE_PRINT_AND_RESET("               Merge: ", clusteredChartsMerge)
+	XA_PROFILE_PRINT_AND_RESET("               Fill holes: ", clusteredChartsFillHoles)
+	XA_PROFILE_PRINT_AND_RESET("         Copy chart faces: ", copyChartFaces)
+#if XA_PROFILE_ALLOC
+	XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
+#endif
 	XA_PRINT_MEM_USAGE
 }
 
-void ParameterizeCharts(Atlas *atlas, ParameterizeFunc func) {
+void ParameterizeCharts(Atlas *atlas, ParameterizeOptions options)
+{
 	if (!atlas) {
 		XA_PRINT_WARNING("ParameterizeCharts: atlas is null.\n");
 		return;
@@ -8734,19 +9724,34 @@ void ParameterizeCharts(Atlas *atlas, ParameterizeFunc func) {
 	DestroyOutputMeshes(ctx);
 	XA_PRINT("Parameterizing charts\n");
 	XA_PROFILE_START(parameterizeChartsReal)
-	if (!ctx->paramAtlas.parameterizeCharts(ctx->taskScheduler, func, ctx->progressFunc, ctx->progressUserData)) {
+	if (!ctx->paramAtlas.parameterizeCharts(ctx->taskScheduler, options, ctx->progressFunc, ctx->progressUserData)) {
 		XA_PRINT("   Cancelled by user\n");
 		return;
 	}
 	XA_PROFILE_END(parameterizeChartsReal)
-	uint32_t chartCount = 0, orthoChartsCount = 0, planarChartsCount = 0, lscmChartsCount = 0, piecewiseChartsCount = 0, chartsAddedCount = 0, chartsDeletedCount = 0;
-	for (uint32_t i = 0; i < ctx->meshCount; i++) {
+	const uint32_t meshCount = ctx->meshes.size();
+	uint32_t chartCount = 0, chartsWithHolesCount = 0, holesCount = 0, chartsWithTJunctionsCount = 0, tJunctionsCount = 0, orthoChartsCount = 0, planarChartsCount = 0, lscmChartsCount = 0, piecewiseChartsCount = 0, chartsAddedCount = 0, chartsDeletedCount = 0;
+	for (uint32_t i = 0; i < meshCount; i++) {
 		for (uint32_t j = 0; j < ctx->paramAtlas.chartGroupCount(i); j++) {
 			const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, j);
-			if (chartGroup->isVertexMap())
-				continue;
 			for (uint32_t k = 0; k < chartGroup->chartCount(); k++) {
 				const internal::param::Chart *chart = chartGroup->chartAt(k);
+#if XA_PRINT_CHART_WARNINGS
+				if (chart->warningFlags() & internal::param::ChartWarningFlags::CloseHolesFailed)
+					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): failed to close holes\n", chartCount, i, j, k);
+				if (chart->warningFlags() & internal::param::ChartWarningFlags::FixTJunctionsDuplicatedEdge)
+					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): fixing t-junctions created non-manifold geometry\n", chartCount, i, j, k);
+				if (chart->warningFlags() & internal::param::ChartWarningFlags::FixTJunctionsFailed)
+					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): fixing t-junctions failed\n", chartCount, i, j, k);
+				if (chart->warningFlags() & internal::param::ChartWarningFlags::TriangulateDuplicatedEdge)
+					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u): triangulation created non-manifold geometry\n", chartCount, i, j, k);
+#endif
+				holesCount += chart->closedHolesCount();
+				if (chart->closedHolesCount() > 0)
+					chartsWithHolesCount++;
+				tJunctionsCount += chart->fixedTJunctionsCount();
+				if (chart->fixedTJunctionsCount() > 0)
+					chartsWithTJunctionsCount++;
 				if (chart->type() == ChartType::Planar)
 					planarChartsCount++;
 				else if (chart->type() == ChartType::Ortho)
@@ -8761,19 +9766,21 @@ void ParameterizeCharts(Atlas *atlas, ParameterizeFunc func) {
 			chartsDeletedCount += chartGroup->paramDeletedChartsCount();
 		}
 	}
+	if (holesCount > 0)
+		XA_PRINT("   %u holes closed in %u charts\n", holesCount, chartsWithHolesCount);
+	if (tJunctionsCount > 0)
+		XA_PRINT("   %u t-junctions fixed in %u charts\n", tJunctionsCount, chartsWithTJunctionsCount);
 	XA_PRINT("   %u planar charts, %u ortho charts, %u LSCM charts, %u piecewise charts\n", planarChartsCount, orthoChartsCount, lscmChartsCount, piecewiseChartsCount);
 	if (chartsDeletedCount > 0) {
 		XA_PRINT("   %u charts with invalid parameterizations replaced with %u new charts\n", chartsDeletedCount, chartsAddedCount);
 		XA_PRINT("   %u charts\n", chartCount);
 	}
 	uint32_t chartIndex = 0, invalidParamCount = 0;
-	for (uint32_t i = 0; i < ctx->meshCount; i++) {
+	for (uint32_t i = 0; i < meshCount; i++) {
 		for (uint32_t j = 0; j < ctx->paramAtlas.chartGroupCount(i); j++) {
 			const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, j);
-			if (chartGroup->isVertexMap())
-				continue;
 			for (uint32_t k = 0; k < chartGroup->chartCount(); k++) {
-				const internal::param::Chart *chart = chartGroup->chartAt(k);
+				internal::param::Chart *chart = chartGroup->chartAt(k);
 				const internal::param::Quality &quality = chart->quality();
 #if XA_DEBUG_EXPORT_OBJ_CHARTS_AFTER_PARAMETERIZATION
 				{
@@ -8782,7 +9789,6 @@ void ParameterizeCharts(Atlas *atlas, ParameterizeFunc func) {
 					chart->unifiedMesh()->writeObjFile(filename);
 				}
 #endif
-				bool invalid = false;
 				const char *type = "LSCM";
 				if (chart->type() == ChartType::Planar)
 					type = "planar";
@@ -8790,18 +9796,15 @@ void ParameterizeCharts(Atlas *atlas, ParameterizeFunc func) {
 					type = "ortho";
 				else if (chart->type() == ChartType::Piecewise)
 					type = "piecewise";
-				if (quality.boundaryIntersection) {
-					invalid = true;
-					XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u) (%s): invalid parameterization, self-intersecting boundary.\n", chartIndex, i, j, k, type);
-				}
-				if (quality.flippedTriangleCount > 0) {
-					invalid = true;
-					XA_PRINT_WARNING("   Chart %u  (mesh %u, group %u, id %u) (%s): invalid parameterization, %u / %u flipped triangles.\n", chartIndex, i, j, k, type, quality.flippedTriangleCount, quality.totalTriangleCount);
-				}
-				if (invalid)
+				if (chart->isInvalid()) {
+					if (quality.boundaryIntersection) {
+						XA_PRINT_WARNING("   Chart %u (mesh %u, group %u, id %u) (%s): invalid parameterization, self-intersecting boundary.\n", chartIndex, i, j, k, type);
+					}
+					if (quality.flippedTriangleCount > 0) {
+						XA_PRINT_WARNING("   Chart %u  (mesh %u, group %u, id %u) (%s): invalid parameterization, %u / %u flipped triangles.\n", chartIndex, i, j, k, type, quality.flippedTriangleCount, quality.totalTriangleCount);
+					}
 					invalidParamCount++;
 #if XA_DEBUG_EXPORT_OBJ_INVALID_PARAMETERIZATION
-				if (invalid) {
 					char filename[256];
 					XA_SPRINTF(filename, sizeof(filename), "debug_chart_%03u_invalid_parameterization.obj", chartIndex);
 					const internal::Mesh *mesh = chart->unifiedMesh();
@@ -8822,8 +9825,8 @@ void ParameterizeCharts(Atlas *atlas, ParameterizeFunc func) {
 						mesh->writeObjLinkedBoundaries(file);
 						fclose(file);
 					}
-				}
 #endif
+				}
 				chartIndex++;
 			}
 		}
@@ -8832,20 +9835,30 @@ void ParameterizeCharts(Atlas *atlas, ParameterizeFunc func) {
 		XA_PRINT_WARNING("   %u charts with invalid parameterizations\n", invalidParamCount);
 	XA_PROFILE_PRINT_AND_RESET("   Total (real): ", parameterizeChartsReal)
 	XA_PROFILE_PRINT_AND_RESET("   Total (thread): ", parameterizeChartsThread)
+	XA_PROFILE_PRINT_AND_RESET("      Create chart mesh: ", createChartMesh)
+	XA_PROFILE_PRINT_AND_RESET("         Fix t-junctions: ", fixChartMeshTJunctions)
+	XA_PROFILE_PRINT_AND_RESET("         Close holes: ", closeChartMeshHoles)
 	XA_PROFILE_PRINT_AND_RESET("      Orthogonal: ", parameterizeChartsOrthogonal)
 	XA_PROFILE_PRINT_AND_RESET("      LSCM: ", parameterizeChartsLSCM)
+	XA_PROFILE_PRINT_AND_RESET("      Recompute: ", parameterizeChartsRecompute)
+	XA_PROFILE_PRINT_AND_RESET("         Piecewise: ", parameterizeChartsPiecewise)
+	XA_PROFILE_PRINT_AND_RESET("            Boundary intersection: ", parameterizeChartsPiecewiseBoundaryIntersection)
 	XA_PROFILE_PRINT_AND_RESET("      Evaluate quality: ", parameterizeChartsEvaluateQuality)
+#if XA_PROFILE_ALLOC
+	XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
+#endif
 	XA_PRINT_MEM_USAGE
 }
 
-void PackCharts(Atlas *atlas, PackOptions packOptions) {
+void PackCharts(Atlas *atlas, PackOptions packOptions)
+{
 	// Validate arguments and context state.
 	if (!atlas) {
 		XA_PRINT_WARNING("PackCharts: atlas is null.\n");
 		return;
 	}
 	Context *ctx = (Context *)atlas;
-	if (ctx->meshCount == 0 && ctx->uvMeshInstances.isEmpty()) {
+	if (ctx->meshes.isEmpty() && ctx->uvMeshInstances.isEmpty()) {
 		XA_PRINT_WARNING("PackCharts: No meshes. Call AddMesh or AddUvMesh first.\n");
 		return;
 	}
@@ -8880,7 +9893,8 @@ void PackCharts(Atlas *atlas, PackOptions packOptions) {
 	if (!ctx->uvMeshInstances.isEmpty()) {
 		for (uint32_t i = 0; i < ctx->uvMeshInstances.size(); i++)
 			packAtlas.addUvMeshCharts(ctx->uvMeshInstances[i]);
-	} else
+	}
+	else
 		packAtlas.addCharts(ctx->taskScheduler, &ctx->paramAtlas);
 	XA_PROFILE_END(packChartsAddCharts)
 	XA_PROFILE_START(packCharts)
@@ -8901,7 +9915,7 @@ void PackCharts(Atlas *atlas, PackOptions packOptions) {
 	if (packOptions.createImage) {
 		atlas->image = XA_ALLOC_ARRAY(internal::MemTag::Default, uint32_t, atlas->atlasCount * atlas->width * atlas->height);
 		for (uint32_t i = 0; i < atlas->atlasCount; i++)
-			packAtlas.getImages()[i]->copyTo(&atlas->image[atlas->width * atlas->height * i], atlas->width, atlas->height, packOptions.blockAlign ? 0 : packOptions.padding);
+			packAtlas.getImages()[i]->copyTo(&atlas->image[atlas->width * atlas->height * i], atlas->width, atlas->height, packOptions.padding);
 	}
 	XA_PROFILE_PRINT_AND_RESET("   Total: ", packCharts)
 	XA_PROFILE_PRINT_AND_RESET("      Add charts (real): ", packChartsAddCharts)
@@ -8911,6 +9925,9 @@ void PackCharts(Atlas *atlas, PackOptions packOptions) {
 	XA_PROFILE_PRINT_AND_RESET("      Dilate (padding): ", packChartsDilate)
 	XA_PROFILE_PRINT_AND_RESET("      Find location: ", packChartsFindLocation)
 	XA_PROFILE_PRINT_AND_RESET("      Blit: ", packChartsBlit)
+#if XA_PROFILE_ALLOC
+	XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
+#endif
 	XA_PRINT_MEM_USAGE
 	XA_PRINT("Building output meshes\n");
 	XA_PROFILE_START(buildOutputMeshes)
@@ -8920,91 +9937,92 @@ void PackCharts(Atlas *atlas, PackOptions packOptions) {
 			return;
 	}
 	if (ctx->uvMeshInstances.isEmpty())
-		atlas->meshCount = ctx->meshCount;
+		atlas->meshCount = ctx->meshes.size();
 	else
 		atlas->meshCount = ctx->uvMeshInstances.size();
 	atlas->meshes = XA_ALLOC_ARRAY(internal::MemTag::Default, Mesh, atlas->meshCount);
 	memset(atlas->meshes, 0, sizeof(Mesh) * atlas->meshCount);
 	if (ctx->uvMeshInstances.isEmpty()) {
 		uint32_t chartIndex = 0;
-		for (uint32_t i = 0; i < ctx->meshCount; i++) {
+		for (uint32_t i = 0; i < atlas->meshCount; i++) {
 			Mesh &outputMesh = atlas->meshes[i];
-			// Count and alloc arrays. Ignore vertex mapped chart groups in Mesh::chartCount, since they're ignored faces.
+			// Count and alloc arrays.
+			const internal::param::InvalidMeshGeometry &invalid = ctx->paramAtlas.invalidMeshGeometry(i);
+			outputMesh.vertexCount += invalid.vertices().length;
+			outputMesh.indexCount += invalid.faces().length * 3;
 			for (uint32_t cg = 0; cg < ctx->paramAtlas.chartGroupCount(i); cg++) {
 				const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, cg);
-				if (chartGroup->isVertexMap()) {
-					outputMesh.vertexCount += chartGroup->mesh()->vertexCount();
-					outputMesh.indexCount += chartGroup->mesh()->faceCount() * 3;
-				} else {
-					for (uint32_t c = 0; c < chartGroup->chartCount(); c++) {
-						const internal::param::Chart *chart = chartGroup->chartAt(c);
-						outputMesh.vertexCount += chart->mesh()->vertexCount();
-						outputMesh.indexCount += chart->mesh()->faceCount() * 3;
-						outputMesh.chartCount++;
-					}
+				for (uint32_t c = 0; c < chartGroup->chartCount(); c++) {
+					const internal::param::Chart *chart = chartGroup->chartAt(c);
+					outputMesh.vertexCount += chart->mesh()->vertexCount();
+					outputMesh.indexCount += chart->mesh()->faceCount() * 3;
+					outputMesh.chartCount++;
 				}
 			}
 			outputMesh.vertexArray = XA_ALLOC_ARRAY(internal::MemTag::Default, Vertex, outputMesh.vertexCount);
 			outputMesh.indexArray = XA_ALLOC_ARRAY(internal::MemTag::Default, uint32_t, outputMesh.indexCount);
 			outputMesh.chartArray = XA_ALLOC_ARRAY(internal::MemTag::Default, Chart, outputMesh.chartCount);
-			XA_PRINT("   mesh %u: %u vertices, %u triangles, %u charts\n", i, outputMesh.vertexCount, outputMesh.indexCount / 3, outputMesh.chartCount);
+			XA_PRINT("   Mesh %u: %u vertices, %u triangles, %u charts\n", i, outputMesh.vertexCount, outputMesh.indexCount / 3, outputMesh.chartCount);
 			// Copy mesh data.
-			uint32_t firstVertex = 0, meshChartIndex = 0;
+			uint32_t firstVertex = 0;
+			{
+				const internal::param::InvalidMeshGeometry &mesh = ctx->paramAtlas.invalidMeshGeometry(i);
+				internal::ConstArrayView<uint32_t> faces = mesh.faces();
+				internal::ConstArrayView<uint32_t> indices = mesh.indices();
+				internal::ConstArrayView<uint32_t> vertices = mesh.vertices();
+				// Vertices.
+				for (uint32_t v = 0; v < vertices.length; v++) {
+					Vertex &vertex = outputMesh.vertexArray[v];
+					vertex.atlasIndex = -1;
+					vertex.chartIndex = -1;
+					vertex.uv[0] = vertex.uv[1] = 0.0f;
+					vertex.xref = vertices[v];
+				}
+				// Indices.
+				for (uint32_t f = 0; f < faces.length; f++) {
+					const uint32_t indexOffset = faces[f] * 3;
+					for (uint32_t j = 0; j < 3; j++)
+						outputMesh.indexArray[indexOffset + j] = indices[f * 3 + j];
+				}
+				firstVertex = vertices.length;
+			}
+			uint32_t meshChartIndex = 0;
 			for (uint32_t cg = 0; cg < ctx->paramAtlas.chartGroupCount(i); cg++) {
 				const internal::param::ChartGroup *chartGroup = ctx->paramAtlas.chartGroupAt(i, cg);
-				if (chartGroup->isVertexMap()) {
-					const internal::Mesh *mesh = chartGroup->mesh();
+				for (uint32_t c = 0; c < chartGroup->chartCount(); c++) {
+					const internal::param::Chart *chart = chartGroup->chartAt(c);
+					const internal::Mesh *mesh = chart->mesh();
 					// Vertices.
 					for (uint32_t v = 0; v < mesh->vertexCount(); v++) {
 						Vertex &vertex = outputMesh.vertexArray[firstVertex + v];
-						vertex.atlasIndex = -1;
-						vertex.chartIndex = -1;
-						vertex.uv[0] = vertex.uv[1] = 0.0f;
-						vertex.xref = chartGroup->mapVertexToSourceVertex(v);
+						vertex.atlasIndex = packAtlas.getChart(chartIndex)->atlasIndex;
+						XA_DEBUG_ASSERT(vertex.atlasIndex >= 0);
+						vertex.chartIndex = (int32_t)chartIndex;
+						const internal::Vector2 &uv = mesh->texcoord(v);
+						vertex.uv[0] = internal::max(0.0f, uv.x);
+						vertex.uv[1] = internal::max(0.0f, uv.y);
+						vertex.xref = chart->mapChartVertexToSourceVertex(v);
 					}
 					// Indices.
 					for (uint32_t f = 0; f < mesh->faceCount(); f++) {
-						const uint32_t indexOffset = chartGroup->mapFaceToSourceFace(f) * 3;
+						const uint32_t indexOffset = chart->mapFaceToSourceFace(f) * 3;
 						for (uint32_t j = 0; j < 3; j++)
 							outputMesh.indexArray[indexOffset + j] = firstVertex + mesh->vertexAt(f * 3 + j);
 					}
+					// Charts.
+					Chart *outputChart = &outputMesh.chartArray[meshChartIndex];
+					const int32_t atlasIndex = packAtlas.getChart(chartIndex)->atlasIndex;
+					XA_DEBUG_ASSERT(atlasIndex >= 0);
+					outputChart->atlasIndex = (uint32_t)atlasIndex;
+					outputChart->type = chart->isInvalid() ? ChartType::Invalid : chart->type();
+					outputChart->faceCount = mesh->faceCount();
+					outputChart->faceArray = XA_ALLOC_ARRAY(internal::MemTag::Default, uint32_t, outputChart->faceCount);
+					for (uint32_t f = 0; f < outputChart->faceCount; f++)
+						outputChart->faceArray[f] = chart->mapFaceToSourceFace(f);
+					outputChart->material = 0;
+					meshChartIndex++;
+					chartIndex++;
 					firstVertex += mesh->vertexCount();
-				} else {
-					for (uint32_t c = 0; c < chartGroup->chartCount(); c++) {
-						const internal::param::Chart *chart = chartGroup->chartAt(c);
-						const internal::Mesh *mesh = chart->mesh();
-						// Vertices.
-						for (uint32_t v = 0; v < mesh->vertexCount(); v++) {
-							Vertex &vertex = outputMesh.vertexArray[firstVertex + v];
-							vertex.atlasIndex = packAtlas.getChart(chartIndex)->atlasIndex;
-							XA_DEBUG_ASSERT(vertex.atlasIndex >= 0);
-							vertex.chartIndex = (int32_t)chartIndex;
-							const internal::Vector2 &uv = mesh->texcoord(v);
-							vertex.uv[0] = internal::max(0.0f, uv.x);
-							vertex.uv[1] = internal::max(0.0f, uv.y);
-							vertex.xref = chartGroup->mapVertexToSourceVertex(chart->mapChartVertexToOriginalVertex(v));
-						}
-						// Indices.
-						for (uint32_t f = 0; f < mesh->faceCount(); f++) {
-							const uint32_t indexOffset = chartGroup->mapFaceToSourceFace(chart->mapFaceToSourceFace(f)) * 3;
-							for (uint32_t j = 0; j < 3; j++)
-								outputMesh.indexArray[indexOffset + j] = firstVertex + mesh->vertexAt(f * 3 + j);
-						}
-						// Charts.
-						Chart *outputChart = &outputMesh.chartArray[meshChartIndex];
-						const int32_t atlasIndex = packAtlas.getChart(chartIndex)->atlasIndex;
-						XA_DEBUG_ASSERT(atlasIndex >= 0);
-						outputChart->atlasIndex = (uint32_t)atlasIndex;
-						outputChart->type = chart->type();
-						outputChart->faceCount = mesh->faceCount();
-						outputChart->faceArray = XA_ALLOC_ARRAY(internal::MemTag::Default, uint32_t, outputChart->faceCount);
-						for (uint32_t f = 0; f < outputChart->faceCount; f++)
-							outputChart->faceArray[f] = chartGroup->mapFaceToSourceFace(chart->mapFaceToSourceFace(f));
-						outputChart->material = 0;
-						meshChartIndex++;
-						chartIndex++;
-						firstVertex += mesh->vertexCount();
-					}
 				}
 			}
 			XA_DEBUG_ASSERT(outputMesh.vertexCount == firstVertex);
@@ -9078,10 +10096,14 @@ void PackCharts(Atlas *atlas, PackOptions packOptions) {
 		ctx->progressFunc(ProgressCategory::BuildOutputMeshes, 100, ctx->progressUserData);
 	XA_PROFILE_END(buildOutputMeshes)
 	XA_PROFILE_PRINT_AND_RESET("   Total: ", buildOutputMeshes)
+#if XA_PROFILE_ALLOC
+	XA_PROFILE_PRINT_AND_RESET("   Alloc: ", alloc)
+#endif
 	XA_PRINT_MEM_USAGE
 }
 
-void Generate(Atlas *atlas, ChartOptions chartOptions, ParameterizeFunc paramFunc, PackOptions packOptions) {
+void Generate(Atlas *atlas, ChartOptions chartOptions, ParameterizeOptions parameterizeOptions, PackOptions packOptions)
+{
 	if (!atlas) {
 		XA_PRINT_WARNING("Generate: atlas is null.\n");
 		return;
@@ -9091,16 +10113,17 @@ void Generate(Atlas *atlas, ChartOptions chartOptions, ParameterizeFunc paramFun
 		XA_PRINT_WARNING("Generate: This function should not be called with UV meshes.\n");
 		return;
 	}
-	if (ctx->meshCount == 0) {
+	if (ctx->meshes.isEmpty()) {
 		XA_PRINT_WARNING("Generate: No meshes. Call AddMesh first.\n");
 		return;
 	}
 	ComputeCharts(atlas, chartOptions);
-	ParameterizeCharts(atlas, paramFunc);
+	ParameterizeCharts(atlas, parameterizeOptions);
 	PackCharts(atlas, packOptions);
 }
 
-void SetProgressCallback(Atlas *atlas, ProgressFunc progressFunc, void *progressUserData) {
+void SetProgressCallback(Atlas *atlas, ProgressFunc progressFunc, void *progressUserData)
+{
 	if (!atlas) {
 		XA_PRINT_WARNING("SetProgressCallback: atlas is null.\n");
 		return;
@@ -9110,17 +10133,20 @@ void SetProgressCallback(Atlas *atlas, ProgressFunc progressFunc, void *progress
 	ctx->progressUserData = progressUserData;
 }
 
-void SetAlloc(ReallocFunc reallocFunc, FreeFunc freeFunc) {
+void SetAlloc(ReallocFunc reallocFunc, FreeFunc freeFunc) // Stores the caller-supplied realloc/free hooks in the library-wide internal pointers.
+{ // Both arguments are assigned as given; this function performs no null check or validation.
 	internal::s_realloc = reallocFunc;
 	internal::s_free = freeFunc;
 }
 
-void SetPrint(PrintFunc print, bool verbose) {
+void SetPrint(PrintFunc print, bool verbose) // Stores the print callback and verbosity flag consulted by the XA_PRINT* macros.
+{ // XA_PRINT requires both s_print and s_printVerbose to be set; XA_PRINT_WARNING tests s_print.
 	internal::s_print = print;
 	internal::s_printVerbose = verbose;
 }
 
-const char *StringForEnum(AddMeshError::Enum error) {
+const char *StringForEnum(AddMeshError::Enum error)
+{
 	if (error == AddMeshError::Error)
 		return "Unspecified error";
 	if (error == AddMeshError::IndexOutOfRange)
@@ -9130,7 +10156,8 @@ const char *StringForEnum(AddMeshError::Enum error) {
 	return "Success";
 }
 
-const char *StringForEnum(ProgressCategory::Enum category) {
+const char *StringForEnum(ProgressCategory::Enum category)
+{
 	if (category == ProgressCategory::AddMesh)
 		return "Adding mesh(es)";
 	if (category == ProgressCategory::ComputeCharts)