summaryrefslogtreecommitdiff
path: root/thirdparty
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty')
-rw-r--r--thirdparty/README.md10
-rw-r--r--thirdparty/meshoptimizer/LICENSE.md21
-rw-r--r--thirdparty/meshoptimizer/allocator.cpp8
-rw-r--r--thirdparty/meshoptimizer/clusterizer.cpp351
-rw-r--r--thirdparty/meshoptimizer/indexcodec.cpp752
-rw-r--r--thirdparty/meshoptimizer/indexgenerator.cpp347
-rw-r--r--thirdparty/meshoptimizer/meshoptimizer.h948
-rw-r--r--thirdparty/meshoptimizer/overdrawanalyzer.cpp230
-rw-r--r--thirdparty/meshoptimizer/overdrawoptimizer.cpp333
-rw-r--r--thirdparty/meshoptimizer/simplifier.cpp1529
-rw-r--r--thirdparty/meshoptimizer/spatialorder.cpp194
-rw-r--r--thirdparty/meshoptimizer/stripifier.cpp295
-rw-r--r--thirdparty/meshoptimizer/vcacheanalyzer.cpp73
-rw-r--r--thirdparty/meshoptimizer/vcacheoptimizer.cpp473
-rw-r--r--thirdparty/meshoptimizer/vertexcodec.cpp1265
-rw-r--r--thirdparty/meshoptimizer/vertexfilter.cpp825
-rw-r--r--thirdparty/meshoptimizer/vfetchanalyzer.cpp58
-rw-r--r--thirdparty/meshoptimizer/vfetchoptimizer.cpp74
-rw-r--r--thirdparty/minimp3/LICENSE117
-rw-r--r--thirdparty/minimp3/minimp3.h1855
-rw-r--r--thirdparty/minimp3/minimp3_ex.h1394
-rw-r--r--thirdparty/misc/easing_equations.cpp35
22 files changed, 11175 insertions, 12 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index b2707e7f7c..3962a83597 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -357,6 +357,16 @@ File extracted from upstream release tarball:
- Added 2 files `godot_core_mbedtls_platform.{c,h}` providing configuration
for light bundling with core.
+## meshoptimizer
+
+- Upstream: https://github.com/zeux/meshoptimizer
+- Version: 0.15(2020)
+- License: MIT
+
+File extracted from upstream release tarball:
+
+- Files in src/ go to thirdparty/meshoptimizer
+
## miniupnpc
diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md
new file mode 100644
index 0000000000..4fcd766d22
--- /dev/null
+++ b/thirdparty/meshoptimizer/LICENSE.md
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2016-2020 Arseny Kapoulkine
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/thirdparty/meshoptimizer/allocator.cpp b/thirdparty/meshoptimizer/allocator.cpp
new file mode 100644
index 0000000000..da7cc540b2
--- /dev/null
+++ b/thirdparty/meshoptimizer/allocator.cpp
@@ -0,0 +1,8 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*))
+{
+ meshopt_Allocator::Storage::allocate = allocate;
+ meshopt_Allocator::Storage::deallocate = deallocate;
+}
diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp
new file mode 100644
index 0000000000..f7d88c5136
--- /dev/null
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@@ -0,0 +1,351 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016
+// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016
+// Jack Ritter. An Efficient Bounding Sphere. 1990
+namespace meshopt
+{
+
+static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
+{
+ assert(count > 0);
+
+ // find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates
+ size_t pmin[3] = {0, 0, 0};
+ size_t pmax[3] = {0, 0, 0};
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ const float* p = points[i];
+
+ for (int axis = 0; axis < 3; ++axis)
+ {
+ pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis];
+ pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? i : pmax[axis];
+ }
+ }
+
+ // find the pair of points with largest distance
+ float paxisd2 = 0;
+ int paxis = 0;
+
+ for (int axis = 0; axis < 3; ++axis)
+ {
+ const float* p1 = points[pmin[axis]];
+ const float* p2 = points[pmax[axis]];
+
+ float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]);
+
+ if (d2 > paxisd2)
+ {
+ paxisd2 = d2;
+ paxis = axis;
+ }
+ }
+
+ // use the longest segment as the initial sphere diameter
+ const float* p1 = points[pmin[paxis]];
+ const float* p2 = points[pmax[paxis]];
+
+ float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2};
+ float radius = sqrtf(paxisd2) / 2;
+
+ // iteratively adjust the sphere up until all points fit
+ for (size_t i = 0; i < count; ++i)
+ {
+ const float* p = points[i];
+ float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]);
+
+ if (d2 > radius * radius)
+ {
+ float d = sqrtf(d2);
+ assert(d > 0);
+
+ float k = 0.5f + (radius / d) / 2;
+
+ center[0] = center[0] * k + p[0] * (1 - k);
+ center[1] = center[1] * k + p[1] * (1 - k);
+ center[2] = center[2] * k + p[2] * (1 - k);
+ radius = (radius + d) / 2;
+ }
+ }
+
+ result[0] = center[0];
+ result[1] = center[1];
+ result[2] = center[2];
+ result[3] = radius;
+}
+
+} // namespace meshopt
+
+size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
+{
+ assert(index_count % 3 == 0);
+ assert(max_vertices >= 3);
+ assert(max_triangles >= 1);
+
+ // meshlet construction is limited by max vertices and max triangles per meshlet
+ // the worst case is that the input is an unindexed stream since this equally stresses both limits
+ // note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle
+ size_t max_vertices_conservative = max_vertices - 2;
+ size_t meshlet_limit_vertices = (index_count + max_vertices_conservative - 1) / max_vertices_conservative;
+ size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles;
+
+ return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
+}
+
+size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+ assert(index_count % 3 == 0);
+ assert(max_vertices >= 3);
+ assert(max_triangles >= 1);
+
+ meshopt_Allocator allocator;
+
+ meshopt_Meshlet meshlet;
+ memset(&meshlet, 0, sizeof(meshlet));
+
+ assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0]));
+ assert(max_triangles <= sizeof(meshlet.indices) / 3);
+
+ // index of the vertex in the meshlet, 0xff if the vertex isn't used
+ unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+ memset(used, -1, vertex_count);
+
+ size_t offset = 0;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ unsigned char& av = used[a];
+ unsigned char& bv = used[b];
+ unsigned char& cv = used[c];
+
+ unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+
+ if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+ {
+ destination[offset++] = meshlet;
+
+ for (size_t j = 0; j < meshlet.vertex_count; ++j)
+ used[meshlet.vertices[j]] = 0xff;
+
+ memset(&meshlet, 0, sizeof(meshlet));
+ }
+
+ if (av == 0xff)
+ {
+ av = meshlet.vertex_count;
+ meshlet.vertices[meshlet.vertex_count++] = a;
+ }
+
+ if (bv == 0xff)
+ {
+ bv = meshlet.vertex_count;
+ meshlet.vertices[meshlet.vertex_count++] = b;
+ }
+
+ if (cv == 0xff)
+ {
+ cv = meshlet.vertex_count;
+ meshlet.vertices[meshlet.vertex_count++] = c;
+ }
+
+ meshlet.indices[meshlet.triangle_count][0] = av;
+ meshlet.indices[meshlet.triangle_count][1] = bv;
+ meshlet.indices[meshlet.triangle_count][2] = cv;
+ meshlet.triangle_count++;
+ }
+
+ if (meshlet.triangle_count)
+ destination[offset++] = meshlet;
+
+ assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+
+ return offset;
+}
+
+meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ assert(index_count / 3 <= 256);
+
+ (void)vertex_count;
+
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ // compute triangle normals and gather triangle corners
+ float normals[256][3];
+ float corners[256][3][3];
+ size_t triangles = 0;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ const float* p0 = vertex_positions + vertex_stride_float * a;
+ const float* p1 = vertex_positions + vertex_stride_float * b;
+ const float* p2 = vertex_positions + vertex_stride_float * c;
+
+ float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+ float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+ float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+ float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+ float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+ float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
+
+ // no need to include degenerate triangles - they will be invisible anyway
+ if (area == 0.f)
+ continue;
+
+ // record triangle normals & corners for future use; normal and corner 0 define a plane equation
+ normals[triangles][0] = normalx / area;
+ normals[triangles][1] = normaly / area;
+ normals[triangles][2] = normalz / area;
+ memcpy(corners[triangles][0], p0, 3 * sizeof(float));
+ memcpy(corners[triangles][1], p1, 3 * sizeof(float));
+ memcpy(corners[triangles][2], p2, 3 * sizeof(float));
+ triangles++;
+ }
+
+ meshopt_Bounds bounds = {};
+
+ // degenerate cluster, no valid triangles => trivial reject (cone data is 0)
+ if (triangles == 0)
+ return bounds;
+
+ // compute cluster bounding sphere; we'll use the center to determine normal cone apex as well
+ float psphere[4] = {};
+ computeBoundingSphere(psphere, corners[0], triangles * 3);
+
+ float center[3] = {psphere[0], psphere[1], psphere[2]};
+
+ // treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis
+ float nsphere[4] = {};
+ computeBoundingSphere(nsphere, normals, triangles);
+
+ float axis[3] = {nsphere[0], nsphere[1], nsphere[2]};
+ float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]);
+ float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength;
+
+ axis[0] *= invaxislength;
+ axis[1] *= invaxislength;
+ axis[2] *= invaxislength;
+
+ // compute a tight cone around all normals, mindp = cos(angle/2)
+ float mindp = 1.f;
+
+ for (size_t i = 0; i < triangles; ++i)
+ {
+ float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2];
+
+ mindp = (dp < mindp) ? dp : mindp;
+ }
+
+ // fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones
+ bounds.center[0] = center[0];
+ bounds.center[1] = center[1];
+ bounds.center[2] = center[2];
+ bounds.radius = psphere[3];
+
+ // degenerate cluster, normal cone is larger than a hemisphere => trivial accept
+ // note that if mindp is positive but close to 0, the triangle intersection code below gets less stable
+ // we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful
+ if (mindp <= 0.1f)
+ {
+ bounds.cone_cutoff = 1;
+ bounds.cone_cutoff_s8 = 127;
+ return bounds;
+ }
+
+ float maxt = 0;
+
+ // we need to find the point on center-t*axis ray that lies in negative half-space of all triangles
+ for (size_t i = 0; i < triangles; ++i)
+ {
+ // dot(center-t*axis-corner, trinormal) = 0
+ // dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0
+ float cx = center[0] - corners[i][0][0];
+ float cy = center[1] - corners[i][0][1];
+ float cz = center[2] - corners[i][0][2];
+
+ float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2];
+ float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2];
+
+ // dn should be larger than mindp cutoff above
+ assert(dn > 0.f);
+ float t = dc / dn;
+
+ maxt = (t > maxt) ? t : maxt;
+ }
+
+ // cone apex should be in the negative half-space of all cluster triangles by construction
+ bounds.cone_apex[0] = center[0] - axis[0] * maxt;
+ bounds.cone_apex[1] = center[1] - axis[1] * maxt;
+ bounds.cone_apex[2] = center[2] - axis[2] * maxt;
+
+ // note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis
+ bounds.cone_axis[0] = axis[0];
+ bounds.cone_axis[1] = axis[1];
+ bounds.cone_axis[2] = axis[2];
+
+ // cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone
+ // which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a))
+ bounds.cone_cutoff = sqrtf(1 - mindp * mindp);
+
+ // quantize axis & cutoff to 8-bit SNORM format
+ bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8));
+ bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8));
+ bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8));
+
+ // for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error
+ float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]);
+ float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]);
+ float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]);
+
+ // note that we need to round this up instead of rounding to nearest, hence +1
+ int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1);
+
+ bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 127 : (signed char)(cone_cutoff_s8);
+
+ return bounds;
+}
+
+meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])];
+
+ for (size_t i = 0; i < meshlet->triangle_count; ++i)
+ {
+ unsigned int a = meshlet->vertices[meshlet->indices[i][0]];
+ unsigned int b = meshlet->vertices[meshlet->indices[i][1]];
+ unsigned int c = meshlet->vertices[meshlet->indices[i][2]];
+
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ indices[i * 3 + 0] = a;
+ indices[i * 3 + 1] = b;
+ indices[i * 3 + 2] = c;
+ }
+
+ return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
+}
diff --git a/thirdparty/meshoptimizer/indexcodec.cpp b/thirdparty/meshoptimizer/indexcodec.cpp
new file mode 100644
index 0000000000..eeb541e5be
--- /dev/null
+++ b/thirdparty/meshoptimizer/indexcodec.cpp
@@ -0,0 +1,752 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+// This work is based on:
+// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013
+// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 2014
+namespace meshopt
+{
+
+const unsigned char kIndexHeader = 0xe0;
+const unsigned char kSequenceHeader = 0xd0;
+
+static int gEncodeIndexVersion = 0;
+
+typedef unsigned int VertexFifo[16];
+typedef unsigned int EdgeFifo[16][2];
+
+static const unsigned int kTriangleIndexOrder[3][3] = {
+ {0, 1, 2},
+ {1, 2, 0},
+ {2, 0, 1},
+};
+
+static const unsigned char kCodeAuxEncodingTable[16] = {
+ 0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69,
+ 0, 0, // last two entries aren't used for encoding
+};
+
+static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next)
+{
+ (void)a;
+
+ return (b == next) ? 1 : (c == next) ? 2 : 0;
+}
+
+static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset)
+{
+ for (int i = 0; i < 16; ++i)
+ {
+ size_t index = (offset - 1 - i) & 15;
+
+ unsigned int e0 = fifo[index][0];
+ unsigned int e1 = fifo[index][1];
+
+ if (e0 == a && e1 == b)
+ return (i << 2) | 0;
+ if (e0 == b && e1 == c)
+ return (i << 2) | 1;
+ if (e0 == c && e1 == a)
+ return (i << 2) | 2;
+ }
+
+ return -1;
+}
+
+static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset)
+{
+ fifo[offset][0] = a;
+ fifo[offset][1] = b;
+ offset = (offset + 1) & 15;
+}
+
+static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset)
+{
+ for (int i = 0; i < 16; ++i)
+ {
+ size_t index = (offset - 1 - i) & 15;
+
+ if (fifo[index] == v)
+ return i;
+ }
+
+ return -1;
+}
+
+static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1)
+{
+ fifo[offset] = v;
+ offset = (offset + cond) & 15;
+}
+
+static void encodeVByte(unsigned char*& data, unsigned int v)
+{
+ // encode 32-bit value in up to 5 7-bit groups
+ do
+ {
+ *data++ = (v & 127) | (v > 127 ? 128 : 0);
+ v >>= 7;
+ } while (v);
+}
+
+static unsigned int decodeVByte(const unsigned char*& data)
+{
+ unsigned char lead = *data++;
+
+ // fast path: single byte
+ if (lead < 128)
+ return lead;
+
+ // slow path: up to 4 extra bytes
+ // note that this loop always terminates, which is important for malformed data
+ unsigned int result = lead & 127;
+ unsigned int shift = 7;
+
+ for (int i = 0; i < 4; ++i)
+ {
+ unsigned char group = *data++;
+ result |= (group & 127) << shift;
+ shift += 7;
+
+ if (group < 128)
+ break;
+ }
+
+ return result;
+}
+
+static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last)
+{
+ unsigned int d = index - last;
+ unsigned int v = (d << 1) ^ (int(d) >> 31);
+
+ encodeVByte(data, v);
+}
+
+static unsigned int decodeIndex(const unsigned char*& data, unsigned int last)
+{
+ unsigned int v = decodeVByte(data);
+ unsigned int d = (v >> 1) ^ -int(v & 1);
+
+ return last + d;
+}
+
+static int getCodeAuxIndex(unsigned char v, const unsigned char* table)
+{
+ for (int i = 0; i < 16; ++i)
+ if (table[i] == v)
+ return i;
+
+ return -1;
+}
+
+static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c)
+{
+ if (index_size == 2)
+ {
+ static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a);
+ static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b);
+ static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c);
+ }
+ else
+ {
+ static_cast<unsigned int*>(destination)[offset + 0] = a;
+ static_cast<unsigned int*>(destination)[offset + 1] = b;
+ static_cast<unsigned int*>(destination)[offset + 2] = c;
+ }
+}
+
+#if TRACE
+static size_t sortTop16(unsigned char dest[16], size_t stats[256])
+{
+ size_t destsize = 0;
+
+ for (size_t i = 0; i < 256; ++i)
+ {
+ size_t j = 0;
+ for (; j < destsize; ++j)
+ {
+ if (stats[i] >= stats[dest[j]])
+ {
+ if (destsize < 16)
+ destsize++;
+
+ memmove(&dest[j + 1], &dest[j], destsize - 1 - j);
+ dest[j] = (unsigned char)i;
+ break;
+ }
+ }
+
+ if (j == destsize && destsize < 16)
+ {
+ dest[destsize] = (unsigned char)i;
+ destsize++;
+ }
+ }
+
+ return destsize;
+}
+#endif
+
+} // namespace meshopt
+
+size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+
+#if TRACE
+ size_t codestats[256] = {};
+ size_t codeauxstats[256] = {};
+#endif
+
+ // the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+ if (buffer_size < 1 + index_count / 3 + 16)
+ return 0;
+
+ int version = gEncodeIndexVersion;
+
+ buffer[0] = (unsigned char)(kIndexHeader | version);
+
+ EdgeFifo edgefifo;
+ memset(edgefifo, -1, sizeof(edgefifo));
+
+ VertexFifo vertexfifo;
+ memset(vertexfifo, -1, sizeof(vertexfifo));
+
+ size_t edgefifooffset = 0;
+ size_t vertexfifooffset = 0;
+
+ unsigned int next = 0;
+ unsigned int last = 0;
+
+ unsigned char* code = buffer + 1;
+ unsigned char* data = code + index_count / 3;
+ unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+ int fecmax = version >= 1 ? 13 : 15;
+
+ // use static encoding table; it's possible to pack the result and then build an optimal table and repack
+ // for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set
+ const unsigned char* codeaux_table = kCodeAuxEncodingTable;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ // make sure we have enough space to write a triangle
+ // each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index
+ // after this we can be sure we can write without extra bounds checks
+ if (data > data_safe_end)
+ return 0;
+
+ int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset);
+
+ if (fer >= 0 && (fer >> 2) < 15)
+ {
+ const unsigned int* order = kTriangleIndexOrder[fer & 3];
+
+ unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+ // encode edge index and vertex fifo index, next or free index
+ int fe = fer >> 2;
+ int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+ int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next) ? (next++, 0) : 15;
+
+ if (fec == 15 && version >= 1)
+ {
+ // encode last-1 and last+1 to optimize strip-like sequences
+ if (c + 1 == last)
+ fec = 13, last = c;
+ if (c == last + 1)
+ fec = 14, last = c;
+ }
+
+ *code++ = (unsigned char)((fe << 4) | fec);
+
+#if TRACE
+ codestats[code[-1]]++;
+#endif
+
+ // note that we need to update the last index since free indices are delta-encoded
+ if (fec == 15)
+ encodeIndex(data, c, last), last = c;
+
+ // we only need to push third vertex since first two are likely already in the vertex fifo
+ if (fec == 0 || fec >= fecmax)
+ pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+ // we only need to push two new edges to edge fifo since the third one is already there
+ pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+ pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+ }
+ else
+ {
+ int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next);
+ const unsigned int* order = kTriangleIndexOrder[rotation];
+
+ unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]];
+
+ // if a/b/c are 0/1/2, we emit a reset code
+ bool reset = false;
+
+ if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1)
+ {
+ reset = true;
+ next = 0;
+
+ // reset vertex fifo to make sure we don't accidentally reference vertices from that in the future
+ // this makes sure next continues to get incremented instead of being stuck
+ memset(vertexfifo, -1, sizeof(vertexfifo));
+ }
+
+ int fb = getVertexFifo(vertexfifo, b, vertexfifooffset);
+ int fc = getVertexFifo(vertexfifo, c, vertexfifooffset);
+
+ // after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a
+ int fea = (a == next) ? (next++, 0) : 15;
+ int feb = (fb >= 0 && fb < 14) ? (fb + 1) : (b == next) ? (next++, 0) : 15;
+ int fec = (fc >= 0 && fc < 14) ? (fc + 1) : (c == next) ? (next++, 0) : 15;
+
+ // we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise
+ unsigned char codeaux = (unsigned char)((feb << 4) | fec);
+ int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table);
+
+ // <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15
+ if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset)
+ {
+ *code++ = (unsigned char)((15 << 4) | codeauxindex);
+ }
+ else
+ {
+ *code++ = (unsigned char)((15 << 4) | 14 | fea);
+ *data++ = codeaux;
+ }
+
+#if TRACE
+ codestats[code[-1]]++;
+ codeauxstats[codeaux]++;
+#endif
+
+ // note that we need to update the last index since free indices are delta-encoded
+ if (fea == 15)
+ encodeIndex(data, a, last), last = a;
+
+ if (feb == 15)
+ encodeIndex(data, b, last), last = b;
+
+ if (fec == 15)
+ encodeIndex(data, c, last), last = c;
+
+ // only push vertices that weren't already in fifo
+ if (fea == 0 || fea == 15)
+ pushVertexFifo(vertexfifo, a, vertexfifooffset);
+
+ if (feb == 0 || feb == 15)
+ pushVertexFifo(vertexfifo, b, vertexfifooffset);
+
+ if (fec == 0 || fec == 15)
+ pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+ // all three edges aren't in the fifo; pushing all of them is important so that we can match them for later triangles
+ pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+ pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+ pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+ }
+ }
+
+ // make sure we have enough space to write codeaux table
+ if (data > data_safe_end)
+ return 0;
+
+ // add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding
+ // we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data
+ // this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input
+ for (size_t i = 0; i < 16; ++i)
+ {
+ // decoder assumes that table entries never refer to separately encoded indices
+ assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf);
+
+ *data++ = codeaux_table[i];
+ }
+
+ // since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference
+ assert(codeaux_table[0] == 0);
+
+ assert(data >= buffer + index_count / 3 + 16);
+ assert(data <= buffer + buffer_size);
+
+#if TRACE
+ unsigned char codetop[16], codeauxtop[16];
+ size_t codetopsize = sortTop16(codetop, codestats);
+ size_t codeauxtopsize = sortTop16(codeauxtop, codeauxstats);
+
+ size_t sumcode = 0, sumcodeaux = 0;
+ for (size_t i = 0; i < 256; ++i)
+ sumcode += codestats[i], sumcodeaux += codeauxstats[i];
+
+ size_t acccode = 0, acccodeaux = 0;
+
+ printf("code\t\t\t\t\tcodeaux\n");
+
+ for (size_t i = 0; i < codetopsize && i < codeauxtopsize; ++i)
+ {
+ acccode += codestats[codetop[i]];
+ acccodeaux += codeauxstats[codeauxtop[i]];
+
+ printf("%2d: %02x = %d (%.1f%% ..%.1f%%)\t\t%2d: %02x = %d (%.1f%% ..%.1f%%)\n",
+ int(i), codetop[i], int(codestats[codetop[i]]), double(codestats[codetop[i]]) / double(sumcode) * 100, double(acccode) / double(sumcode) * 100,
+ int(i), codeauxtop[i], int(codeauxstats[codeauxtop[i]]), double(codeauxstats[codeauxtop[i]]) / double(sumcodeaux) * 100, double(acccodeaux) / double(sumcodeaux) * 100);
+ }
+#endif
+
+ return data - buffer;
+}
+
+size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count)
+{
+ assert(index_count % 3 == 0);
+
+ // compute number of bits required for each index
+ unsigned int vertex_bits = 1;
+
+ while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
+ vertex_bits++;
+
+ // worst-case encoding is 2 header bytes + 3 varint-7 encoded index deltas
+ unsigned int vertex_groups = (vertex_bits + 1 + 6) / 7;
+
+ return 1 + (index_count / 3) * (2 + 3 * vertex_groups) + 16;
+}
+
+void meshopt_encodeIndexVersion(int version)
+{
+ assert(unsigned(version) <= 1);
+
+ meshopt::gEncodeIndexVersion = version;
+}
+
+int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(index_size == 2 || index_size == 4);
+
+ // the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table
+ if (buffer_size < 1 + index_count / 3 + 16)
+ return -2;
+
+ if ((buffer[0] & 0xf0) != kIndexHeader)
+ return -1;
+
+ int version = buffer[0] & 0x0f;
+ if (version > 1)
+ return -1;
+
+ EdgeFifo edgefifo;
+ memset(edgefifo, -1, sizeof(edgefifo));
+
+ VertexFifo vertexfifo;
+ memset(vertexfifo, -1, sizeof(vertexfifo));
+
+ size_t edgefifooffset = 0;
+ size_t vertexfifooffset = 0;
+
+ unsigned int next = 0;
+ unsigned int last = 0;
+
+ int fecmax = version >= 1 ? 13 : 15;
+
+ // since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end
+ const unsigned char* code = buffer + 1;
+ const unsigned char* data = code + index_count / 3;
+ const unsigned char* data_safe_end = buffer + buffer_size - 16;
+
+ const unsigned char* codeaux_table = data_safe_end;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ // make sure we have enough data to read for a triangle
+ // each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index
+ // after this we can be sure we can read without extra bounds checks
+ if (data > data_safe_end)
+ return -2;
+
+ unsigned char codetri = *code++;
+
+ if (codetri < 0xf0)
+ {
+ int fe = codetri >> 4;
+
+ // fifo reads are wrapped around 16 entry buffer
+ unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0];
+ unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1];
+
+ int fec = codetri & 15;
+
+ // note: this is the most common path in the entire decoder
+ // inside this if we try to stay branchless (by using cmov/etc.) since these aren't predictable
+ if (fec < fecmax)
+ {
+ // fifo reads are wrapped around 16 entry buffer
+ unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15];
+ unsigned int c = (fec == 0) ? next : cf;
+
+ int fec0 = fec == 0;
+ next += fec0;
+
+ // output triangle
+ writeTriangle(destination, i, index_size, a, b, c);
+
+ // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+ pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+ pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+ pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+ }
+ else
+ {
+ unsigned int c = 0;
+
+ // fec - (fec ^ 3) decodes 13, 14 into -1, 1
+ // note that we need to update the last index since free indices are delta-encoded
+ last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last);
+
+ // output triangle
+ writeTriangle(destination, i, index_size, a, b, c);
+
+ // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+ pushVertexFifo(vertexfifo, c, vertexfifooffset);
+
+ pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+ pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+ }
+ }
+ else
+ {
+ // fast path: read codeaux from the table
+ if (codetri < 0xfe)
+ {
+ unsigned char codeaux = codeaux_table[codetri & 15];
+
+ // note: table can't contain feb/fec=15
+ int feb = codeaux >> 4;
+ int fec = codeaux & 15;
+
+ // fifo reads are wrapped around 16 entry buffer
+ // also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+ unsigned int a = next++;
+
+ unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15];
+ unsigned int b = (feb == 0) ? next : bf;
+
+ int feb0 = feb == 0;
+ next += feb0;
+
+ unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15];
+ unsigned int c = (fec == 0) ? next : cf;
+
+ int fec0 = fec == 0;
+ next += fec0;
+
+ // output triangle
+ writeTriangle(destination, i, index_size, a, b, c);
+
+ // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+ pushVertexFifo(vertexfifo, a, vertexfifooffset);
+ pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0);
+ pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0);
+
+ pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+ pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+ pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+ }
+ else
+ {
+ // slow path: read a full byte for codeaux instead of using a table lookup
+ unsigned char codeaux = *data++;
+
+ int fea = codetri == 0xfe ? 0 : 15;
+ int feb = codeaux >> 4;
+ int fec = codeaux & 15;
+
+ // reset: codeaux is 0 but encoded as not-a-table
+ if (codeaux == 0)
+ next = 0;
+
+ // fifo reads are wrapped around 16 entry buffer
+ // also note that we increment next for all three vertices before decoding indices - this matches encoder behavior
+ unsigned int a = (fea == 0) ? next++ : 0;
+ unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15];
+ unsigned int c = (fec == 0) ? next++ : vertexfifo[(vertexfifooffset - fec) & 15];
+
+ // note that we need to update the last index since free indices are delta-encoded
+ if (fea == 15)
+ last = a = decodeIndex(data, last);
+
+ if (feb == 15)
+ last = b = decodeIndex(data, last);
+
+ if (fec == 15)
+ last = c = decodeIndex(data, last);
+
+ // output triangle
+ writeTriangle(destination, i, index_size, a, b, c);
+
+ // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly
+ pushVertexFifo(vertexfifo, a, vertexfifooffset);
+ pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15));
+ pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15));
+
+ pushEdgeFifo(edgefifo, b, a, edgefifooffset);
+ pushEdgeFifo(edgefifo, c, b, edgefifooffset);
+ pushEdgeFifo(edgefifo, a, c, edgefifooffset);
+ }
+ }
+ }
+
+ // we should've read all data bytes and stopped at the boundary between data and codeaux table
+ if (data != data_safe_end)
+ return -3;
+
+ return 0;
+}
+
+size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count)
+{
+ using namespace meshopt;
+
+ // the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+ if (buffer_size < 1 + index_count + 4)
+ return 0;
+
+ int version = gEncodeIndexVersion;
+
+ buffer[0] = (unsigned char)(kSequenceHeader | version);
+
+ unsigned int last[2] = {};
+ unsigned int current = 0;
+
+ unsigned char* data = buffer + 1;
+ unsigned char* data_safe_end = buffer + buffer_size - 4;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ // make sure we have enough data to write
+ // each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+ // after this we can be sure we can write without extra bounds checks
+ if (data >= data_safe_end)
+ return 0;
+
+ unsigned int index = indices[i];
+
+ // this is a heuristic that switches between baselines when the delta grows too large
+ // we want the encoded delta to fit into one byte (7 bits), but 2 bits are used for sign and baseline index
+ // for now we immediately switch the baseline when delta grows too large - this can be adjusted arbitrarily
+ int cd = int(index - last[current]);
+ current ^= ((cd < 0 ? -cd : cd) >= 30);
+
+ // encode delta from the last index
+ unsigned int d = index - last[current];
+ unsigned int v = (d << 1) ^ (int(d) >> 31);
+
+ // note: low bit encodes the index of the last baseline which will be used for reconstruction
+ encodeVByte(data, (v << 1) | current);
+
+ // update last for the next iteration that uses it
+ last[current] = index;
+ }
+
+ // make sure we have enough space to write tail
+ if (data > data_safe_end)
+ return 0;
+
+ for (int k = 0; k < 4; ++k)
+ *data++ = 0;
+
+ return data - buffer;
+}
+
+size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
+{
+ // compute number of bits required for each index
+ unsigned int vertex_bits = 1;
+
+ while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits)
+ vertex_bits++;
+
+ // worst-case encoding is 1 varint-7 encoded index delta for a K bit value and an extra bit
+ unsigned int vertex_groups = (vertex_bits + 1 + 1 + 6) / 7;
+
+ return 1 + index_count * vertex_groups + 4;
+}
+
+int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size)
+{
+ using namespace meshopt;
+
+ // the minimum valid encoding is header, 1 byte per index and a 4-byte tail
+ if (buffer_size < 1 + index_count + 4)
+ return -2;
+
+ if ((buffer[0] & 0xf0) != kSequenceHeader)
+ return -1;
+
+ int version = buffer[0] & 0x0f;
+ if (version > 1)
+ return -1;
+
+ const unsigned char* data = buffer + 1;
+ const unsigned char* data_safe_end = buffer + buffer_size - 4;
+
+ unsigned int last[2] = {};
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ // make sure we have enough data to read
+ // each index reads at most 5 bytes of data; there's a 4 byte tail after data_safe_end
+ // after this we can be sure we can read without extra bounds checks
+ if (data >= data_safe_end)
+ return -2;
+
+ unsigned int v = decodeVByte(data);
+
+ // decode the index of the last baseline
+ unsigned int current = v & 1;
+ v >>= 1;
+
+ // reconstruct index as a delta
+ unsigned int d = (v >> 1) ^ -int(v & 1);
+ unsigned int index = last[current] + d;
+
+ // update last for the next iteration that uses it
+ last[current] = index;
+
+ if (index_size == 2)
+ {
+ static_cast<unsigned short*>(destination)[i] = (unsigned short)(index);
+ }
+ else
+ {
+ static_cast<unsigned int*>(destination)[i] = index;
+ }
+ }
+
+ // we should've read all data bytes and stopped at the boundary between data and tail
+ if (data != data_safe_end)
+ return -3;
+
+ return 0;
+}
diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp
new file mode 100644
index 0000000000..aa4a30efa4
--- /dev/null
+++ b/thirdparty/meshoptimizer/indexgenerator.cpp
@@ -0,0 +1,347 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+namespace meshopt
+{
+
+static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
+{
+ // MurmurHash2
+ const unsigned int m = 0x5bd1e995;
+ const int r = 24;
+
+ while (len >= 4)
+ {
+ unsigned int k = *reinterpret_cast<const unsigned int*>(key);
+
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h *= m;
+ h ^= k;
+
+ key += 4;
+ len -= 4;
+ }
+
+ return h;
+}
+
+struct VertexHasher
+{
+ const unsigned char* vertices;
+ size_t vertex_size;
+ size_t vertex_stride;
+
+ size_t hash(unsigned int index) const
+ {
+ return hashUpdate4(0, vertices + index * vertex_stride, vertex_size);
+ }
+
+ bool equal(unsigned int lhs, unsigned int rhs) const
+ {
+ return memcmp(vertices + lhs * vertex_stride, vertices + rhs * vertex_stride, vertex_size) == 0;
+ }
+};
+
+struct VertexStreamHasher
+{
+ const meshopt_Stream* streams;
+ size_t stream_count;
+
+ size_t hash(unsigned int index) const
+ {
+ unsigned int h = 0;
+
+ for (size_t i = 0; i < stream_count; ++i)
+ {
+ const meshopt_Stream& s = streams[i];
+ const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+ h = hashUpdate4(h, data + index * s.stride, s.size);
+ }
+
+ return h;
+ }
+
+ bool equal(unsigned int lhs, unsigned int rhs) const
+ {
+ for (size_t i = 0; i < stream_count; ++i)
+ {
+ const meshopt_Stream& s = streams[i];
+ const unsigned char* data = static_cast<const unsigned char*>(s.data);
+
+ if (memcmp(data + lhs * s.stride, data + rhs * s.stride, s.size) != 0)
+ return false;
+ }
+
+ return true;
+ }
+};
+
+static size_t hashBuckets(size_t count)
+{
+ size_t buckets = 1;
+ while (buckets < count)
+ buckets *= 2;
+
+ return buckets;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{
+ assert(buckets > 0);
+ assert((buckets & (buckets - 1)) == 0);
+
+ size_t hashmod = buckets - 1;
+ size_t bucket = hash.hash(key) & hashmod;
+
+ for (size_t probe = 0; probe <= hashmod; ++probe)
+ {
+ T& item = table[bucket];
+
+ if (item == empty)
+ return &item;
+
+ if (hash.equal(item, key))
+ return &item;
+
+ // hash collision, quadratic probing
+ bucket = (bucket + probe + 1) & hashmod;
+ }
+
+ assert(false && "Hash table is full"); // unreachable
+ return 0;
+}
+
+} // namespace meshopt
+
+size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+ using namespace meshopt;
+
+ assert(indices || index_count == vertex_count);
+ assert(index_count % 3 == 0);
+ assert(vertex_size > 0 && vertex_size <= 256);
+
+ meshopt_Allocator allocator;
+
+ memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+ VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_size};
+
+ size_t table_size = hashBuckets(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ unsigned int next_vertex = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices ? indices[i] : unsigned(i);
+ assert(index < vertex_count);
+
+ if (destination[index] == ~0u)
+ {
+ unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+ if (*entry == ~0u)
+ {
+ *entry = index;
+
+ destination[index] = next_vertex++;
+ }
+ else
+ {
+ assert(destination[*entry] != ~0u);
+
+ destination[index] = destination[*entry];
+ }
+ }
+ }
+
+ assert(next_vertex <= vertex_count);
+
+ return next_vertex;
+}
+
+size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{
+ using namespace meshopt;
+
+ assert(indices || index_count == vertex_count);
+ assert(index_count % 3 == 0);
+ assert(stream_count > 0 && stream_count <= 16);
+
+ for (size_t i = 0; i < stream_count; ++i)
+ {
+ assert(streams[i].size > 0 && streams[i].size <= 256);
+ assert(streams[i].size <= streams[i].stride);
+ }
+
+ meshopt_Allocator allocator;
+
+ memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+ VertexStreamHasher hasher = {streams, stream_count};
+
+ size_t table_size = hashBuckets(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ unsigned int next_vertex = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices ? indices[i] : unsigned(i);
+ assert(index < vertex_count);
+
+ if (destination[index] == ~0u)
+ {
+ unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+ if (*entry == ~0u)
+ {
+ *entry = index;
+
+ destination[index] = next_vertex++;
+ }
+ else
+ {
+ assert(destination[*entry] != ~0u);
+
+ destination[index] = destination[*entry];
+ }
+ }
+ }
+
+ assert(next_vertex <= vertex_count);
+
+ return next_vertex;
+}
+
+void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap)
+{
+ assert(vertex_size > 0 && vertex_size <= 256);
+
+ meshopt_Allocator allocator;
+
+ // support in-place remap
+ if (destination == vertices)
+ {
+ unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+ memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+ vertices = vertices_copy;
+ }
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (remap[i] != ~0u)
+ {
+ assert(remap[i] < vertex_count);
+
+ memcpy(static_cast<unsigned char*>(destination) + remap[i] * vertex_size, static_cast<const unsigned char*>(vertices) + i * vertex_size, vertex_size);
+ }
+ }
+}
+
+void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
+{
+ assert(index_count % 3 == 0);
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices ? indices[i] : unsigned(i);
+ assert(remap[index] != ~0u);
+
+ destination[i] = remap[index];
+ }
+}
+
+void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+ using namespace meshopt;
+
+ assert(indices);
+ assert(index_count % 3 == 0);
+ assert(vertex_size > 0 && vertex_size <= 256);
+ assert(vertex_size <= vertex_stride);
+
+ meshopt_Allocator allocator;
+
+ unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+ memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+ VertexHasher hasher = {static_cast<const unsigned char*>(vertices), vertex_size, vertex_stride};
+
+ size_t table_size = hashBuckets(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ if (remap[index] == ~0u)
+ {
+ unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+ if (*entry == ~0u)
+ *entry = index;
+
+ remap[index] = *entry;
+ }
+
+ destination[i] = remap[index];
+ }
+}
+
+void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count)
+{
+ using namespace meshopt;
+
+ assert(indices);
+ assert(index_count % 3 == 0);
+ assert(stream_count > 0 && stream_count <= 16);
+
+ for (size_t i = 0; i < stream_count; ++i)
+ {
+ assert(streams[i].size > 0 && streams[i].size <= 256);
+ assert(streams[i].size <= streams[i].stride);
+ }
+
+ meshopt_Allocator allocator;
+
+ unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+ memset(remap, -1, vertex_count * sizeof(unsigned int));
+
+ VertexStreamHasher hasher = {streams, stream_count};
+
+ size_t table_size = hashBuckets(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ if (remap[index] == ~0u)
+ {
+ unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u);
+
+ if (*entry == ~0u)
+ *entry = index;
+
+ remap[index] = *entry;
+ }
+
+ destination[i] = remap[index];
+ }
+}
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
new file mode 100644
index 0000000000..a442d103c8
--- /dev/null
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -0,0 +1,948 @@
+/**
+ * meshoptimizer - version 0.15
+ *
+ * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
+ *
+ * This library is distributed under the MIT License. See notice at the end of this file.
+ */
+#pragma once
+
+#include <assert.h>
+#include <stddef.h>
+
+/* Version macro; major * 1000 + minor * 10 + patch */
+#define MESHOPTIMIZER_VERSION 150 /* 0.15 */
+
+/* If no API is defined, assume default */
+#ifndef MESHOPTIMIZER_API
+#define MESHOPTIMIZER_API
+#endif
+
+/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */
+#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API
+
+/* C interface */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Vertex attribute stream, similar to glVertexPointer
+ * Each element takes size bytes, with stride controlling the spacing between successive elements.
+ */
+struct meshopt_Stream
+{
+ const void* data;
+ size_t size;
+ size_t stride;
+};
+
+/**
+ * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Generates a vertex remap table from multiple vertex streams and an optional index buffer and returns number of unique vertices
+ * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer.
+ * To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting vertex buffer (unique_vertex_count elements, returned by meshopt_generateVertexRemap)
+ * vertex_count should be the initial vertex count and not the value returned by meshopt_generateVertexRemap
+ */
+MESHOPTIMIZER_API void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap);
+
+/**
+ * Generate index buffer from the source index buffer and remap table generated by meshopt_generateVertexRemap
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices can be NULL if the input is unindexed
+ */
+MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+
+/**
+ * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary
+ * All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer.
+ * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering.
+ * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
+
+/**
+ * Vertex transform cache optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for strip-like caches
+ * Produces inferior results to meshopt_optimizeVertexCache from the GPU vertex cache perspective
+ * However, the resulting index order is more optimal if the goal is to reduce the triangle strip length or improve compression efficiency
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Vertex transform cache optimizer for FIFO caches
+ * Reorders indices to reduce the number of GPU vertex shader invocations
+ * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache
+ * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * cache_size should be less than the actual GPU cache size to avoid cache thrashing
+ */
+MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+
+/**
+ * Overdraw optimizer
+ * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw
+ * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently
+ */
+MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+
+/**
+ * Vertex fetch cache optimizer
+ * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * This functions works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count elements)
+ * indices is used both as an input and as an output index buffer
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Vertex fetch cache optimizer
+ * Generates vertex remap to reduce the amount of GPU memory fetches during vertex processing
+ * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused
+ * The resulting remap table should be used to reorder vertex/index buffers using meshopt_remapVertexBuffer/meshopt_remapIndexBuffer
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count);
+
+/**
+ * Index buffer encoder
+ * Encodes index data into an array of bytes that is generally much smaller (<1.5 bytes/triangle) and compresses better (<1 bytes/triangle) compared to original.
+ * Input index buffer must represent a triangle list.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * For maximum efficiency the index buffer being encoded has to be optimized for vertex cache and vertex fetch first.
+ *
+ * buffer must contain enough space for the encoded index buffer (use meshopt_encodeIndexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Experimental: Set index encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeIndexVersion(int version);
+
+/**
+ * Index buffer decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ */
+MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Experimental: Index sequence encoder
+ * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original.
+ * Input index sequence can represent arbitrary topology; for triangle lists meshopt_encodeIndexBuffer is likely to be better.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ *
+ * buffer must contain enough space for the encoded index sequence (use meshopt_encodeIndexSequenceBound to compute worst case size)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count);
+
+/**
+ * Index sequence decoder
+ * Decodes index data from an array of bytes generated by meshopt_encodeIndexSequence
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices).
+ *
+ * destination must contain enough space for the resulting index sequence (index_count elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer encoder
+ * Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original.
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space
+ * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream.
+ * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized.
+ *
+ * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size)
+ */
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size);
+
+/**
+ * Experimental: Set vertex encoder format version
+ * version must specify the data format version to encode; valid values are 0 (decodable by all library versions)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeVertexVersion(int version);
+
+/**
+ * Vertex buffer decoder
+ * Decodes vertex data from an array of bytes generated by meshopt_encodeVertexBuffer
+ * Returns 0 if decoding was successful, and an error code otherwise
+ * The decoder is safe to use for untrusted input, but it may produce garbage data.
+ *
+ * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes)
+ */
+MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size);
+
+/**
+ * Vertex buffer filters
+ * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place.
+ * count must be aligned by 4 and stride is fixed for each function to facilitate SIMD implementation.
+ *
+ * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f.
+ * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is.
+ *
+ * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct.
+ * Each component is stored as an 16-bit integer; stride must be equal to 8.
+ *
+ * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M.
+ * Each 32-bit component is decoded in isolation; stride must be divisible by 4.
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size);
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size);
+
+/**
+ * Experimental: Mesh simplifier
+ * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible
+ * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error.
+ * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+
+/**
+ * Experimental: Mesh simplifier (sloppy)
+ * Reduces the number of triangles in the mesh, sacrificing mesh apperance for simplification performance
+ * The algorithm doesn't preserve mesh topology but is always able to reach target triangle count.
+ * Returns the number of indices after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+
+/**
+ * Experimental: Point cloud simplifier
+ * Reduces the number of points in the cloud to reach the given target
+ * Returns the number of points after simplification, with destination containing new index data
+ * The resulting index buffer references vertices from the original vertex buffer.
+ * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended.
+ *
+ * destination must contain enough space for the target index buffer
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count);
+
+/**
+ * Mesh stripifier
+ * Converts a previously vertex cache optimized triangle list to triangle strip, stitching strips using restart index or degenerate triangles
+ * Returns the number of indices in the resulting strip, with destination containing new index data
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * Using restart indices can result in ~10% smaller index buffers, but on some GPUs restart indices may result in decreased performance.
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_stripifyBound
+ * restart_index should be 0xffff or 0xffffffff depending on index size, or 0 to use degenerate triangles
+ */
+MESHOPTIMIZER_API size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_stripifyBound(size_t index_count);
+
+/**
+ * Mesh unstripifier
+ * Converts a triangle strip to a triangle list
+ * Returns the number of indices in the resulting list, with destination containing new index data
+ *
+ * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_unstripifyBound
+ */
+MESHOPTIMIZER_API size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index);
+MESHOPTIMIZER_API size_t meshopt_unstripifyBound(size_t index_count);
+
+struct meshopt_VertexCacheStatistics
+{
+ unsigned int vertices_transformed;
+ unsigned int warps_executed;
+ float acmr; /* transformed vertices / triangle count; best case 0.5, worst case 3.0, optimum depends on topology */
+ float atvr; /* transformed vertices / vertex count; best case 1.0, worst case 6.0, optimum is 1.0 (each vertex is transformed once) */
+};
+
+/**
+ * Vertex transform cache analyzer
+ * Returns cache hit statistics using a simplified FIFO model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size);
+
+struct meshopt_OverdrawStatistics
+{
+ unsigned int pixels_covered;
+ unsigned int pixels_shaded;
+ float overdraw; /* shaded pixels / covered pixels; best case 1.0 */
+};
+
+/**
+ * Overdraw analyzer
+ * Returns overdraw statistics using a software rasterizer
+ * Results may not match actual GPU performance
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+struct meshopt_VertexFetchStatistics
+{
+ unsigned int bytes_fetched;
+ float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */
+};
+
+/**
+ * Vertex fetch cache analyzer
+ * Returns cache hit statistics using a simplified direct mapped model
+ * Results may not match actual GPU performance
+ */
+MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+
+struct meshopt_Meshlet
+{
+ unsigned int vertices[64];
+ unsigned char indices[126][3];
+ unsigned char triangle_count;
+ unsigned char vertex_count;
+};
+
+/**
+ * Experimental: Meshlet builder
+ * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
+ * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
+ * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ *
+ * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
+ * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126)
+ */
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
+
+struct meshopt_Bounds
+{
+ /* bounding sphere, useful for frustum and occlusion culling */
+ float center[3];
+ float radius;
+
+ /* normal cone, useful for backface culling */
+ float cone_apex[3];
+ float cone_axis[3];
+ float cone_cutoff; /* = cos(angle/2) */
+
+ /* normal cone axis and cutoff, stored in 8-bit SNORM format; decode using x/127.0 */
+ signed char cone_axis_s8[3];
+ signed char cone_cutoff_s8;
+};
+
+/**
+ * Experimental: Cluster bounds generator
+ * Creates bounding volumes that can be used for frustum, backface and occlusion culling.
+ *
+ * For backface culling with orthographic projection, use the following formula to reject backfacing clusters:
+ * dot(view, cone_axis) >= cone_cutoff
+ *
+ * For perspective projection, you can the formula that needs cone apex in addition to axis & cutoff:
+ * dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff
+ *
+ * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead:
+ * dot(normalize(center - camera_position), cone_axis) >= cone_cutoff + radius / length(center - camera_position)
+ * or an equivalent formula that doesn't have a singularity at center = camera_position:
+ * dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius
+ *
+ * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere
+ * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
+ *
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size)
+ */
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Spatial sorter
+ * Generates a remap table that can be used to reorder points for spatial locality.
+ * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer.
+ *
+ * destination must contain enough space for the resulting remap table (vertex_count elements)
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Experimental: Spatial sorter
+ * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Set allocation callbacks
+ * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library.
+ * Note that all algorithms only allocate memory for temporary use.
+ * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first.
+ */
+MESHOPTIMIZER_API void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*));
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+/* Quantization into commonly supported data formats */
+#ifdef __cplusplus
+/**
+ * Quantize a float in [0..1] range into an N-bit fixed point unorm value
+ * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion
+ * Maximum reconstruction error: 1/2^(N+1)
+ */
+inline int meshopt_quantizeUnorm(float v, int N);
+
+/**
+ * Quantize a float in [-1..1] range into an N-bit fixed point snorm value
+ * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions)
+ * Maximum reconstruction error: 1/2^N
+ */
+inline int meshopt_quantizeSnorm(float v, int N);
+
+/**
+ * Quantize a float into half-precision floating point value
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Representable magnitude range: [6e-5; 65504]
+ * Maximum relative reconstruction error: 5e-4
+ */
+inline unsigned short meshopt_quantizeHalf(float v);
+
+/**
+ * Quantize a float into a floating point value with a limited number of significant mantissa bits
+ * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest
+ * Assumes N is in a valid mantissa precision range, which is 1..23
+ */
+inline float meshopt_quantizeFloat(float v, int N);
+#endif
+
+/**
+ * C++ template interface
+ *
+ * These functions mirror the C interface the library provides, providing template-based overloads so that
+ * the caller can use an arbitrary type for the index data, both for input and output.
+ * When the supplied type is the same size as that of unsigned int, the wrappers are zero-cost; when it's not,
+ * the wrappers end up allocating memory and copying index data to convert from one type to another.
+ */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap);
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride);
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size);
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count);
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count);
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size);
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error);
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count);
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index);
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index);
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size);
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+#endif
+
+/* Inline implementation */
+#ifdef __cplusplus
+inline int meshopt_quantizeUnorm(float v, int N)
+{
+ const float scale = float((1 << N) - 1);
+
+ v = (v >= 0) ? v : 0;
+ v = (v <= 1) ? v : 1;
+
+ return int(v * scale + 0.5f);
+}
+
+inline int meshopt_quantizeSnorm(float v, int N)
+{
+ const float scale = float((1 << (N - 1)) - 1);
+
+ float round = (v >= 0 ? 0.5f : -0.5f);
+
+ v = (v >= -1) ? v : -1;
+ v = (v <= +1) ? v : +1;
+
+ return int(v * scale + round);
+}
+
+inline unsigned short meshopt_quantizeHalf(float v)
+{
+ union { float f; unsigned int ui; } u = {v};
+ unsigned int ui = u.ui;
+
+ int s = (ui >> 16) & 0x8000;
+ int em = ui & 0x7fffffff;
+
+ /* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */
+ int h = (em - (112 << 23) + (1 << 12)) >> 13;
+
+ /* underflow: flush to zero; 113 encodes exponent -14 */
+ h = (em < (113 << 23)) ? 0 : h;
+
+ /* overflow: infinity; 143 encodes exponent 16 */
+ h = (em >= (143 << 23)) ? 0x7c00 : h;
+
+ /* NaN; note that we convert all types of NaN to qNaN */
+ h = (em > (255 << 23)) ? 0x7e00 : h;
+
+ return (unsigned short)(s | h);
+}
+
+inline float meshopt_quantizeFloat(float v, int N)
+{
+ union { float f; unsigned int ui; } u = {v};
+ unsigned int ui = u.ui;
+
+ const int mask = (1 << (23 - N)) - 1;
+ const int round = (1 << (23 - N)) >> 1;
+
+ int e = ui & 0x7f800000;
+ unsigned int rui = (ui + round) & ~mask;
+
+ /* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */
+ ui = e == 0x7f800000 ? ui : rui;
+
+ /* flush denormals to zero */
+ ui = e == 0 ? 0 : ui;
+
+ u.ui = ui;
+ return u.f;
+}
+#endif
+
+/* Internal implementation helpers */
+#ifdef __cplusplus
+class meshopt_Allocator
+{
+public:
+ template <typename T>
+ struct StorageT
+ {
+ static void* (*allocate)(size_t);
+ static void (*deallocate)(void*);
+ };
+
+ typedef StorageT<void> Storage;
+
+ meshopt_Allocator()
+ : blocks()
+ , count(0)
+ {
+ }
+
+ ~meshopt_Allocator()
+ {
+ for (size_t i = count; i > 0; --i)
+ Storage::deallocate(blocks[i - 1]);
+ }
+
+ template <typename T> T* allocate(size_t size)
+ {
+ assert(count < sizeof(blocks) / sizeof(blocks[0]));
+ T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? size_t(-1) : size * sizeof(T)));
+ blocks[count++] = result;
+ return result;
+ }
+
+private:
+ void* blocks[24];
+ size_t count;
+};
+
+// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker
+template <typename T> void* (*meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new;
+template <typename T> void (*meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete;
+#endif
+
+/* Inline implementation for C++ templated wrappers */
+#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS)
+template <typename T, bool ZeroCopy = sizeof(T) == sizeof(unsigned int)>
+struct meshopt_IndexAdapter;
+
+template <typename T>
+struct meshopt_IndexAdapter<T, false>
+{
+ T* result;
+ unsigned int* data;
+ size_t count;
+
+ meshopt_IndexAdapter(T* result_, const T* input, size_t count_)
+ : result(result_)
+ , data(0)
+ , count(count_)
+ {
+ size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int);
+
+ data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size));
+
+ if (input)
+ {
+ for (size_t i = 0; i < count; ++i)
+ data[i] = input[i];
+ }
+ }
+
+ ~meshopt_IndexAdapter()
+ {
+ if (result)
+ {
+ for (size_t i = 0; i < count; ++i)
+ result[i] = T(data[i]);
+ }
+
+ meshopt_Allocator::Storage::deallocate(data);
+ }
+};
+
+template <typename T>
+struct meshopt_IndexAdapter<T, true>
+{
+ unsigned int* data;
+
+ meshopt_IndexAdapter(T* result, const T* input, size_t)
+ : data(reinterpret_cast<unsigned int*>(result ? result : const_cast<T*>(input)))
+ {
+ }
+};
+
+template <typename T>
+inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+ meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+
+ return meshopt_generateVertexRemap(destination, indices ? in.data : 0, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+ meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+
+ return meshopt_generateVertexRemapMulti(destination, indices ? in.data : 0, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap)
+{
+ meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ meshopt_remapIndexBuffer(out.data, indices ? in.data : 0, index_count, remap);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride);
+}
+
+template <typename T>
+inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ meshopt_optimizeVertexCacheStrip(out.data, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size);
+}
+
+template <typename T>
+inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+ return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count);
+}
+
+template <typename T>
+inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+ meshopt_IndexAdapter<T> inout(indices, indices, index_count);
+
+ return meshopt_optimizeVertexFetch(destination, inout.data, index_count, vertices, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+ return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+ char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+ (void)index_size_valid;
+
+ return meshopt_decodeIndexBuffer(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+ return meshopt_encodeIndexSequence(buffer, buffer_size, in.data, index_count);
+}
+
+template <typename T>
+inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size)
+{
+ char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1];
+ (void)index_size_valid;
+
+ return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size);
+}
+
+template <typename T>
+inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error);
+}
+
+template <typename T>
+inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, target_index_count);
+
+ return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count);
+}
+
+template <typename T>
+inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, (index_count / 3) * 5);
+
+ return meshopt_stripify(out.data, in.data, index_count, vertex_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, (index_count - 2) * 3);
+
+ return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index));
+}
+
+template <typename T>
+inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+ return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size);
+}
+
+template <typename T>
+inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+ return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+ return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size);
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+ return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles);
+}
+
+template <typename T>
+inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+
+ return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+
+template <typename T>
+inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ meshopt_IndexAdapter<T> in(0, indices, index_count);
+ meshopt_IndexAdapter<T> out(destination, 0, index_count);
+
+ meshopt_spatialSortTriangles(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+}
+#endif
+
+/**
+ * Copyright (c) 2016-2020 Arseny Kapoulkine
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
diff --git a/thirdparty/meshoptimizer/overdrawanalyzer.cpp b/thirdparty/meshoptimizer/overdrawanalyzer.cpp
new file mode 100644
index 0000000000..8d5859ba39
--- /dev/null
+++ b/thirdparty/meshoptimizer/overdrawanalyzer.cpp
@@ -0,0 +1,230 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Nicolas Capens. Advanced Rasterization. 2004
+namespace meshopt
+{
+
+const int kViewport = 256;
+
+struct OverdrawBuffer
+{
+ float z[kViewport][kViewport][2];
+ unsigned int overdraw[kViewport][kViewport][2];
+};
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+#ifndef max
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
+{
+ // z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
+ // z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
+ // (x2-x1 y2-y1)(dzdx) = (z2-z1)
+ // (x3-x1 y3-y1)(dzdy) (z3-z1)
+ // we'll solve it with Cramer's rule
+ float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
+ float invdet = (det == 0) ? 0 : 1 / det;
+
+ dzdx = (z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1) * invdet;
+ dzdy = (x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1) * invdet;
+
+ return det;
+}
+
+// half-space fixed point triangle rasterizer
+static void rasterize(OverdrawBuffer* buffer, float v1x, float v1y, float v1z, float v2x, float v2y, float v2z, float v3x, float v3y, float v3z)
+{
+ // compute depth gradients
+ float DZx, DZy;
+ float det = computeDepthGradients(DZx, DZy, v1x, v1y, v1z, v2x, v2y, v2z, v3x, v3y, v3z);
+ int sign = det > 0;
+
+ // flip backfacing triangles to simplify rasterization logic
+ if (sign)
+ {
+ // flipping v2 & v3 preserves depth gradients since they're based on v1
+ float t;
+ t = v2x, v2x = v3x, v3x = t;
+ t = v2y, v2y = v3y, v3y = t;
+ t = v2z, v2z = v3z, v3z = t;
+
+ // flip depth since we rasterize backfacing triangles to second buffer with reverse Z; only v1z is used below
+ v1z = kViewport - v1z;
+ DZx = -DZx;
+ DZy = -DZy;
+ }
+
+ // coordinates, 28.4 fixed point
+ int X1 = int(16.0f * v1x + 0.5f);
+ int X2 = int(16.0f * v2x + 0.5f);
+ int X3 = int(16.0f * v3x + 0.5f);
+
+ int Y1 = int(16.0f * v1y + 0.5f);
+ int Y2 = int(16.0f * v2y + 0.5f);
+ int Y3 = int(16.0f * v3y + 0.5f);
+
+ // bounding rectangle, clipped against viewport
+ // since we rasterize pixels with covered centers, min >0.5 should round up
+ // as for max, due to top-left filling convention we will never rasterize right/bottom edges
+ // so max >= 0.5 should round down
+ int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0);
+ int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport);
+ int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0);
+ int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport);
+
+ // deltas, 28.4 fixed point
+ int DX12 = X1 - X2;
+ int DX23 = X2 - X3;
+ int DX31 = X3 - X1;
+
+ int DY12 = Y1 - Y2;
+ int DY23 = Y2 - Y3;
+ int DY31 = Y3 - Y1;
+
+ // fill convention correction
+ int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0);
+ int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0);
+ int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0);
+
+ // half edge equations, 24.8 fixed point
+ // note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers
+ int FX = (minx << 4) + 8;
+ int FY = (miny << 4) + 8;
+ int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1;
+ int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1;
+ int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1;
+ float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f);
+
+ for (int y = miny; y < maxy; y++)
+ {
+ int CX1 = CY1;
+ int CX2 = CY2;
+ int CX3 = CY3;
+ float ZX = ZY;
+
+ for (int x = minx; x < maxx; x++)
+ {
+ // check if all CXn are non-negative
+ if ((CX1 | CX2 | CX3) >= 0)
+ {
+ if (ZX >= buffer->z[y][x][sign])
+ {
+ buffer->z[y][x][sign] = ZX;
+ buffer->overdraw[y][x][sign]++;
+ }
+ }
+
+ // signed left shift is UB for negative numbers so use unsigned-signed casts
+ CX1 -= int(unsigned(DY12) << 4);
+ CX2 -= int(unsigned(DY23) << 4);
+ CX3 -= int(unsigned(DY31) << 4);
+ ZX += DZx;
+ }
+
+ // signed left shift is UB for negative numbers so use unsigned-signed casts
+ CY1 += int(unsigned(DX12) << 4);
+ CY2 += int(unsigned(DX23) << 4);
+ CY3 += int(unsigned(DX31) << 4);
+ ZY += DZy;
+ }
+}
+
+} // namespace meshopt
+
+meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ meshopt_OverdrawStatistics result = {};
+
+ float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+ float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ const float* v = vertex_positions + i * vertex_stride_float;
+
+ for (int j = 0; j < 3; ++j)
+ {
+ minv[j] = min(minv[j], v[j]);
+ maxv[j] = max(maxv[j], v[j]);
+ }
+ }
+
+ float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2]));
+ float scale = kViewport / extent;
+
+ float* triangles = allocator.allocate<float>(index_count * 3);
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ const float* v = vertex_positions + index * vertex_stride_float;
+
+ triangles[i * 3 + 0] = (v[0] - minv[0]) * scale;
+ triangles[i * 3 + 1] = (v[1] - minv[1]) * scale;
+ triangles[i * 3 + 2] = (v[2] - minv[2]) * scale;
+ }
+
+ OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1);
+
+ for (int axis = 0; axis < 3; ++axis)
+ {
+ memset(buffer, 0, sizeof(OverdrawBuffer));
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ const float* vn0 = &triangles[3 * (i + 0)];
+ const float* vn1 = &triangles[3 * (i + 1)];
+ const float* vn2 = &triangles[3 * (i + 2)];
+
+ switch (axis)
+ {
+ case 0:
+ rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]);
+ break;
+ case 1:
+ rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], vn2[2], vn2[1]);
+ break;
+ case 2:
+ rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]);
+ break;
+ }
+ }
+
+ for (int y = 0; y < kViewport; ++y)
+ for (int x = 0; x < kViewport; ++x)
+ for (int s = 0; s < 2; ++s)
+ {
+ unsigned int overdraw = buffer->overdraw[y][x][s];
+
+ result.pixels_covered += overdraw > 0;
+ result.pixels_shaded += overdraw;
+ }
+ }
+
+ result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f;
+
+ return result;
+}
diff --git a/thirdparty/meshoptimizer/overdrawoptimizer.cpp b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
new file mode 100644
index 0000000000..143656ed76
--- /dev/null
+++ b/thirdparty/meshoptimizer/overdrawoptimizer.cpp
@@ -0,0 +1,333 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+// This work is based on:
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count)
+{
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ float mesh_centroid[3] = {};
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ const float* p = vertex_positions + vertex_stride_float * indices[i];
+
+ mesh_centroid[0] += p[0];
+ mesh_centroid[1] += p[1];
+ mesh_centroid[2] += p[2];
+ }
+
+ mesh_centroid[0] /= index_count;
+ mesh_centroid[1] /= index_count;
+ mesh_centroid[2] /= index_count;
+
+ for (size_t cluster = 0; cluster < cluster_count; ++cluster)
+ {
+ size_t cluster_begin = clusters[cluster] * 3;
+ size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
+ assert(cluster_begin < cluster_end);
+
+ float cluster_area = 0;
+ float cluster_centroid[3] = {};
+ float cluster_normal[3] = {};
+
+ for (size_t i = cluster_begin; i < cluster_end; i += 3)
+ {
+ const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0];
+ const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1];
+ const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2];
+
+ float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+ float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+ float normalx = p10[1] * p20[2] - p10[2] * p20[1];
+ float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+ float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+ float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz);
+
+ cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3);
+ cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3);
+ cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3);
+ cluster_normal[0] += normalx;
+ cluster_normal[1] += normaly;
+ cluster_normal[2] += normalz;
+ cluster_area += area;
+ }
+
+ float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area;
+
+ cluster_centroid[0] *= inv_cluster_area;
+ cluster_centroid[1] *= inv_cluster_area;
+ cluster_centroid[2] *= inv_cluster_area;
+
+ float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]);
+ float inv_cluster_normal_length = cluster_normal_length == 0 ? 0 : 1 / cluster_normal_length;
+
+ cluster_normal[0] *= inv_cluster_normal_length;
+ cluster_normal[1] *= inv_cluster_normal_length;
+ cluster_normal[2] *= inv_cluster_normal_length;
+
+ float centroid_vector[3] = {cluster_centroid[0] - mesh_centroid[0], cluster_centroid[1] - mesh_centroid[1], cluster_centroid[2] - mesh_centroid[2]};
+
+ sort_data[cluster] = centroid_vector[0] * cluster_normal[0] + centroid_vector[1] * cluster_normal[1] + centroid_vector[2] * cluster_normal[2];
+ }
+}
+
+static void calculateSortOrderRadix(unsigned int* sort_order, const float* sort_data, unsigned short* sort_keys, size_t cluster_count)
+{
+ // compute sort data bounds and renormalize, using fixed point snorm
+ float sort_data_max = 1e-3f;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ float dpa = fabsf(sort_data[i]);
+
+ sort_data_max = (sort_data_max < dpa) ? dpa : sort_data_max;
+ }
+
+ const int sort_bits = 11;
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ // note that we flip distribution since high dot product should come first
+ float sort_key = 0.5f - 0.5f * (sort_data[i] / sort_data_max);
+
+ sort_keys[i] = meshopt_quantizeUnorm(sort_key, sort_bits) & ((1 << sort_bits) - 1);
+ }
+
+ // fill histogram for counting sort
+ unsigned int histogram[1 << sort_bits];
+ memset(histogram, 0, sizeof(histogram));
+
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ histogram[sort_keys[i]]++;
+ }
+
+ // compute offsets based on histogram data
+ size_t histogram_sum = 0;
+
+ for (size_t i = 0; i < 1 << sort_bits; ++i)
+ {
+ size_t count = histogram[i];
+ histogram[i] = unsigned(histogram_sum);
+ histogram_sum += count;
+ }
+
+ assert(histogram_sum == cluster_count);
+
+ // compute sort order based on offsets
+ for (size_t i = 0; i < cluster_count; ++i)
+ {
+ sort_order[histogram[sort_keys[i]]++] = unsigned(i);
+ }
+}
+
+static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
+{
+ unsigned int cache_misses = 0;
+
+ // if vertex is not in cache, put it in cache
+ if (timestamp - cache_timestamps[a] > cache_size)
+ {
+ cache_timestamps[a] = timestamp++;
+ cache_misses++;
+ }
+
+ if (timestamp - cache_timestamps[b] > cache_size)
+ {
+ cache_timestamps[b] = timestamp++;
+ cache_misses++;
+ }
+
+ if (timestamp - cache_timestamps[c] > cache_size)
+ {
+ cache_timestamps[c] = timestamp++;
+ cache_misses++;
+ }
+
+ return cache_misses;
+}
+
+static size_t generateHardBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int* cache_timestamps)
+{
+ memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+ unsigned int timestamp = cache_size + 1;
+
+ size_t face_count = index_count / 3;
+
+ size_t result = 0;
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+ // when all three vertices are not in the cache it's usually relatively safe to assume that this is a new patch in the mesh
+ // that is disjoint from previous vertices; sometimes it might come back to reference existing vertices but that frequently
+ // suggests an inefficiency in the vertex cache optimization algorithm
+ // usually the first triangle has 3 misses unless it's degenerate - thus we make sure the first cluster always starts with 0
+ if (i == 0 || m == 3)
+ {
+ destination[result++] = unsigned(i);
+ }
+ }
+
+ assert(result <= index_count / 3);
+
+ return result;
+}
+
+static size_t generateSoftBoundaries(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const unsigned int* clusters, size_t cluster_count, unsigned int cache_size, float threshold, unsigned int* cache_timestamps)
+{
+ memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+ unsigned int timestamp = 0;
+
+ size_t result = 0;
+
+ for (size_t it = 0; it < cluster_count; ++it)
+ {
+ size_t start = clusters[it];
+ size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3;
+ assert(start < end);
+
+ // reset cache
+ timestamp += cache_size + 1;
+
+ // measure cluster ACMR
+ unsigned int cluster_misses = 0;
+
+ for (size_t i = start; i < end; ++i)
+ {
+ unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+ cluster_misses += m;
+ }
+
+ float cluster_threshold = threshold * (float(cluster_misses) / float(end - start));
+
+ // first cluster always starts from the hard cluster boundary
+ destination[result++] = unsigned(start);
+
+ // reset cache
+ timestamp += cache_size + 1;
+
+ unsigned int running_misses = 0;
+ unsigned int running_faces = 0;
+
+ for (size_t i = start; i < end; ++i)
+ {
+ unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp);
+
+ running_misses += m;
+ running_faces += 1;
+
+ if (float(running_misses) / float(running_faces) <= cluster_threshold)
+ {
+ // we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one
+ // note that this may mean that we add 'end` to destination for the last triangle, which will imply that the last
+ // cluster is empty; however, the 'pop_back' after the loop will clean it up
+ destination[result++] = unsigned(i + 1);
+
+ // reset cache
+ timestamp += cache_size + 1;
+
+ running_misses = 0;
+ running_faces = 0;
+ }
+ }
+
+ // each time we reach the target ACMR we flush the cluster
+ // this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles
+ // in the last cluster, producing a very bad ACMR and significantly penalizing the overall results
+ // thus we remove the last cluster boundary, merging the last complete cluster with the last incomplete one
+ // there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end'
+ // to the cluster boundary array which we need to remove anyway - this code will do that automatically
+ if (destination[result - 1] != start)
+ {
+ result--;
+ }
+ }
+
+ assert(result >= cluster_count);
+ assert(result <= index_count / 3);
+
+ return result;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+
+ // guard for empty meshes
+ if (index_count == 0 || vertex_count == 0)
+ return;
+
+ // support in-place optimization
+ if (destination == indices)
+ {
+ unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+ memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+ indices = indices_copy;
+ }
+
+ unsigned int cache_size = 16;
+
+ unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+
+ // generate hard boundaries from full-triangle cache misses
+ unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3);
+ size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps);
+
+ // generate soft boundaries
+ unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1);
+ size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps);
+
+ const unsigned int* clusters = soft_clusters;
+ size_t cluster_count = soft_cluster_count;
+
+ // fill sort data
+ float* sort_data = allocator.allocate<float>(cluster_count);
+ calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count);
+
+ // sort clusters using sort data
+ unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count);
+ unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count);
+ calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count);
+
+ // fill output buffer
+ size_t offset = 0;
+
+ for (size_t it = 0; it < cluster_count; ++it)
+ {
+ unsigned int cluster = sort_order[it];
+ assert(cluster < cluster_count);
+
+ size_t cluster_begin = clusters[cluster] * 3;
+ size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count;
+ assert(cluster_begin < cluster_end);
+
+ memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int));
+ offset += cluster_end - cluster_begin;
+ }
+
+ assert(offset == index_count);
+}
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
new file mode 100644
index 0000000000..bd523275ce
--- /dev/null
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -0,0 +1,1529 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <math.h>
+#include <string.h>
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+// This work is based on:
+// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997
+// Michael Garland. Quadric-based polygonal surface simplification. 1999
+// Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000
+// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003
+// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 2019
+namespace meshopt
+{
+
+struct EdgeAdjacency
+{
+ unsigned int* counts;
+ unsigned int* offsets;
+ unsigned int* data;
+};
+
+static void buildEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+ size_t face_count = index_count / 3;
+
+ // allocate arrays
+ adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+ adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+ adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+ // fill edge counts
+ memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ assert(indices[i] < vertex_count);
+
+ adjacency.counts[indices[i]]++;
+ }
+
+ // fill offset table
+ unsigned int offset = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ adjacency.offsets[i] = offset;
+ offset += adjacency.counts[i];
+ }
+
+ assert(offset == index_count);
+
+ // fill edge data
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+ adjacency.data[adjacency.offsets[a]++] = b;
+ adjacency.data[adjacency.offsets[b]++] = c;
+ adjacency.data[adjacency.offsets[c]++] = a;
+ }
+
+ // fix offsets that have been disturbed by the previous pass
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+ adjacency.offsets[i] -= adjacency.counts[i];
+ }
+}
+
+struct PositionHasher
+{
+ const float* vertex_positions;
+ size_t vertex_stride_float;
+
+ size_t hash(unsigned int index) const
+ {
+ const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float);
+
+ // Optimized Spatial Hashing for Collision Detection of Deformable Objects
+ return (key[0] * 73856093) ^ (key[1] * 19349663) ^ (key[2] * 83492791);
+ }
+
+ bool equal(unsigned int lhs, unsigned int rhs) const
+ {
+ return memcmp(vertex_positions + lhs * vertex_stride_float, vertex_positions + rhs * vertex_stride_float, sizeof(float) * 3) == 0;
+ }
+};
+
+static size_t hashBuckets2(size_t count)
+{
+ size_t buckets = 1;
+ while (buckets < count)
+ buckets *= 2;
+
+ return buckets;
+}
+
+template <typename T, typename Hash>
+static T* hashLookup2(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
+{
+ assert(buckets > 0);
+ assert((buckets & (buckets - 1)) == 0);
+
+ size_t hashmod = buckets - 1;
+ size_t bucket = hash.hash(key) & hashmod;
+
+ for (size_t probe = 0; probe <= hashmod; ++probe)
+ {
+ T& item = table[bucket];
+
+ if (item == empty)
+ return &item;
+
+ if (hash.equal(item, key))
+ return &item;
+
+ // hash collision, quadratic probing
+ bucket = (bucket + probe + 1) & hashmod;
+ }
+
+ assert(false && "Hash table is full"); // unreachable
+ return 0;
+}
+
+static void buildPositionRemap(unsigned int* remap, unsigned int* wedge, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
+{
+ PositionHasher hasher = {vertex_positions_data, vertex_positions_stride / sizeof(float)};
+
+ size_t table_size = hashBuckets2(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ // build forward remap: for each vertex, which other (canonical) vertex does it map to?
+ // we use position equivalence for this, and remap vertices to other existing vertices
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int index = unsigned(i);
+ unsigned int* entry = hashLookup2(table, table_size, hasher, index, ~0u);
+
+ if (*entry == ~0u)
+ *entry = index;
+
+ remap[index] = *entry;
+ }
+
+ // build wedge table: for each vertex, which other vertex is the next wedge that also maps to the same vertex?
+ // entries in table form a (cyclic) wedge loop per vertex; for manifold vertices, wedge[i] == remap[i] == i
+ for (size_t i = 0; i < vertex_count; ++i)
+ wedge[i] = unsigned(i);
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ if (remap[i] != i)
+ {
+ unsigned int r = remap[i];
+
+ wedge[i] = wedge[r];
+ wedge[r] = unsigned(i);
+ }
+}
+
+enum VertexKind
+{
+ Kind_Manifold, // not on an attribute seam, not on any boundary
+ Kind_Border, // not on an attribute seam, has exactly two open edges
+ Kind_Seam, // on an attribute seam with exactly two attribute seam edges
+ Kind_Complex, // none of the above; these vertices can move as long as all wedges move to the target vertex
+ Kind_Locked, // none of the above; these vertices can't move
+
+ Kind_Count
+};
+
+// manifold vertices can collapse onto anything
+// border/seam vertices can only be collapsed onto border/seam respectively
+// complex vertices can collapse onto complex/locked
+// a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
+// for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
+const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
+ {1, 1, 1, 1, 1},
+ {0, 1, 0, 0, 0},
+ {0, 0, 1, 0, 0},
+ {0, 0, 0, 1, 1},
+ {0, 0, 0, 0, 0},
+};
+
+// if a vertex is manifold or seam, adjoining edges are guaranteed to have an opposite edge
+// note that for seam edges, the opposite edge isn't present in the attribute-based topology
+// but is present if you consider a position-only mesh variant
+const unsigned char kHasOpposite[Kind_Count][Kind_Count] = {
+ {1, 1, 1, 0, 1},
+ {1, 0, 1, 0, 0},
+ {1, 1, 1, 0, 1},
+ {0, 0, 0, 0, 0},
+ {1, 0, 1, 0, 0},
+};
+
+static bool hasEdge(const EdgeAdjacency& adjacency, unsigned int a, unsigned int b)
+{
+ unsigned int count = adjacency.counts[a];
+ const unsigned int* data = adjacency.data + adjacency.offsets[a];
+
+ for (size_t i = 0; i < count; ++i)
+ if (data[i] == b)
+ return true;
+
+ return false;
+}
+
+static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge)
+{
+ memset(loop, -1, vertex_count * sizeof(unsigned int));
+ memset(loopback, -1, vertex_count * sizeof(unsigned int));
+
+ // incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
+ // note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
+ // but here it's okay to fill the data out for other types of vertices as well
+ unsigned int* openinc = loopback;
+ unsigned int* openout = loop;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int vertex = unsigned(i);
+
+ unsigned int count = adjacency.counts[vertex];
+ const unsigned int* data = adjacency.data + adjacency.offsets[vertex];
+
+ for (size_t j = 0; j < count; ++j)
+ {
+ unsigned int target = data[j];
+
+ if (!hasEdge(adjacency, target, vertex))
+ {
+ openinc[target] = (openinc[target] == ~0u) ? vertex : target;
+ openout[vertex] = (openout[vertex] == ~0u) ? target : vertex;
+ }
+ }
+ }
+
+#if TRACE
+ size_t lockedstats[4] = {};
+#define TRACELOCKED(i) lockedstats[i]++;
+#else
+#define TRACELOCKED(i) (void)0
+#endif
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (remap[i] == i)
+ {
+ if (wedge[i] == i)
+ {
+ // no attribute seam, need to check if it's manifold
+ unsigned int openi = openinc[i], openo = openout[i];
+
+ // note: we classify any vertices with no open edges as manifold
+ // this is technically incorrect - if 4 triangles share an edge, we'll classify vertices as manifold
+ // it's unclear if this is a problem in practice
+ if (openi == ~0u && openo == ~0u)
+ {
+ result[i] = Kind_Manifold;
+ }
+ else if (openi != i && openo != i)
+ {
+ result[i] = Kind_Border;
+ }
+ else
+ {
+ result[i] = Kind_Locked;
+ TRACELOCKED(0);
+ }
+ }
+ else if (wedge[wedge[i]] == i)
+ {
+ // attribute seam; need to distinguish between Seam and Locked
+ unsigned int w = wedge[i];
+ unsigned int openiv = openinc[i], openov = openout[i];
+ unsigned int openiw = openinc[w], openow = openout[w];
+
+ // seam should have one open half-edge for each vertex, and the edges need to "connect" - point to the same vertex post-remap
+ if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
+ openiw != ~0u && openiw != w && openow != ~0u && openow != w)
+ {
+ if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw])
+ {
+ result[i] = Kind_Seam;
+ }
+ else
+ {
+ result[i] = Kind_Locked;
+ TRACELOCKED(1);
+ }
+ }
+ else
+ {
+ result[i] = Kind_Locked;
+ TRACELOCKED(2);
+ }
+ }
+ else
+ {
+ // more than one vertex maps to this one; we don't have classification available
+ result[i] = Kind_Locked;
+ TRACELOCKED(3);
+ }
+ }
+ else
+ {
+ assert(remap[i] < i);
+
+ result[i] = result[remap[i]];
+ }
+ }
+
+#if TRACE
+ printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n",
+ int(lockedstats[0]), int(lockedstats[1]), int(lockedstats[2]), int(lockedstats[3]));
+#endif
+}
+
+struct Vector3
+{
+ float x, y, z;
+};
+
+static void rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+{
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+ float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ const float* v = vertex_positions_data + i * vertex_stride_float;
+
+ result[i].x = v[0];
+ result[i].y = v[1];
+ result[i].z = v[2];
+
+ for (int j = 0; j < 3; ++j)
+ {
+ float vj = v[j];
+
+ minv[j] = minv[j] > vj ? vj : minv[j];
+ maxv[j] = maxv[j] < vj ? vj : maxv[j];
+ }
+ }
+
+ float extent = 0.f;
+
+ extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+ extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+ extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+ float scale = extent == 0 ? 0.f : 1.f / extent;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ result[i].x = (result[i].x - minv[0]) * scale;
+ result[i].y = (result[i].y - minv[1]) * scale;
+ result[i].z = (result[i].z - minv[2]) * scale;
+ }
+}
+
+struct Quadric
+{
+ float a00, a11, a22;
+ float a10, a20, a21;
+ float b0, b1, b2, c;
+ float w;
+};
+
+struct Collapse
+{
+ unsigned int v0;
+ unsigned int v1;
+
+ union
+ {
+ unsigned int bidi;
+ float error;
+ unsigned int errorui;
+ };
+};
+
+static float normalize(Vector3& v)
+{
+ float length = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z);
+
+ if (length > 0)
+ {
+ v.x /= length;
+ v.y /= length;
+ v.z /= length;
+ }
+
+ return length;
+}
+
+static void quadricAdd(Quadric& Q, const Quadric& R)
+{
+ Q.a00 += R.a00;
+ Q.a11 += R.a11;
+ Q.a22 += R.a22;
+ Q.a10 += R.a10;
+ Q.a20 += R.a20;
+ Q.a21 += R.a21;
+ Q.b0 += R.b0;
+ Q.b1 += R.b1;
+ Q.b2 += R.b2;
+ Q.c += R.c;
+ Q.w += R.w;
+}
+
+static float quadricError(const Quadric& Q, const Vector3& v)
+{
+ float rx = Q.b0;
+ float ry = Q.b1;
+ float rz = Q.b2;
+
+ rx += Q.a10 * v.y;
+ ry += Q.a21 * v.z;
+ rz += Q.a20 * v.x;
+
+ rx *= 2;
+ ry *= 2;
+ rz *= 2;
+
+ rx += Q.a00 * v.x;
+ ry += Q.a11 * v.y;
+ rz += Q.a22 * v.z;
+
+ float r = Q.c;
+ r += rx * v.x;
+ r += ry * v.y;
+ r += rz * v.z;
+
+ float s = Q.w == 0.f ? 0.f : 1.f / Q.w;
+
+ return fabsf(r) * s;
+}
+
+static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w)
+{
+ float aw = a * w;
+ float bw = b * w;
+ float cw = c * w;
+ float dw = d * w;
+
+ Q.a00 = a * aw;
+ Q.a11 = b * bw;
+ Q.a22 = c * cw;
+ Q.a10 = a * bw;
+ Q.a20 = a * cw;
+ Q.a21 = b * cw;
+ Q.b0 = a * dw;
+ Q.b1 = b * dw;
+ Q.b2 = c * dw;
+ Q.c = d * dw;
+ Q.w = w;
+}
+
+static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w)
+{
+ // we need to encode (x - X) ^ 2 + (y - Y)^2 + (z - Z)^2 into the quadric
+ Q.a00 = w;
+ Q.a11 = w;
+ Q.a22 = w;
+ Q.a10 = 0.f;
+ Q.a20 = 0.f;
+ Q.a21 = 0.f;
+ Q.b0 = -2.f * x * w;
+ Q.b1 = -2.f * y * w;
+ Q.b2 = -2.f * z * w;
+ Q.c = (x * x + y * y + z * z) * w;
+ Q.w = w;
+}
+
+static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
+{
+ Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+ Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+
+ // normal = cross(p1 - p0, p2 - p0)
+ Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x};
+ float area = normalize(normal);
+
+ float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+
+ // we use sqrtf(area) so that the error is scaled linearly; this tends to improve silhouettes
+ quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, sqrtf(area) * weight);
+}
+
+static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight)
+{
+ Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z};
+ float length = normalize(p10);
+
+ // p20p = length of projection of p2-p0 onto normalize(p1 - p0)
+ Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z};
+ float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z;
+
+ // normal = altitude of triangle from point p2 onto edge p1-p0
+ Vector3 normal = {p20.x - p10.x * p20p, p20.y - p10.y * p20p, p20.z - p10.z * p20p};
+ normalize(normal);
+
+ float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z;
+
+ // note: the weight is scaled linearly with edge length; this has to match the triangle weight
+ quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight);
+}
+
+static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap)
+{
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int i0 = indices[i + 0];
+ unsigned int i1 = indices[i + 1];
+ unsigned int i2 = indices[i + 2];
+
+ Quadric Q;
+ quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f);
+
+ quadricAdd(vertex_quadrics[remap[i0]], Q);
+ quadricAdd(vertex_quadrics[remap[i1]], Q);
+ quadricAdd(vertex_quadrics[remap[i2]], Q);
+ }
+}
+
+static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback)
+{
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ static const int next[3] = {1, 2, 0};
+
+ for (int e = 0; e < 3; ++e)
+ {
+ unsigned int i0 = indices[i + e];
+ unsigned int i1 = indices[i + next[e]];
+
+ unsigned char k0 = vertex_kind[i0];
+ unsigned char k1 = vertex_kind[i1];
+
+ // check that either i0 or i1 are border/seam and are on the same edge loop
+ // note that we need to add the error even for edged that connect e.g. border & locked
+ // if we don't do that, the adjacent border->border edge won't have correct errors for corners
+ if (k0 != Kind_Border && k0 != Kind_Seam && k1 != Kind_Border && k1 != Kind_Seam)
+ continue;
+
+ if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+ continue;
+
+ if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0)
+ continue;
+
+ // seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+ if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+ continue;
+
+ unsigned int i2 = indices[i + next[next[e]]];
+
+ // we try hard to maintain border edge geometry; seam edges can move more freely
+ // due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical
+ const float kEdgeWeightSeam = 1.f;
+ const float kEdgeWeightBorder = 10.f;
+
+ float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam;
+
+ Quadric Q;
+ quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight);
+
+ quadricAdd(vertex_quadrics[remap[i0]], Q);
+ quadricAdd(vertex_quadrics[remap[i1]], Q);
+ }
+ }
+}
+
+static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop)
+{
+ size_t collapse_count = 0;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ static const int next[3] = {1, 2, 0};
+
+ for (int e = 0; e < 3; ++e)
+ {
+ unsigned int i0 = indices[i + e];
+ unsigned int i1 = indices[i + next[e]];
+
+ // this can happen either when input has a zero-length edge, or when we perform collapses for complex
+ // topology w/seams and collapse a manifold vertex that connects to both wedges onto one of them
+ // we leave edges like this alone since they may be important for preserving mesh integrity
+ if (remap[i0] == remap[i1])
+ continue;
+
+ unsigned char k0 = vertex_kind[i0];
+ unsigned char k1 = vertex_kind[i1];
+
+ // the edge has to be collapsible in at least one direction
+ if (!(kCanCollapse[k0][k1] | kCanCollapse[k1][k0]))
+ continue;
+
+ // manifold and seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges
+ if (kHasOpposite[k0][k1] && remap[i1] > remap[i0])
+ continue;
+
+ // two vertices are on a border or a seam, but there's no direct edge between them
+ // this indicates that they belong to two different edge loops and we should not collapse this edge
+ // loop[] tracks half edges so we only need to check i0->i1
+ if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1)
+ continue;
+
+ // edge can be collapsed in either direction - we will pick the one with minimum error
+ // note: we evaluate error later during collapse ranking, here we just tag the edge as bidirectional
+ if (kCanCollapse[k0][k1] & kCanCollapse[k1][k0])
+ {
+ Collapse c = {i0, i1, {/* bidi= */ 1}};
+ collapses[collapse_count++] = c;
+ }
+ else
+ {
+ // edge can only be collapsed in one direction
+ unsigned int e0 = kCanCollapse[k0][k1] ? i0 : i1;
+ unsigned int e1 = kCanCollapse[k0][k1] ? i1 : i0;
+
+ Collapse c = {e0, e1, {/* bidi= */ 0}};
+ collapses[collapse_count++] = c;
+ }
+ }
+ }
+
+ return collapse_count;
+}
+
+static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const unsigned int* remap)
+{
+ for (size_t i = 0; i < collapse_count; ++i)
+ {
+ Collapse& c = collapses[i];
+
+ unsigned int i0 = c.v0;
+ unsigned int i1 = c.v1;
+
+ // most edges are bidirectional which means we need to evaluate errors for two collapses
+ // to keep this code branchless we just use the same edge for unidirectional edges
+ unsigned int j0 = c.bidi ? i1 : i0;
+ unsigned int j1 = c.bidi ? i0 : i1;
+
+ const Quadric& qi = vertex_quadrics[remap[i0]];
+ const Quadric& qj = vertex_quadrics[remap[j0]];
+
+ float ei = quadricError(qi, vertex_positions[i1]);
+ float ej = quadricError(qj, vertex_positions[j1]);
+
+ // pick edge direction with minimal error
+ c.v0 = ei <= ej ? i0 : j0;
+ c.v1 = ei <= ej ? i1 : j1;
+ c.error = ei <= ej ? ei : ej;
+ }
+}
+
+#if TRACE > 1
+static void dumpEdgeCollapses(const Collapse* collapses, size_t collapse_count, const unsigned char* vertex_kind)
+{
+ size_t ckinds[Kind_Count][Kind_Count] = {};
+ float cerrors[Kind_Count][Kind_Count] = {};
+
+ for (int k0 = 0; k0 < Kind_Count; ++k0)
+ for (int k1 = 0; k1 < Kind_Count; ++k1)
+ cerrors[k0][k1] = FLT_MAX;
+
+ for (size_t i = 0; i < collapse_count; ++i)
+ {
+ unsigned int i0 = collapses[i].v0;
+ unsigned int i1 = collapses[i].v1;
+
+ unsigned char k0 = vertex_kind[i0];
+ unsigned char k1 = vertex_kind[i1];
+
+ ckinds[k0][k1]++;
+ cerrors[k0][k1] = (collapses[i].error < cerrors[k0][k1]) ? collapses[i].error : cerrors[k0][k1];
+ }
+
+ for (int k0 = 0; k0 < Kind_Count; ++k0)
+ for (int k1 = 0; k1 < Kind_Count; ++k1)
+ if (ckinds[k0][k1])
+ printf("collapses %d -> %d: %d, min error %e\n", k0, k1, int(ckinds[k0][k1]), cerrors[k0][k1]);
+}
+
+static void dumpLockedCollapses(const unsigned int* indices, size_t index_count, const unsigned char* vertex_kind)
+{
+ size_t locked_collapses[Kind_Count][Kind_Count] = {};
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ static const int next[3] = {1, 2, 0};
+
+ for (int e = 0; e < 3; ++e)
+ {
+ unsigned int i0 = indices[i + e];
+ unsigned int i1 = indices[i + next[e]];
+
+ unsigned char k0 = vertex_kind[i0];
+ unsigned char k1 = vertex_kind[i1];
+
+ locked_collapses[k0][k1] += !kCanCollapse[k0][k1] && !kCanCollapse[k1][k0];
+ }
+ }
+
+ for (int k0 = 0; k0 < Kind_Count; ++k0)
+ for (int k1 = 0; k1 < Kind_Count; ++k1)
+ if (locked_collapses[k0][k1])
+ printf("locked collapses %d -> %d: %d\n", k0, k1, int(locked_collapses[k0][k1]));
+}
+#endif
+
+static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count)
+{
+ const int sort_bits = 11;
+
+ // fill histogram for counting sort
+ unsigned int histogram[1 << sort_bits];
+ memset(histogram, 0, sizeof(histogram));
+
+ for (size_t i = 0; i < collapse_count; ++i)
+ {
+ // skip sign bit since error is non-negative
+ unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+
+ histogram[key]++;
+ }
+
+ // compute offsets based on histogram data
+ size_t histogram_sum = 0;
+
+ for (size_t i = 0; i < 1 << sort_bits; ++i)
+ {
+ size_t count = histogram[i];
+ histogram[i] = unsigned(histogram_sum);
+ histogram_sum += count;
+ }
+
+ assert(histogram_sum == collapse_count);
+
+ // compute sort order based on offsets
+ for (size_t i = 0; i < collapse_count; ++i)
+ {
+ // skip sign bit since error is non-negative
+ unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits);
+
+ sort_order[histogram[key]++] = unsigned(i);
+ }
+}
+
+static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, size_t triangle_collapse_goal, float error_goal, float error_limit)
+{
+ size_t edge_collapses = 0;
+ size_t triangle_collapses = 0;
+
+ for (size_t i = 0; i < collapse_count; ++i)
+ {
+ const Collapse& c = collapses[collapse_order[i]];
+
+ if (c.error > error_limit)
+ break;
+
+ if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 10)
+ break;
+
+ if (triangle_collapses >= triangle_collapse_goal)
+ break;
+
+ unsigned int i0 = c.v0;
+ unsigned int i1 = c.v1;
+
+ unsigned int r0 = remap[i0];
+ unsigned int r1 = remap[i1];
+
+ // we don't collapse vertices that had source or target vertex involved in a collapse
+ // it's important to not move the vertices twice since it complicates the tracking/remapping logic
+ // it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass
+ if (collapse_locked[r0] | collapse_locked[r1])
+ continue;
+
+ assert(collapse_remap[r0] == r0);
+ assert(collapse_remap[r1] == r1);
+
+ quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]);
+
+ if (vertex_kind[i0] == Kind_Complex)
+ {
+ unsigned int v = i0;
+
+ do
+ {
+ collapse_remap[v] = r1;
+ v = wedge[v];
+ } while (v != i0);
+ }
+ else if (vertex_kind[i0] == Kind_Seam)
+ {
+ // remap v0 to v1 and seam pair of v0 to seam pair of v1
+ unsigned int s0 = wedge[i0];
+ unsigned int s1 = wedge[i1];
+
+ assert(s0 != i0 && s1 != i1);
+ assert(wedge[s0] == i0 && wedge[s1] == i1);
+
+ collapse_remap[i0] = i1;
+ collapse_remap[s0] = s1;
+ }
+ else
+ {
+ assert(wedge[i0] == i0);
+
+ collapse_remap[i0] = i1;
+ }
+
+ collapse_locked[r0] = 1;
+ collapse_locked[r1] = 1;
+
+ // border edges collapse 1 triangle, other edges collapse 2 or more
+ triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 1 : 2;
+ edge_collapses++;
+ }
+
+ return edge_collapses;
+}
+
+static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
+{
+ size_t write = 0;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int v0 = collapse_remap[indices[i + 0]];
+ unsigned int v1 = collapse_remap[indices[i + 1]];
+ unsigned int v2 = collapse_remap[indices[i + 2]];
+
+ // we never move the vertex twice during a single pass
+ assert(collapse_remap[v0] == v0);
+ assert(collapse_remap[v1] == v1);
+ assert(collapse_remap[v2] == v2);
+
+ if (v0 != v1 && v0 != v2 && v1 != v2)
+ {
+ indices[write + 0] = v0;
+ indices[write + 1] = v1;
+ indices[write + 2] = v2;
+ write += 3;
+ }
+ }
+
+ return write;
+}
+
+static void remapEdgeLoops(unsigned int* loop, size_t vertex_count, const unsigned int* collapse_remap)
+{
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ if (loop[i] != ~0u)
+ {
+ unsigned int l = loop[i];
+ unsigned int r = collapse_remap[l];
+
+ // i == r is a special case when the seam edge is collapsed in a direction opposite to where loop goes
+ loop[i] = (i == r) ? loop[l] : r;
+ }
+ }
+}
+
+struct CellHasher
+{
+ const unsigned int* vertex_ids;
+
+ size_t hash(unsigned int i) const
+ {
+ unsigned int h = vertex_ids[i];
+
+ // MurmurHash2 finalizer
+ h ^= h >> 13;
+ h *= 0x5bd1e995;
+ h ^= h >> 15;
+ return h;
+ }
+
+ bool equal(unsigned int lhs, unsigned int rhs) const
+ {
+ return vertex_ids[lhs] == vertex_ids[rhs];
+ }
+};
+
+struct IdHasher
+{
+ size_t hash(unsigned int id) const
+ {
+ unsigned int h = id;
+
+ // MurmurHash2 finalizer
+ h ^= h >> 13;
+ h *= 0x5bd1e995;
+ h ^= h >> 15;
+ return h;
+ }
+
+ bool equal(unsigned int lhs, unsigned int rhs) const
+ {
+ return lhs == rhs;
+ }
+};
+
+struct TriangleHasher
+{
+ unsigned int* indices;
+
+ size_t hash(unsigned int i) const
+ {
+ const unsigned int* tri = indices + i * 3;
+
+ // Optimized Spatial Hashing for Collision Detection of Deformable Objects
+ return (tri[0] * 73856093) ^ (tri[1] * 19349663) ^ (tri[2] * 83492791);
+ }
+
+ bool equal(unsigned int lhs, unsigned int rhs) const
+ {
+ const unsigned int* lt = indices + lhs * 3;
+ const unsigned int* rt = indices + rhs * 3;
+
+ return lt[0] == rt[0] && lt[1] == rt[1] && lt[2] == rt[2];
+ }
+};
+
+static void computeVertexIds(unsigned int* vertex_ids, const Vector3* vertex_positions, size_t vertex_count, int grid_size)
+{
+ assert(grid_size >= 1 && grid_size <= 1024);
+ float cell_scale = float(grid_size - 1);
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ const Vector3& v = vertex_positions[i];
+
+ int xi = int(v.x * cell_scale + 0.5f);
+ int yi = int(v.y * cell_scale + 0.5f);
+ int zi = int(v.z * cell_scale + 0.5f);
+
+ vertex_ids[i] = (xi << 20) | (yi << 10) | zi;
+ }
+}
+
+static size_t countTriangles(const unsigned int* vertex_ids, const unsigned int* indices, size_t index_count)
+{
+ size_t result = 0;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int id0 = vertex_ids[indices[i + 0]];
+ unsigned int id1 = vertex_ids[indices[i + 1]];
+ unsigned int id2 = vertex_ids[indices[i + 2]];
+
+ result += (id0 != id1) & (id0 != id2) & (id1 != id2);
+ }
+
+ return result;
+}
+
+static size_t fillVertexCells(unsigned int* table, size_t table_size, unsigned int* vertex_cells, const unsigned int* vertex_ids, size_t vertex_count)
+{
+ CellHasher hasher = {vertex_ids};
+
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ size_t result = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int* entry = hashLookup2(table, table_size, hasher, unsigned(i), ~0u);
+
+ if (*entry == ~0u)
+ {
+ *entry = unsigned(i);
+ vertex_cells[i] = unsigned(result++);
+ }
+ else
+ {
+ vertex_cells[i] = vertex_cells[*entry];
+ }
+ }
+
+ return result;
+}
+
+static size_t countVertexCells(unsigned int* table, size_t table_size, const unsigned int* vertex_ids, size_t vertex_count)
+{
+ IdHasher hasher;
+
+ memset(table, -1, table_size * sizeof(unsigned int));
+
+ size_t result = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int id = vertex_ids[i];
+ unsigned int* entry = hashLookup2(table, table_size, hasher, id, ~0u);
+
+ result += (*entry == ~0u);
+ *entry = id;
+ }
+
+ return result;
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* vertex_cells)
+{
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int i0 = indices[i + 0];
+ unsigned int i1 = indices[i + 1];
+ unsigned int i2 = indices[i + 2];
+
+ unsigned int c0 = vertex_cells[i0];
+ unsigned int c1 = vertex_cells[i1];
+ unsigned int c2 = vertex_cells[i2];
+
+ bool single_cell = (c0 == c1) & (c0 == c2);
+
+ Quadric Q;
+ quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], single_cell ? 3.f : 1.f);
+
+ if (single_cell)
+ {
+ quadricAdd(cell_quadrics[c0], Q);
+ }
+ else
+ {
+ quadricAdd(cell_quadrics[c0], Q);
+ quadricAdd(cell_quadrics[c1], Q);
+ quadricAdd(cell_quadrics[c2], Q);
+ }
+ }
+}
+
+static void fillCellQuadrics(Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* vertex_cells)
+{
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int c = vertex_cells[i];
+ const Vector3& v = vertex_positions[i];
+
+ Quadric Q;
+ quadricFromPoint(Q, v.x, v.y, v.z, 1.f);
+
+ quadricAdd(cell_quadrics[c], Q);
+ }
+}
+
+static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t cell_count, const unsigned int* vertex_cells, const Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count)
+{
+ memset(cell_remap, -1, cell_count * sizeof(unsigned int));
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned int cell = vertex_cells[i];
+ float error = quadricError(cell_quadrics[cell], vertex_positions[i]);
+
+ if (cell_remap[cell] == ~0u || cell_errors[cell] > error)
+ {
+ cell_remap[cell] = unsigned(i);
+ cell_errors[cell] = error;
+ }
+ }
+}
+
+static size_t filterTriangles(unsigned int* destination, unsigned int* tritable, size_t tritable_size, const unsigned int* indices, size_t index_count, const unsigned int* vertex_cells, const unsigned int* cell_remap)
+{
+ TriangleHasher hasher = {destination};
+
+ memset(tritable, -1, tritable_size * sizeof(unsigned int));
+
+ size_t result = 0;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int c0 = vertex_cells[indices[i + 0]];
+ unsigned int c1 = vertex_cells[indices[i + 1]];
+ unsigned int c2 = vertex_cells[indices[i + 2]];
+
+ if (c0 != c1 && c0 != c2 && c1 != c2)
+ {
+ unsigned int a = cell_remap[c0];
+ unsigned int b = cell_remap[c1];
+ unsigned int c = cell_remap[c2];
+
+ if (b < a && b < c)
+ {
+ unsigned int t = a;
+ a = b, b = c, c = t;
+ }
+ else if (c < a && c < b)
+ {
+ unsigned int t = c;
+ c = b, b = a, a = t;
+ }
+
+ destination[result * 3 + 0] = a;
+ destination[result * 3 + 1] = b;
+ destination[result * 3 + 2] = c;
+
+ unsigned int* entry = hashLookup2(tritable, tritable_size, hasher, unsigned(result), ~0u);
+
+ if (*entry == ~0u)
+ *entry = unsigned(result++);
+ }
+ }
+
+ return result * 3;
+}
+
+static float interpolate(float y, float x0, float y0, float x1, float y1, float x2, float y2)
+{
+ // three point interpolation from "revenge of interpolation search" paper
+ float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
+ float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);
+ return x1 + num / den;
+}
+
+} // namespace meshopt
+
+#ifndef NDEBUG
+unsigned char* meshopt_simplifyDebugKind = 0;
+unsigned int* meshopt_simplifyDebugLoop = 0;
+unsigned int* meshopt_simplifyDebugLoopBack = 0;
+#endif
+
+size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(target_index_count <= index_count);
+
+ meshopt_Allocator allocator;
+
+ unsigned int* result = destination;
+
+ // build adjacency information
+ EdgeAdjacency adjacency = {};
+ buildEdgeAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+ // build position remap that maps each vertex to the one with identical position
+ unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+ unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count);
+ buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, allocator);
+
+ // classify vertices; vertex kind determines collapse rules, see kCanCollapse
+ unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count);
+ unsigned int* loop = allocator.allocate<unsigned int>(vertex_count);
+ unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count);
+ classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge);
+
+#if TRACE
+ size_t unique_positions = 0;
+ for (size_t i = 0; i < vertex_count; ++i)
+ unique_positions += remap[i] == i;
+
+ printf("position remap: %d vertices => %d positions\n", int(vertex_count), int(unique_positions));
+
+ size_t kinds[Kind_Count] = {};
+ for (size_t i = 0; i < vertex_count; ++i)
+ kinds[vertex_kind[i]] += remap[i] == i;
+
+ printf("kinds: manifold %d, border %d, seam %d, complex %d, locked %d\n",
+ int(kinds[Kind_Manifold]), int(kinds[Kind_Border]), int(kinds[Kind_Seam]), int(kinds[Kind_Complex]), int(kinds[Kind_Locked]));
+#endif
+
+ Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+ rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+ Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count);
+ memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric));
+
+ fillFaceQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap);
+ fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback);
+
+ if (result != indices)
+ memcpy(result, indices, index_count * sizeof(unsigned int));
+
+#if TRACE
+ size_t pass_count = 0;
+ float worst_error = 0;
+#endif
+
+ Collapse* edge_collapses = allocator.allocate<Collapse>(index_count);
+ unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count);
+ unsigned int* collapse_remap = allocator.allocate<unsigned int>(vertex_count);
+ unsigned char* collapse_locked = allocator.allocate<unsigned char>(vertex_count);
+
+ size_t result_count = index_count;
+
+ // target_error input is linear; we need to adjust it to match quadricError units
+ float error_limit = target_error * target_error;
+
+ while (result_count > target_index_count)
+ {
+ size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop);
+
+ // no edges can be collapsed any more due to topology restrictions
+ if (edge_collapse_count == 0)
+ break;
+
+ rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, remap);
+
+#if TRACE > 1
+ dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind);
+#endif
+
+ sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count);
+
+ // most collapses remove 2 triangles; use this to establish a bound on the pass in terms of error limit
+ // note that edge_collapse_goal is an estimate; triangle_collapse_goal will be used to actually limit collapses
+ size_t triangle_collapse_goal = (result_count - target_index_count) / 3;
+ size_t edge_collapse_goal = triangle_collapse_goal / 2;
+
+ // we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked
+ // as they will share vertices with other successfull collapses, we need to increase the acceptable error by this factor
+ const float kPassErrorBound = 1.5f;
+
+ float error_goal = edge_collapse_goal < edge_collapse_count ? edge_collapses[collapse_order[edge_collapse_goal]].error * kPassErrorBound : FLT_MAX;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ collapse_remap[i] = unsigned(i);
+
+ memset(collapse_locked, 0, vertex_count);
+
+ size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, triangle_collapse_goal, error_goal, error_limit);
+
+ // no edges can be collapsed any more due to hitting the error limit or triangle collapse limit
+ if (collapses == 0)
+ break;
+
+ remapEdgeLoops(loop, vertex_count, collapse_remap);
+ remapEdgeLoops(loopback, vertex_count, collapse_remap);
+
+ size_t new_count = remapIndexBuffer(result, result_count, collapse_remap);
+ assert(new_count < result_count);
+
+#if TRACE
+ float pass_error = 0.f;
+ for (size_t i = 0; i < edge_collapse_count; ++i)
+ {
+ Collapse& c = edge_collapses[collapse_order[i]];
+
+ if (collapse_remap[c.v0] == c.v1)
+ pass_error = c.error;
+ }
+
+ pass_count++;
+ worst_error = (worst_error < pass_error) ? pass_error : worst_error;
+
+ printf("pass %d: triangles: %d -> %d, collapses: %d/%d (goal: %d), error: %e (limit %e goal %e)\n", int(pass_count), int(result_count / 3), int(new_count / 3), int(collapses), int(edge_collapse_count), int(edge_collapse_goal), pass_error, error_limit, error_goal);
+#endif
+
+ result_count = new_count;
+ }
+
+#if TRACE
+ printf("passes: %d, worst error: %e\n", int(pass_count), worst_error);
+#endif
+
+#if TRACE > 1
+ dumpLockedCollapses(result, result_count, vertex_kind);
+#endif
+
+#ifndef NDEBUG
+ if (meshopt_simplifyDebugKind)
+ memcpy(meshopt_simplifyDebugKind, vertex_kind, vertex_count);
+
+ if (meshopt_simplifyDebugLoop)
+ memcpy(meshopt_simplifyDebugLoop, loop, vertex_count * sizeof(unsigned int));
+
+ if (meshopt_simplifyDebugLoopBack)
+ memcpy(meshopt_simplifyDebugLoopBack, loopback, vertex_count * sizeof(unsigned int));
+#endif
+
+ return result_count;
+}
+
+size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(target_index_count <= index_count);
+
+ // we expect to get ~2 triangles/vertex in the output
+ size_t target_cell_count = target_index_count / 6;
+
+ if (target_cell_count == 0)
+ return 0;
+
+ meshopt_Allocator allocator;
+
+ Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+ rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+ // find the optimal grid size using guided binary search
+#if TRACE
+ printf("source: %d vertices, %d triangles\n", int(vertex_count), int(index_count / 3));
+ printf("target: %d cells, %d triangles\n", int(target_cell_count), int(target_index_count / 3));
+#endif
+
+ unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+ const int kInterpolationPasses = 5;
+
+ // invariant: # of triangles in min_grid <= target_count
+ int min_grid = 0;
+ int max_grid = 1025;
+ size_t min_triangles = 0;
+ size_t max_triangles = index_count / 3;
+
+ // instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
+ int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+ for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass)
+ {
+ assert(min_triangles < target_index_count / 3);
+ assert(max_grid - min_grid > 1);
+
+ // we clamp the prediction of the grid size to make sure that the search converges
+ int grid_size = next_grid_size;
+ grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;
+
+ computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+ size_t triangles = countTriangles(vertex_ids, indices, index_count);
+
+#if TRACE
+ printf("pass %d (%s): grid size %d, triangles %d, %s\n",
+ pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
+ grid_size, int(triangles),
+ (triangles <= target_index_count / 3) ? "under" : "over");
+#endif
+
+ float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));
+
+ if (triangles <= target_index_count / 3)
+ {
+ min_grid = grid_size;
+ min_triangles = triangles;
+ }
+ else
+ {
+ max_grid = grid_size;
+ max_triangles = triangles;
+ }
+
+ if (triangles == target_index_count / 3 || max_grid - min_grid <= 1)
+ break;
+
+ // we start by using interpolation search - it usually converges faster
+ // however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+ next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+ }
+
+ if (min_triangles == 0)
+ return 0;
+
+ // build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+ size_t table_size = hashBuckets2(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+ unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+ computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+ size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+ // build a quadric for each target cell
+ Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+ memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+ fillCellQuadrics(cell_quadrics, indices, index_count, vertex_positions, vertex_cells);
+
+ // for each target cell, find the vertex with the minimal error
+ unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+ float* cell_errors = allocator.allocate<float>(cell_count);
+
+ fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+ // collapse triangles!
+ // note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
+ size_t tritable_size = hashBuckets2(min_triangles);
+ unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);
+
+ size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
+ assert(write <= target_index_count);
+
+#if TRACE
+ printf("result: %d cells, %d triangles (%d unfiltered)\n", int(cell_count), int(write / 3), int(min_triangles));
+#endif
+
+ return write;
+}
+
+size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count)
+{
+ using namespace meshopt;
+
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+ assert(target_vertex_count <= vertex_count);
+
+ size_t target_cell_count = target_vertex_count;
+
+ if (target_cell_count == 0)
+ return 0;
+
+ meshopt_Allocator allocator;
+
+ Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
+ rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);
+
+ // find the optimal grid size using guided binary search
+#if TRACE
+ printf("source: %d vertices\n", int(vertex_count));
+ printf("target: %d cells\n", int(target_cell_count));
+#endif
+
+ unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);
+
+ size_t table_size = hashBuckets2(vertex_count);
+ unsigned int* table = allocator.allocate<unsigned int>(table_size);
+
+ const int kInterpolationPasses = 5;
+
+ // invariant: # of vertices in min_grid <= target_count
+ int min_grid = 0;
+ int max_grid = 1025;
+ size_t min_vertices = 0;
+ size_t max_vertices = vertex_count;
+
+ // instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
+ int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);
+
+ for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass)
+ {
+ assert(min_vertices < target_vertex_count);
+ assert(max_grid - min_grid > 1);
+
+ // we clamp the prediction of the grid size to make sure that the search converges
+ int grid_size = next_grid_size;
+ grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;
+
+ computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
+ size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count);
+
+#if TRACE
+ printf("pass %d (%s): grid size %d, vertices %d, %s\n",
+ pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
+ grid_size, int(vertices),
+ (vertices <= target_vertex_count) ? "under" : "over");
+#endif
+
+ float tip = interpolate(float(target_vertex_count), float(min_grid), float(min_vertices), float(grid_size), float(vertices), float(max_grid), float(max_vertices));
+
+ if (vertices <= target_vertex_count)
+ {
+ min_grid = grid_size;
+ min_vertices = vertices;
+ }
+ else
+ {
+ max_grid = grid_size;
+ max_vertices = vertices;
+ }
+
+ if (vertices == target_vertex_count || max_grid - min_grid <= 1)
+ break;
+
+ // we start by using interpolation search - it usually converges faster
+ // however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
+ next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
+ }
+
+ if (min_vertices == 0)
+ return 0;
+
+ // build vertex->cell association by mapping all vertices with the same quantized position to the same cell
+ unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);
+
+ computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
+ size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);
+
+ // build a quadric for each target cell
+ Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
+ memset(cell_quadrics, 0, cell_count * sizeof(Quadric));
+
+ fillCellQuadrics(cell_quadrics, vertex_positions, vertex_count, vertex_cells);
+
+ // for each target cell, find the vertex with the minimal error
+ unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
+ float* cell_errors = allocator.allocate<float>(cell_count);
+
+ fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);
+
+ // copy results to the output
+ assert(cell_count <= target_vertex_count);
+ memcpy(destination, cell_remap, sizeof(unsigned int) * cell_count);
+
+#if TRACE
+ printf("result: %d cells\n", int(cell_count));
+#endif
+
+ return cell_count;
+}
diff --git a/thirdparty/meshoptimizer/spatialorder.cpp b/thirdparty/meshoptimizer/spatialorder.cpp
new file mode 100644
index 0000000000..b09f80ac6f
--- /dev/null
+++ b/thirdparty/meshoptimizer/spatialorder.cpp
@@ -0,0 +1,194 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <float.h>
+#include <string.h>
+
+// This work is based on:
+// Fabian Giesen. Decoding Morton codes. 2009
+namespace meshopt
+{
+
+// "Insert" two 0 bits after each of the 10 low bits of x
+inline unsigned int part1By2(unsigned int x)
+{
+ x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210
+ x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210
+ x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210
+ x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10
+ x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0
+ return x;
+}
+
+static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride)
+{
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX};
+ float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX};
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ const float* v = vertex_positions_data + i * vertex_stride_float;
+
+ for (int j = 0; j < 3; ++j)
+ {
+ float vj = v[j];
+
+ minv[j] = minv[j] > vj ? vj : minv[j];
+ maxv[j] = maxv[j] < vj ? vj : maxv[j];
+ }
+ }
+
+ float extent = 0.f;
+
+ extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]);
+ extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]);
+ extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]);
+
+ float scale = extent == 0 ? 0.f : 1.f / extent;
+
+ // generate Morton order based on the position inside a unit cube
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ const float* v = vertex_positions_data + i * vertex_stride_float;
+
+ int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f);
+ int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f);
+ int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f);
+
+ result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2);
+ }
+}
+
+static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count)
+{
+ memset(hist, 0, sizeof(hist));
+
+ // compute 3 10-bit histograms in parallel
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = data[i];
+
+ hist[(id >> 0) & 1023][0]++;
+ hist[(id >> 10) & 1023][1]++;
+ hist[(id >> 20) & 1023][2]++;
+ }
+
+ unsigned int sumx = 0, sumy = 0, sumz = 0;
+
+ // replace histogram data with prefix histogram sums in-place
+ for (int i = 0; i < 1024; ++i)
+ {
+ unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2];
+
+ hist[i][0] = sumx;
+ hist[i][1] = sumy;
+ hist[i][2] = sumz;
+
+ sumx += hx;
+ sumy += hy;
+ sumz += hz;
+ }
+
+ assert(sumx == count && sumy == count && sumz == count);
+}
+
+static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass)
+{
+ int bitoff = pass * 10;
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int id = (keys[source[i]] >> bitoff) & 1023;
+
+ destination[hist[id][pass]++] = source[i];
+ }
+}
+
+} // namespace meshopt
+
+void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ meshopt_Allocator allocator;
+
+ unsigned int* keys = allocator.allocate<unsigned int>(vertex_count);
+ computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride);
+
+ unsigned int hist[1024][3];
+ computeHistogram(hist, keys, vertex_count);
+
+ unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count);
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ destination[i] = unsigned(i);
+
+ // 3-pass radix sort computes the resulting order into scratch
+ radixPass(scratch, destination, keys, vertex_count, hist, 0);
+ radixPass(destination, scratch, keys, vertex_count, hist, 1);
+ radixPass(scratch, destination, keys, vertex_count, hist, 2);
+
+ // since our remap table is mapping old=>new, we need to reverse it
+ for (size_t i = 0; i < vertex_count; ++i)
+ destination[scratch[i]] = unsigned(i);
+}
+
+void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+ assert(vertex_positions_stride % sizeof(float) == 0);
+
+ (void)vertex_count;
+
+ size_t face_count = index_count / 3;
+ size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+
+ meshopt_Allocator allocator;
+
+ float* centroids = allocator.allocate<float>(face_count * 3);
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ const float* va = vertex_positions + a * vertex_stride_float;
+ const float* vb = vertex_positions + b * vertex_stride_float;
+ const float* vc = vertex_positions + c * vertex_stride_float;
+
+ centroids[i * 3 + 0] = (va[0] + vb[0] + vc[0]) / 3.f;
+ centroids[i * 3 + 1] = (va[1] + vb[1] + vc[1]) / 3.f;
+ centroids[i * 3 + 2] = (va[2] + vb[2] + vc[2]) / 3.f;
+ }
+
+ unsigned int* remap = allocator.allocate<unsigned int>(face_count);
+
+ meshopt_spatialSortRemap(remap, centroids, face_count, sizeof(float) * 3);
+
+ // support in-order remap
+ if (destination == indices)
+ {
+ unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+ memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+ indices = indices_copy;
+ }
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+ unsigned int r = remap[i];
+
+ destination[r * 3 + 0] = a;
+ destination[r * 3 + 1] = b;
+ destination[r * 3 + 2] = c;
+ }
+}
diff --git a/thirdparty/meshoptimizer/stripifier.cpp b/thirdparty/meshoptimizer/stripifier.cpp
new file mode 100644
index 0000000000..8ce17ef3dc
--- /dev/null
+++ b/thirdparty/meshoptimizer/stripifier.cpp
@@ -0,0 +1,295 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <string.h>
+
+// This work is based on:
+// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996
+namespace meshopt
+{
+
+static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence)
+{
+ unsigned int index = 0;
+ unsigned int iv = ~0u;
+
+ for (size_t i = 0; i < buffer_size; ++i)
+ {
+ unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]];
+ unsigned int v = (va < vb && va < vc) ? va : (vb < vc) ? vb : vc;
+
+ if (v < iv)
+ {
+ index = unsigned(i);
+ iv = v;
+ }
+ }
+
+ return index;
+}
+
+static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1)
+{
+ for (size_t i = 0; i < buffer_size; ++i)
+ {
+ unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+ if (e0 == a && e1 == b)
+ return (int(i) << 2) | 2;
+ else if (e0 == b && e1 == c)
+ return (int(i) << 2) | 0;
+ else if (e0 == c && e1 == a)
+ return (int(i) << 2) | 1;
+ }
+
+ return -1;
+}
+
+} // namespace meshopt
+
+size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index)
+{
+ assert(destination != indices);
+ assert(index_count % 3 == 0);
+
+ using namespace meshopt;
+
+ meshopt_Allocator allocator;
+
+ const size_t buffer_capacity = 8;
+
+ unsigned int buffer[buffer_capacity][3] = {};
+ unsigned int buffer_size = 0;
+
+ size_t index_offset = 0;
+
+ unsigned int strip[2] = {};
+ unsigned int parity = 0;
+
+ size_t strip_size = 0;
+
+ // compute vertex valence; this is used to prioritize starting triangle for strips
+ unsigned int* valence = allocator.allocate<unsigned int>(vertex_count);
+ memset(valence, 0, vertex_count * sizeof(unsigned int));
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ valence[index]++;
+ }
+
+ int next = -1;
+
+ while (buffer_size > 0 || index_offset < index_count)
+ {
+ assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3));
+
+ // fill triangle buffer
+ while (buffer_size < buffer_capacity && index_offset < index_count)
+ {
+ buffer[buffer_size][0] = indices[index_offset + 0];
+ buffer[buffer_size][1] = indices[index_offset + 1];
+ buffer[buffer_size][2] = indices[index_offset + 2];
+
+ buffer_size++;
+ index_offset += 3;
+ }
+
+ assert(buffer_size > 0);
+
+ if (next >= 0)
+ {
+ unsigned int i = next >> 2;
+ unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+ unsigned int v = buffer[i][next & 3];
+
+ // ordered removal from the buffer
+ memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+ buffer_size--;
+
+ // update vertex valences for strip start heuristic
+ valence[a]--;
+ valence[b]--;
+ valence[c]--;
+
+ // find next triangle (note that edge order flips on every iteration)
+ // in some cases we need to perform a swap to pick a different outgoing triangle edge
+ // for [a b c], the default strip edge is [b c], but we might want to use [a c]
+ int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]);
+ int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1;
+
+ if (cont < 0 && swap >= 0)
+ {
+ // [a b c] => [a b a c]
+ destination[strip_size++] = strip[0];
+ destination[strip_size++] = v;
+
+ // next strip has same winding
+ // ? a b => b a v
+ strip[1] = v;
+
+ next = swap;
+ }
+ else
+ {
+ // emit the next vertex in the strip
+ destination[strip_size++] = v;
+
+ // next strip has flipped winding
+ strip[0] = strip[1];
+ strip[1] = v;
+ parity ^= 1;
+
+ next = cont;
+ }
+ }
+ else
+ {
+ // if we didn't find anything, we need to find the next new triangle
+ // we use a heuristic to maximize the strip length
+ unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]);
+ unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2];
+
+ // ordered removal from the buffer
+ memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0]));
+ buffer_size--;
+
+ // update vertex valences for strip start heuristic
+ valence[a]--;
+ valence[b]--;
+ valence[c]--;
+
+ // we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration
+ int ea = findStripNext(buffer, buffer_size, c, b);
+ int eb = findStripNext(buffer, buffer_size, a, c);
+ int ec = findStripNext(buffer, buffer_size, b, a);
+
+ // in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest
+ // triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear
+ // reasons - slightly improves the stripification efficiency
+ int mine = INT_MAX;
+ mine = (ea >= 0 && mine > ea) ? ea : mine;
+ mine = (eb >= 0 && mine > eb) ? eb : mine;
+ mine = (ec >= 0 && mine > ec) ? ec : mine;
+
+ if (ea == mine)
+ {
+ // keep abc
+ next = ea;
+ }
+ else if (eb == mine)
+ {
+ // abc -> bca
+ unsigned int t = a;
+ a = b, b = c, c = t;
+
+ next = eb;
+ }
+ else if (ec == mine)
+ {
+ // abc -> cab
+ unsigned int t = c;
+ c = b, b = a, a = t;
+
+ next = ec;
+ }
+
+ if (restart_index)
+ {
+ if (strip_size)
+ destination[strip_size++] = restart_index;
+
+ destination[strip_size++] = a;
+ destination[strip_size++] = b;
+ destination[strip_size++] = c;
+
+ // new strip always starts with the same edge winding
+ strip[0] = b;
+ strip[1] = c;
+ parity = 1;
+ }
+ else
+ {
+ if (strip_size)
+ {
+ // connect last strip using degenerate triangles
+ destination[strip_size++] = strip[1];
+ destination[strip_size++] = a;
+ }
+
+ // note that we may need to flip the emitted triangle based on parity
+ // we always end up with outgoing edge "cb" in the end
+ unsigned int e0 = parity ? c : b;
+ unsigned int e1 = parity ? b : c;
+
+ destination[strip_size++] = a;
+ destination[strip_size++] = e0;
+ destination[strip_size++] = e1;
+
+ strip[0] = e0;
+ strip[1] = e1;
+ parity ^= 1;
+ }
+ }
+ }
+
+ return strip_size;
+}
+
+size_t meshopt_stripifyBound(size_t index_count)
+{
+ assert(index_count % 3 == 0);
+
+ // worst case without restarts is 2 degenerate indices and 3 indices per triangle
+ // worst case with restarts is 1 restart index and 3 indices per triangle
+ return (index_count / 3) * 5;
+}
+
+size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index)
+{
+ assert(destination != indices);
+
+ size_t offset = 0;
+ size_t start = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ if (restart_index && indices[i] == restart_index)
+ {
+ start = i + 1;
+ }
+ else if (i - start >= 2)
+ {
+ unsigned int a = indices[i - 2], b = indices[i - 1], c = indices[i];
+
+ // flip winding for odd triangles
+ if ((i - start) & 1)
+ {
+ unsigned int t = a;
+ a = b, b = t;
+ }
+
+ // although we use restart indices, strip swaps still produce degenerate triangles, so skip them
+ if (a != b && a != c && b != c)
+ {
+ destination[offset + 0] = a;
+ destination[offset + 1] = b;
+ destination[offset + 2] = c;
+ offset += 3;
+ }
+ }
+ }
+
+ return offset;
+}
+
+size_t meshopt_unstripifyBound(size_t index_count)
+{
+ assert(index_count == 0 || index_count >= 3);
+
+ return (index_count == 0) ? 0 : (index_count - 2) * 3;
+}
diff --git a/thirdparty/meshoptimizer/vcacheanalyzer.cpp b/thirdparty/meshoptimizer/vcacheanalyzer.cpp
new file mode 100644
index 0000000000..3682743820
--- /dev/null
+++ b/thirdparty/meshoptimizer/vcacheanalyzer.cpp
@@ -0,0 +1,73 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size)
+{
+ assert(index_count % 3 == 0);
+ assert(cache_size >= 3);
+ assert(warp_size == 0 || warp_size >= 3);
+
+ meshopt_Allocator allocator;
+
+ meshopt_VertexCacheStatistics result = {};
+
+ unsigned int warp_offset = 0;
+ unsigned int primgroup_offset = 0;
+
+ unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+ memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+ unsigned int timestamp = cache_size + 1;
+
+ for (size_t i = 0; i < index_count; i += 3)
+ {
+ unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+ assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+ bool ac = (timestamp - cache_timestamps[a]) > cache_size;
+ bool bc = (timestamp - cache_timestamps[b]) > cache_size;
+ bool cc = (timestamp - cache_timestamps[c]) > cache_size;
+
+ // flush cache if triangle doesn't fit into warp or into the primitive buffer
+ if ((primgroup_size && primgroup_offset == primgroup_size) || (warp_size && warp_offset + ac + bc + cc > warp_size))
+ {
+ result.warps_executed += warp_offset > 0;
+
+ warp_offset = 0;
+ primgroup_offset = 0;
+
+ // reset cache
+ timestamp += cache_size + 1;
+ }
+
+ // update cache and add vertices to warp
+ for (int j = 0; j < 3; ++j)
+ {
+ unsigned int index = indices[i + j];
+
+ if (timestamp - cache_timestamps[index] > cache_size)
+ {
+ cache_timestamps[index] = timestamp++;
+ result.vertices_transformed++;
+ warp_offset++;
+ }
+ }
+
+ primgroup_offset++;
+ }
+
+ size_t unique_vertex_count = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ unique_vertex_count += cache_timestamps[i] > 0;
+
+ result.warps_executed += warp_offset > 0;
+
+ result.acmr = index_count == 0 ? 0 : float(result.vertices_transformed) / float(index_count / 3);
+ result.atvr = unique_vertex_count == 0 ? 0 : float(result.vertices_transformed) / float(unique_vertex_count);
+
+ return result;
+}
diff --git a/thirdparty/meshoptimizer/vcacheoptimizer.cpp b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
new file mode 100644
index 0000000000..fb8ade4b77
--- /dev/null
+++ b/thirdparty/meshoptimizer/vcacheoptimizer.cpp
@@ -0,0 +1,473 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// This work is based on:
+// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006
+// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007
+namespace meshopt
+{
+
+const size_t kCacheSizeMax = 16;
+const size_t kValenceMax = 8;
+
+struct VertexScoreTable
+{
+ float cache[1 + kCacheSizeMax];
+ float live[1 + kValenceMax];
+};
+
+// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD
+static const VertexScoreTable kVertexScoreTable = {
+ {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f},
+ {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f},
+};
+
+// Tuned to minimize the encoded index buffer size
+static const VertexScoreTable kVertexScoreTableStrip = {
+ {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f},
+ {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f},
+};
+
+struct TriangleAdjacency
+{
+ unsigned int* counts;
+ unsigned int* offsets;
+ unsigned int* data;
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator)
+{
+ size_t face_count = index_count / 3;
+
+ // allocate arrays
+ adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+ adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+ adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+ // fill triangle counts
+ memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ assert(indices[i] < vertex_count);
+
+ adjacency.counts[indices[i]]++;
+ }
+
+ // fill offset table
+ unsigned int offset = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ adjacency.offsets[i] = offset;
+ offset += adjacency.counts[i];
+ }
+
+ assert(offset == index_count);
+
+ // fill triangle data
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+ adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+ adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+ adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+ }
+
+ // fix offsets that have been disturbed by the previous pass
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+ adjacency.offsets[i] -= adjacency.counts[i];
+ }
+}
+
+static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count)
+{
+ // check dead-end stack
+ while (dead_end_top)
+ {
+ unsigned int vertex = dead_end[--dead_end_top];
+
+ if (live_triangles[vertex] > 0)
+ return vertex;
+ }
+
+ // input order
+ while (input_cursor < vertex_count)
+ {
+ if (live_triangles[input_cursor] > 0)
+ return input_cursor;
+
+ ++input_cursor;
+ }
+
+ return ~0u;
+}
+
+static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size)
+{
+ unsigned int best_candidate = ~0u;
+ int best_priority = -1;
+
+ for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate)
+ {
+ unsigned int vertex = *next_candidate;
+
+ // otherwise we don't need to process it
+ if (live_triangles[vertex] > 0)
+ {
+ int priority = 0;
+
+ // will it be in cache after fanning?
+ if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size)
+ {
+ priority = timestamp - cache_timestamps[vertex]; // position in cache
+ }
+
+ if (priority > best_priority)
+ {
+ best_candidate = vertex;
+ best_priority = priority;
+ }
+ }
+ }
+
+ return best_candidate;
+}
+
+static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles)
+{
+ assert(cache_position >= -1 && cache_position < int(kCacheSizeMax));
+
+ unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax;
+
+ return table->cache[1 + cache_position] + table->live[live_triangles_clamped];
+}
+
+static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count)
+{
+ // input order
+ while (input_cursor < face_count)
+ {
+ if (!emitted_flags[input_cursor])
+ return input_cursor;
+
+ ++input_cursor;
+ }
+
+ return ~0u;
+}
+
+} // namespace meshopt
+
+void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+
+ meshopt_Allocator allocator;
+
+ // guard for empty meshes
+ if (index_count == 0 || vertex_count == 0)
+ return;
+
+ // support in-place optimization
+ if (destination == indices)
+ {
+ unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+ memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+ indices = indices_copy;
+ }
+
+ unsigned int cache_size = 16;
+ assert(cache_size <= kCacheSizeMax);
+
+ size_t face_count = index_count / 3;
+
+ // build adjacency information
+ TriangleAdjacency adjacency = {};
+ buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+ // live triangle counts
+ unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+ memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+ // emitted flags
+ unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+ memset(emitted_flags, 0, face_count);
+
+ // compute initial vertex scores
+ float* vertex_scores = allocator.allocate<float>(vertex_count);
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ vertex_scores[i] = vertexScore(table, -1, live_triangles[i]);
+
+ // compute triangle scores
+ float* triangle_scores = allocator.allocate<float>(face_count);
+
+ for (size_t i = 0; i < face_count; ++i)
+ {
+ unsigned int a = indices[i * 3 + 0];
+ unsigned int b = indices[i * 3 + 1];
+ unsigned int c = indices[i * 3 + 2];
+
+ triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c];
+ }
+
+ unsigned int cache_holder[2 * (kCacheSizeMax + 3)];
+ unsigned int* cache = cache_holder;
+ unsigned int* cache_new = cache_holder + kCacheSizeMax + 3;
+ size_t cache_count = 0;
+
+ unsigned int current_triangle = 0;
+ unsigned int input_cursor = 1;
+
+ unsigned int output_triangle = 0;
+
+ while (current_triangle != ~0u)
+ {
+ assert(output_triangle < face_count);
+
+ unsigned int a = indices[current_triangle * 3 + 0];
+ unsigned int b = indices[current_triangle * 3 + 1];
+ unsigned int c = indices[current_triangle * 3 + 2];
+
+ // output indices
+ destination[output_triangle * 3 + 0] = a;
+ destination[output_triangle * 3 + 1] = b;
+ destination[output_triangle * 3 + 2] = c;
+ output_triangle++;
+
+ // update emitted flags
+ emitted_flags[current_triangle] = true;
+ triangle_scores[current_triangle] = 0;
+
+ // new triangle
+ size_t cache_write = 0;
+ cache_new[cache_write++] = a;
+ cache_new[cache_write++] = b;
+ cache_new[cache_write++] = c;
+
+ // old triangles
+ for (size_t i = 0; i < cache_count; ++i)
+ {
+ unsigned int index = cache[i];
+
+ if (index != a && index != b && index != c)
+ {
+ cache_new[cache_write++] = index;
+ }
+ }
+
+ unsigned int* cache_temp = cache;
+ cache = cache_new, cache_new = cache_temp;
+ cache_count = cache_write > cache_size ? cache_size : cache_write;
+
+ // update live triangle counts
+ live_triangles[a]--;
+ live_triangles[b]--;
+ live_triangles[c]--;
+
+ // remove emitted triangle from adjacency data
+ // this makes sure that we spend less time traversing these lists on subsequent iterations
+ for (size_t k = 0; k < 3; ++k)
+ {
+ unsigned int index = indices[current_triangle * 3 + k];
+
+ unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
+ size_t neighbours_size = adjacency.counts[index];
+
+ for (size_t i = 0; i < neighbours_size; ++i)
+ {
+ unsigned int tri = neighbours[i];
+
+ if (tri == current_triangle)
+ {
+ neighbours[i] = neighbours[neighbours_size - 1];
+ adjacency.counts[index]--;
+ break;
+ }
+ }
+ }
+
+ unsigned int best_triangle = ~0u;
+ float best_score = 0;
+
+ // update cache positions, vertex scores and triangle scores, and find next best triangle
+ for (size_t i = 0; i < cache_write; ++i)
+ {
+ unsigned int index = cache[i];
+
+ int cache_position = i >= cache_size ? -1 : int(i);
+
+ // update vertex score
+ float score = vertexScore(table, cache_position, live_triangles[index]);
+ float score_diff = score - vertex_scores[index];
+
+ vertex_scores[index] = score;
+
+ // update scores of vertex triangles
+ const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index];
+ const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index];
+
+ for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+ {
+ unsigned int tri = *it;
+ assert(!emitted_flags[tri]);
+
+ float tri_score = triangle_scores[tri] + score_diff;
+ assert(tri_score > 0);
+
+ if (best_score < tri_score)
+ {
+ best_triangle = tri;
+ best_score = tri_score;
+ }
+
+ triangle_scores[tri] = tri_score;
+ }
+ }
+
+ // step through input triangles in order if we hit a dead-end
+ current_triangle = best_triangle;
+
+ if (current_triangle == ~0u)
+ {
+ current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count);
+ }
+ }
+
+ assert(input_cursor == face_count);
+ assert(output_triangle == face_count);
+}
+
+void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+ meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable);
+}
+
+void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+ meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip);
+}
+
+void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size)
+{
+ using namespace meshopt;
+
+ assert(index_count % 3 == 0);
+ assert(cache_size >= 3);
+
+ meshopt_Allocator allocator;
+
+ // guard for empty meshes
+ if (index_count == 0 || vertex_count == 0)
+ return;
+
+ // support in-place optimization
+ if (destination == indices)
+ {
+ unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count);
+ memcpy(indices_copy, indices, index_count * sizeof(unsigned int));
+ indices = indices_copy;
+ }
+
+ size_t face_count = index_count / 3;
+
+ // build adjacency information
+ TriangleAdjacency adjacency = {};
+ buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+ // live triangle counts
+ unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count);
+ memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+ // cache time stamps
+ unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count);
+ memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int));
+
+ // dead-end stack
+ unsigned int* dead_end = allocator.allocate<unsigned int>(index_count);
+ unsigned int dead_end_top = 0;
+
+ // emitted flags
+ unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count);
+ memset(emitted_flags, 0, face_count);
+
+ unsigned int current_vertex = 0;
+
+ unsigned int timestamp = cache_size + 1;
+ unsigned int input_cursor = 1; // vertex to restart from in case of dead-end
+
+ unsigned int output_triangle = 0;
+
+ while (current_vertex != ~0u)
+ {
+ const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top;
+
+ // emit all vertex neighbours
+ const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex];
+ const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex];
+
+ for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it)
+ {
+ unsigned int triangle = *it;
+
+ if (!emitted_flags[triangle])
+ {
+ unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+
+ // output indices
+ destination[output_triangle * 3 + 0] = a;
+ destination[output_triangle * 3 + 1] = b;
+ destination[output_triangle * 3 + 2] = c;
+ output_triangle++;
+
+ // update dead-end stack
+ dead_end[dead_end_top + 0] = a;
+ dead_end[dead_end_top + 1] = b;
+ dead_end[dead_end_top + 2] = c;
+ dead_end_top += 3;
+
+ // update live triangle counts
+ live_triangles[a]--;
+ live_triangles[b]--;
+ live_triangles[c]--;
+
+ // update cache info
+ // if vertex is not in cache, put it in cache
+ if (timestamp - cache_timestamps[a] > cache_size)
+ cache_timestamps[a] = timestamp++;
+
+ if (timestamp - cache_timestamps[b] > cache_size)
+ cache_timestamps[b] = timestamp++;
+
+ if (timestamp - cache_timestamps[c] > cache_size)
+ cache_timestamps[c] = timestamp++;
+
+ // update emitted flags
+ emitted_flags[triangle] = true;
+ }
+ }
+
+ // next candidates are the ones we pushed to dead-end stack just now
+ const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top;
+
+ // get next vertex
+ current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size);
+
+ if (current_vertex == ~0u)
+ {
+ current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count);
+ }
+ }
+
+ assert(output_triangle == face_count);
+}
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
new file mode 100644
index 0000000000..784c9a13db
--- /dev/null
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -0,0 +1,1265 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings
+#if defined(__AVX__) || defined(__SSSE3__)
+#define SIMD_SSE
+#endif
+
+// An experimental implementation using AVX512 instructions; it's only enabled when AVX512 is enabled through compiler settings
+#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__)
+#undef SIMD_SSE
+#define SIMD_AVX
+#endif
+
+// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#endif
+
+// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback
+#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__))
+#define SIMD_SSE
+#define SIMD_FALLBACK
+#define SIMD_TARGET __attribute__((target("ssse3")))
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+#endif
+
+#ifndef SIMD_TARGET
+#define SIMD_TARGET
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <tmmintrin.h>
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+#ifdef _MSC_VER
+#include <intrin.h> // __cpuid
+#else
+#include <cpuid.h> // __cpuid
+#endif
+#endif
+
+#ifdef SIMD_AVX
+#include <immintrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifndef TRACE
+#define TRACE 0
+#endif
+
+#if TRACE
+#include <stdio.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i)
+#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23)
+#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31)
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2)
+#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3)
+#endif
+
+namespace meshopt
+{
+
+const unsigned char kVertexHeader = 0xa0;
+
+static int gEncodeVertexVersion = 0;
+
+const size_t kVertexBlockSizeBytes = 8192;
+const size_t kVertexBlockMaxSize = 256;
+const size_t kByteGroupSize = 16;
+const size_t kByteGroupDecodeLimit = 24;
+const size_t kTailMaxSize = 32;
+
+static size_t getVertexBlockSize(size_t vertex_size)
+{
+ // make sure the entire block fits into the scratch buffer
+ size_t result = kVertexBlockSizeBytes / vertex_size;
+
+ // align to byte group size; we encode each byte as a byte group
+ // if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
+ result &= ~(kByteGroupSize - 1);
+
+ return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
+}
+
+inline unsigned char zigzag8(unsigned char v)
+{
+ return ((signed char)(v) >> 7) ^ (v << 1);
+}
+
+inline unsigned char unzigzag8(unsigned char v)
+{
+ return -(v & 1) ^ (v >> 1);
+}
+
+#if TRACE
+struct Stats
+{
+ size_t size;
+ size_t header;
+ size_t bitg[4];
+ size_t bitb[4];
+};
+
+Stats* bytestats;
+Stats vertexstats[256];
+#endif
+
+static bool encodeBytesGroupZero(const unsigned char* buffer)
+{
+ for (size_t i = 0; i < kByteGroupSize; ++i)
+ if (buffer[i])
+ return false;
+
+ return true;
+}
+
+static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
+{
+ assert(bits >= 1 && bits <= 8);
+
+ if (bits == 1)
+ return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
+
+ if (bits == 8)
+ return kByteGroupSize;
+
+ size_t result = kByteGroupSize * bits / 8;
+
+ unsigned char sentinel = (1 << bits) - 1;
+
+ for (size_t i = 0; i < kByteGroupSize; ++i)
+ result += buffer[i] >= sentinel;
+
+ return result;
+}
+
+static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits)
+{
+ assert(bits >= 1 && bits <= 8);
+
+ if (bits == 1)
+ return data;
+
+ if (bits == 8)
+ {
+ memcpy(data, buffer, kByteGroupSize);
+ return data + kByteGroupSize;
+ }
+
+ size_t byte_size = 8 / bits;
+ assert(kByteGroupSize % byte_size == 0);
+
+ // fixed portion: bits bits for each value
+ // variable portion: full byte for each out-of-range value (using 1...1 as sentinel)
+ unsigned char sentinel = (1 << bits) - 1;
+
+ for (size_t i = 0; i < kByteGroupSize; i += byte_size)
+ {
+ unsigned char byte = 0;
+
+ for (size_t k = 0; k < byte_size; ++k)
+ {
+ unsigned char enc = (buffer[i + k] >= sentinel) ? sentinel : buffer[i + k];
+
+ byte <<= bits;
+ byte |= enc;
+ }
+
+ *data++ = byte;
+ }
+
+ for (size_t i = 0; i < kByteGroupSize; ++i)
+ {
+ if (buffer[i] >= sentinel)
+ {
+ *data++ = buffer[i];
+ }
+ }
+
+ return data;
+}
+
+static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size)
+{
+ assert(buffer_size % kByteGroupSize == 0);
+
+ unsigned char* header = data;
+
+ // round number of groups to 4 to get number of header bytes
+ size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+ if (size_t(data_end - data) < header_size)
+ return 0;
+
+ data += header_size;
+
+ memset(header, 0, header_size);
+
+ for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+ {
+ if (size_t(data_end - data) < kByteGroupDecodeLimit)
+ return 0;
+
+ int best_bits = 8;
+ size_t best_size = encodeBytesGroupMeasure(buffer + i, 8);
+
+ for (int bits = 1; bits < 8; bits *= 2)
+ {
+ size_t size = encodeBytesGroupMeasure(buffer + i, bits);
+
+ if (size < best_size)
+ {
+ best_bits = bits;
+ best_size = size;
+ }
+ }
+
+ int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 2 : 3;
+ assert((1 << bitslog2) == best_bits);
+
+ size_t header_offset = i / kByteGroupSize;
+
+ header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2);
+
+ unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits);
+
+ assert(data + best_size == next);
+ data = next;
+
+#if TRACE > 1
+ bytestats->bitg[bitslog2]++;
+ bytestats->bitb[bitslog2] += best_size;
+#endif
+ }
+
+#if TRACE > 1
+ bytestats->header += header_size;
+#endif
+
+ return data;
+}
+
+static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+ assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+ unsigned char buffer[kVertexBlockMaxSize];
+ assert(sizeof(buffer) % kByteGroupSize == 0);
+
+ // we sometimes encode elements we didn't fill when rounding to kByteGroupSize
+ memset(buffer, 0, sizeof(buffer));
+
+ for (size_t k = 0; k < vertex_size; ++k)
+ {
+ size_t vertex_offset = k;
+
+ unsigned char p = last_vertex[k];
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ buffer[i] = zigzag8(vertex_data[vertex_offset] - p);
+
+ p = vertex_data[vertex_offset];
+
+ vertex_offset += vertex_size;
+ }
+
+#if TRACE
+ const unsigned char* olddata = data;
+ bytestats = &vertexstats[k];
+#endif
+
+ data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1));
+ if (!data)
+ return 0;
+
+#if TRACE
+ bytestats = 0;
+ vertexstats[k].size += data - olddata;
+#endif
+ }
+
+ memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size);
+
+ return data;
+}
+
+#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX))
+static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+#define READ() byte = *data++
+#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = (enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1)
+
+ unsigned char byte, enc, encv;
+ const unsigned char* data_var;
+
+ switch (bitslog2)
+ {
+ case 0:
+ memset(buffer, 0, kByteGroupSize);
+ return data;
+ case 1:
+ data_var = data + 4;
+
+ // 4 groups with 4 2-bit values in each byte
+ READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+ READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+ READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+ READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2);
+
+ return data_var;
+ case 2:
+ data_var = data + 8;
+
+ // 8 groups with 2 4-bit values in each byte
+ READ(), NEXT(4), NEXT(4);
+ READ(), NEXT(4), NEXT(4);
+ READ(), NEXT(4), NEXT(4);
+ READ(), NEXT(4), NEXT(4);
+ READ(), NEXT(4), NEXT(4);
+ READ(), NEXT(4), NEXT(4);
+ READ(), NEXT(4), NEXT(4);
+ READ(), NEXT(4), NEXT(4);
+
+ return data_var;
+ case 3:
+ memcpy(buffer, data, kByteGroupSize);
+ return data + kByteGroupSize;
+ default:
+ assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+ return data;
+ }
+
+#undef READ
+#undef NEXT
+}
+
+static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{
+ assert(buffer_size % kByteGroupSize == 0);
+
+ const unsigned char* header = data;
+
+ // round number of groups to 4 to get number of header bytes
+ size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+ if (size_t(data_end - data) < header_size)
+ return 0;
+
+ data += header_size;
+
+ for (size_t i = 0; i < buffer_size; i += kByteGroupSize)
+ {
+ if (size_t(data_end - data) < kByteGroupDecodeLimit)
+ return 0;
+
+ size_t header_offset = i / kByteGroupSize;
+
+ int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
+
+ data = decodeBytesGroup(data, buffer + i, bitslog2);
+ }
+
+ return data;
+}
+
+static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+ assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+ unsigned char buffer[kVertexBlockMaxSize];
+ unsigned char transposed[kVertexBlockSizeBytes];
+
+ size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
+ for (size_t k = 0; k < vertex_size; ++k)
+ {
+ data = decodeBytes(data, data_end, buffer, vertex_count_aligned);
+ if (!data)
+ return 0;
+
+ size_t vertex_offset = k;
+
+ unsigned char p = last_vertex[k];
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ {
+ unsigned char v = unzigzag8(buffer[i]) + p;
+
+ transposed[vertex_offset] = v;
+ p = v;
+
+ vertex_offset += vertex_size;
+ }
+ }
+
+ memcpy(vertex_data, transposed, vertex_count * vertex_size);
+
+ memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
+
+ return data;
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+static unsigned char kDecodeBytesGroupShuffle[256][8];
+static unsigned char kDecodeBytesGroupCount[256];
+
+#ifdef __wasm__
+__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop!
+#endif
+static bool
+decodeBytesGroupBuildTables()
+{
+ for (int mask = 0; mask < 256; ++mask)
+ {
+ unsigned char shuffle[8];
+ unsigned char count = 0;
+
+ for (int i = 0; i < 8; ++i)
+ {
+ int maski = (mask >> i) & 1;
+ shuffle[i] = maski ? count : 0x80;
+ count += (unsigned char)(maski);
+ }
+
+ memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8);
+ kDecodeBytesGroupCount[mask] = count;
+ }
+
+ return true;
+}
+
+static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables();
+#endif
+
+#ifdef SIMD_SSE
+SIMD_TARGET
+static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{
+ __m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0]));
+ __m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1]));
+ __m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]);
+
+ __m128i sm1r = _mm_add_epi8(sm1, sm1off);
+
+ return _mm_unpacklo_epi64(sm0, sm1r);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+ switch (bitslog2)
+ {
+ case 0:
+ {
+ __m128i result = _mm_setzero_si128();
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return data;
+ }
+
+ case 1:
+ {
+#ifdef __GNUC__
+ typedef int __attribute__((aligned(1))) unaligned_int;
+#else
+ typedef int unaligned_int;
+#endif
+
+ __m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data));
+ __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4));
+
+ __m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2);
+ __m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22);
+ __m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3));
+
+ __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3));
+ int mask16 = _mm_movemask_epi8(mask);
+ unsigned char mask0 = (unsigned char)(mask16 & 255);
+ unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+ __m128i shuf = decodeShuffleMask(mask0, mask1);
+
+ __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
+ case 2:
+ {
+ __m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+ __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8));
+
+ __m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4);
+ __m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15));
+
+ __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15));
+ int mask16 = _mm_movemask_epi8(mask);
+ unsigned char mask0 = (unsigned char)(mask16 & 255);
+ unsigned char mask1 = (unsigned char)(mask16 >> 8);
+
+ __m128i shuf = decodeShuffleMask(mask0, mask1);
+
+ __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel));
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
+ case 3:
+ {
+ __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return data + 16;
+ }
+
+ default:
+ assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+ return data;
+ }
+}
+#endif
+
+#ifdef SIMD_AVX
+static const __m128i decodeBytesGroupConfig[] = {
+ _mm_set1_epi8(3),
+ _mm_set1_epi8(15),
+ _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24),
+ _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56),
+};
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+ switch (bitslog2)
+ {
+ case 0:
+ {
+ __m128i result = _mm_setzero_si128();
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return data;
+ }
+
+ case 1:
+ case 2:
+ {
+ const unsigned char* skip = data + (bitslog2 << 2);
+
+ __m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data));
+ __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip));
+
+ __m128i sent = decodeBytesGroupConfig[bitslog2 - 1];
+ __m128i ctrl = decodeBytesGroupConfig[bitslog2 + 1];
+
+ __m128i selw = _mm_shuffle_epi32(selb, 0x44);
+ __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw));
+ __mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ);
+
+ __m128i result = _mm_mask_expand_epi8(sel, mask16, rest);
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return skip + _mm_popcnt_u32(mask16);
+ }
+
+ case 3:
+ {
+ __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data));
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result);
+
+ return data + 16;
+ }
+
+ default:
+ assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+ return data;
+ }
+}
+#endif
+
+#ifdef SIMD_NEON
+static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1)
+{
+ uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]);
+ uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]);
+
+ uint8x8_t r0 = vtbl1_u8(rest0, sm0);
+ uint8x8_t r1 = vtbl1_u8(rest1, sm1);
+
+ return vcombine_u8(r0, r1);
+}
+
+static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1)
+{
+ static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128};
+
+ uint8x16_t byte_mask = vld1q_u8(byte_mask_data);
+ uint8x16_t masked = vandq_u8(mask, byte_mask);
+
+#ifdef __aarch64__
+ // aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc
+ mask0 = vaddv_u8(vget_low_u8(masked));
+ mask1 = vaddv_u8(vget_high_u8(masked));
+#else
+ // we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8)
+ uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked));
+ uint8x8_t sum2 = vpadd_u8(sum1, sum1);
+ uint8x8_t sum3 = vpadd_u8(sum2, sum2);
+
+ mask0 = vget_lane_u8(sum3, 0);
+ mask1 = vget_lane_u8(sum3, 1);
+#endif
+}
+
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+ switch (bitslog2)
+ {
+ case 0:
+ {
+ uint8x16_t result = vdupq_n_u8(0);
+
+ vst1q_u8(buffer, result);
+
+ return data;
+ }
+
+ case 1:
+ {
+ uint8x8_t sel2 = vld1_u8(data);
+ uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0];
+ uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22);
+ uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3));
+
+ uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3));
+ unsigned char mask0, mask1;
+ neonMoveMask(mask, mask0, mask1);
+
+ uint8x8_t rest0 = vld1_u8(data + 4);
+ uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]);
+
+ uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
+
+ vst1q_u8(buffer, result);
+
+ return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
+ case 2:
+ {
+ uint8x8_t sel4 = vld1_u8(data);
+ uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15)));
+ uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]);
+
+ uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15));
+ unsigned char mask0, mask1;
+ neonMoveMask(mask, mask0, mask1);
+
+ uint8x8_t rest0 = vld1_u8(data + 8);
+ uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]);
+
+ uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel);
+
+ vst1q_u8(buffer, result);
+
+ return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
+ case 3:
+ {
+ uint8x16_t result = vld1q_u8(data);
+
+ vst1q_u8(buffer, result);
+
+ return data + 16;
+ }
+
+ default:
+ assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+ return data;
+ }
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
+{
+ v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]);
+ v128_t sm1 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]);
+
+ v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]);
+ sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+
+ v128_t sm1r = wasm_i8x16_add(sm1, sm1off);
+
+ return wasmx_unpacklo_v64x2(sm0, sm1r);
+}
+
+SIMD_TARGET
+static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
+{
+ v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3);
+
+ uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
+ uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull;
+
+ // TODO: This can use v8x16_bitmask in the future
+ uint64_t mask_2 = mask_1a | mask_1b;
+ uint64_t mask_4 = mask_2 | (mask_2 >> 16);
+ uint64_t mask_8 = mask_4 | (mask_4 >> 8);
+
+ mask0 = uint8_t(mask_8);
+ mask1 = uint8_t(mask_8 >> 32);
+}
+
+SIMD_TARGET
+static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2)
+{
+ unsigned char byte, enc, encv;
+ const unsigned char* data_var;
+
+ switch (bitslog2)
+ {
+ case 0:
+ {
+ v128_t result = wasm_i8x16_splat(0);
+
+ wasm_v128_store(buffer, result);
+
+ return data;
+ }
+
+ case 1:
+ {
+ v128_t sel2 = wasm_v128_load(data);
+ v128_t rest = wasm_v128_load(data + 4);
+
+ v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2);
+ v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22);
+ v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3));
+
+ v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3));
+
+ unsigned char mask0, mask1;
+ wasmMoveMask(mask, mask0, mask1);
+
+ v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+ v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+
+ wasm_v128_store(buffer, result);
+
+ return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
+ case 2:
+ {
+ v128_t sel4 = wasm_v128_load(data);
+ v128_t rest = wasm_v128_load(data + 8);
+
+ v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4);
+ v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15));
+
+ v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15));
+
+ unsigned char mask0, mask1;
+ wasmMoveMask(mask, mask0, mask1);
+
+ v128_t shuf = decodeShuffleMask(mask0, mask1);
+
+ v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask);
+
+ wasm_v128_store(buffer, result);
+
+ return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1];
+ }
+
+ case 3:
+ {
+ v128_t result = wasm_v128_load(data);
+
+ wasm_v128_store(buffer, result);
+
+ return data + 16;
+ }
+
+ default:
+ assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value
+ return data;
+ }
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+SIMD_TARGET
+static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3)
+{
+ __m128i t0 = _mm_unpacklo_epi8(x0, x1);
+ __m128i t1 = _mm_unpackhi_epi8(x0, x1);
+ __m128i t2 = _mm_unpacklo_epi8(x2, x3);
+ __m128i t3 = _mm_unpackhi_epi8(x2, x3);
+
+ x0 = _mm_unpacklo_epi16(t0, t2);
+ x1 = _mm_unpackhi_epi16(t0, t2);
+ x2 = _mm_unpacklo_epi16(t1, t3);
+ x3 = _mm_unpackhi_epi16(t1, t3);
+}
+
+SIMD_TARGET
+static __m128i unzigzag8(__m128i v)
+{
+ __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1)));
+ __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127));
+
+ return _mm_xor_si128(xl, xr);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3)
+{
+ uint8x16x2_t t01 = vzipq_u8(x0, x1);
+ uint8x16x2_t t23 = vzipq_u8(x2, x3);
+
+ uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0]));
+ uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1]));
+
+ x0 = vreinterpretq_u8_u16(x01.val[0]);
+ x1 = vreinterpretq_u8_u16(x01.val[1]);
+ x2 = vreinterpretq_u8_u16(x23.val[0]);
+ x3 = vreinterpretq_u8_u16(x23.val[1]);
+}
+
+static uint8x16_t unzigzag8(uint8x16_t v)
+{
+ uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1)))));
+ uint8x16_t xr = vshrq_n_u8(v, 1);
+
+ return veorq_u8(xl, xr);
+}
+#endif
+
+#ifdef SIMD_WASM
+SIMD_TARGET
+static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3)
+{
+ v128_t t0 = wasmx_unpacklo_v8x16(x0, x1);
+ v128_t t1 = wasmx_unpackhi_v8x16(x0, x1);
+ v128_t t2 = wasmx_unpacklo_v8x16(x2, x3);
+ v128_t t3 = wasmx_unpackhi_v8x16(x2, x3);
+
+ x0 = wasmx_unpacklo_v16x8(t0, t2);
+ x1 = wasmx_unpackhi_v16x8(t0, t2);
+ x2 = wasmx_unpacklo_v16x8(t1, t3);
+ x3 = wasmx_unpackhi_v16x8(t1, t3);
+}
+
+SIMD_TARGET
+static v128_t unzigzag8(v128_t v)
+{
+ v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1)));
+ v128_t xr = wasm_u8x16_shr(v, 1);
+
+ return wasm_v128_xor(xl, xr);
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+SIMD_TARGET
+static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size)
+{
+ assert(buffer_size % kByteGroupSize == 0);
+ assert(kByteGroupSize == 16);
+
+ const unsigned char* header = data;
+
+ // round number of groups to 4 to get number of header bytes
+ size_t header_size = (buffer_size / kByteGroupSize + 3) / 4;
+
+ if (size_t(data_end - data) < header_size)
+ return 0;
+
+ data += header_size;
+
+ size_t i = 0;
+
+ // fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b
+ for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4)
+ {
+ size_t header_offset = i / kByteGroupSize;
+ unsigned char header_byte = header[header_offset / 4];
+
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3);
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, (header_byte >> 2) & 3);
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3);
+ data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3);
+ }
+
+ // slow-path: process remaining groups
+ for (; i < buffer_size; i += kByteGroupSize)
+ {
+ if (size_t(data_end - data) < kByteGroupDecodeLimit)
+ return 0;
+
+ size_t header_offset = i / kByteGroupSize;
+
+ int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3;
+
+ data = decodeBytesGroupSimd(data, buffer + i, bitslog2);
+ }
+
+ return data;
+}
+
+SIMD_TARGET
+static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256])
+{
+ assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize);
+
+ unsigned char buffer[kVertexBlockMaxSize * 4];
+ unsigned char transposed[kVertexBlockSizeBytes];
+
+ size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1);
+
+ for (size_t k = 0; k < vertex_size; k += 4)
+ {
+ for (size_t j = 0; j < 4; ++j)
+ {
+ data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned);
+ if (!data)
+ return 0;
+ }
+
+#if defined(SIMD_SSE) || defined(SIMD_AVX)
+#define TEMP __m128i
+#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k))
+#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned))
+#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3)
+#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size
+#endif
+
+#ifdef SIMD_NEON
+#define TEMP uint8x8_t
+#define PREP() uint8x8_t pi = vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0))
+#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1))
+#define FIXD(i) t##i = pi = vadd_u8(pi, t##i)
+#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size
+#endif
+
+#ifdef SIMD_WASM
+#define TEMP v128_t
+#define PREP() v128_t pi = wasm_v128_load(last_vertex + k)
+#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned)
+#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3)
+#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i)
+#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size
+#endif
+
+ PREP();
+
+ unsigned char* savep = transposed + k;
+
+ for (size_t j = 0; j < vertex_count_aligned; j += 16)
+ {
+ LOAD(0);
+ LOAD(1);
+ LOAD(2);
+ LOAD(3);
+
+ r0 = unzigzag8(r0);
+ r1 = unzigzag8(r1);
+ r2 = unzigzag8(r2);
+ r3 = unzigzag8(r3);
+
+ transpose8(r0, r1, r2, r3);
+
+ TEMP t0, t1, t2, t3;
+
+ GRP4(0);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ GRP4(1);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ GRP4(2);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+ GRP4(3);
+ FIXD(0), FIXD(1), FIXD(2), FIXD(3);
+ SAVE(0), SAVE(1), SAVE(2), SAVE(3);
+
+#undef TEMP
+#undef PREP
+#undef LOAD
+#undef GRP4
+#undef FIXD
+#undef SAVE
+ }
+ }
+
+ memcpy(vertex_data, transposed, vertex_count * vertex_size);
+
+ memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size);
+
+ return data;
+}
+#endif
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+static unsigned int getCpuFeatures()
+{
+ int cpuinfo[4] = {};
+#ifdef _MSC_VER
+ __cpuid(cpuinfo, 1);
+#else
+ __cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]);
+#endif
+ return cpuinfo[2];
+}
+
+unsigned int cpuid = getCpuFeatures();
+#endif
+
+} // namespace meshopt
+
+size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+ using namespace meshopt;
+
+ assert(vertex_size > 0 && vertex_size <= 256);
+ assert(vertex_size % 4 == 0);
+
+#if TRACE
+ memset(vertexstats, 0, sizeof(vertexstats));
+#endif
+
+ const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices);
+
+ unsigned char* data = buffer;
+ unsigned char* data_end = buffer + buffer_size;
+
+ if (size_t(data_end - data) < 1 + vertex_size)
+ return 0;
+
+ int version = gEncodeVertexVersion;
+
+ *data++ = (unsigned char)(kVertexHeader | version);
+
+ unsigned char first_vertex[256] = {};
+ if (vertex_count > 0)
+ memcpy(first_vertex, vertex_data, vertex_size);
+
+ unsigned char last_vertex[256] = {};
+ memcpy(last_vertex, first_vertex, vertex_size);
+
+ size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+ size_t vertex_offset = 0;
+
+ while (vertex_offset < vertex_count)
+ {
+ size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
+
+ data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+ if (!data)
+ return 0;
+
+ vertex_offset += block_size;
+ }
+
+ size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+ if (size_t(data_end - data) < tail_size)
+ return 0;
+
+ // write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder
+ if (vertex_size < kTailMaxSize)
+ {
+ memset(data, 0, kTailMaxSize - vertex_size);
+ data += kTailMaxSize - vertex_size;
+ }
+
+ memcpy(data, first_vertex, vertex_size);
+ data += vertex_size;
+
+ assert(data >= buffer + tail_size);
+ assert(data <= buffer + buffer_size);
+
+#if TRACE
+ size_t total_size = data - buffer;
+
+ for (size_t k = 0; k < vertex_size; ++k)
+ {
+ const Stats& vsk = vertexstats[k];
+
+ printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8);
+
+#if TRACE > 1
+ printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)",
+ int(vsk.header),
+ int(vsk.bitg[0]), int(vsk.bitb[0]),
+ int(vsk.bitg[1]), int(vsk.bitb[1]),
+ int(vsk.bitg[2]), int(vsk.bitb[2]),
+ int(vsk.bitg[3]), int(vsk.bitb[3]));
+#endif
+
+ printf("\n");
+ }
+#endif
+
+ return data - buffer;
+}
+
+size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size)
+{
+ using namespace meshopt;
+
+ assert(vertex_size > 0 && vertex_size <= 256);
+ assert(vertex_size % 4 == 0);
+
+ size_t vertex_block_size = getVertexBlockSize(vertex_size);
+ size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size;
+
+ size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4;
+ size_t vertex_block_data_size = vertex_block_size;
+
+ size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+ return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size;
+}
+
+void meshopt_encodeVertexVersion(int version)
+{
+ assert(unsigned(version) <= 0);
+
+ meshopt::gEncodeVertexVersion = version;
+}
+
+int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size)
+{
+ using namespace meshopt;
+
+ assert(vertex_size > 0 && vertex_size <= 256);
+ assert(vertex_size % 4 == 0);
+
+ const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0;
+
+#if defined(SIMD_SSE) && defined(SIMD_FALLBACK)
+ decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock;
+#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM)
+ decode = decodeVertexBlockSimd;
+#else
+ decode = decodeVertexBlock;
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+ assert(gDecodeBytesGroupInitialized);
+ (void)gDecodeBytesGroupInitialized;
+#endif
+
+ unsigned char* vertex_data = static_cast<unsigned char*>(destination);
+
+ const unsigned char* data = buffer;
+ const unsigned char* data_end = buffer + buffer_size;
+
+ if (size_t(data_end - data) < 1 + vertex_size)
+ return -2;
+
+ unsigned char data_header = *data++;
+
+ if ((data_header & 0xf0) != kVertexHeader)
+ return -1;
+
+ int version = data_header & 0x0f;
+ if (version > 0)
+ return -1;
+
+ unsigned char last_vertex[256];
+ memcpy(last_vertex, data_end - vertex_size, vertex_size);
+
+ size_t vertex_block_size = getVertexBlockSize(vertex_size);
+
+ size_t vertex_offset = 0;
+
+ while (vertex_offset < vertex_count)
+ {
+ size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset;
+
+ data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex);
+ if (!data)
+ return -2;
+
+ vertex_offset += block_size;
+ }
+
+ size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size;
+
+ if (size_t(data_end - data) != tail_size)
+ return -3;
+
+ return 0;
+}
+
+#undef SIMD_NEON
+#undef SIMD_SSE
+#undef SIMD_AVX
+#undef SIMD_WASM
+#undef SIMD_FALLBACK
+#undef SIMD_TARGET
diff --git a/thirdparty/meshoptimizer/vertexfilter.cpp b/thirdparty/meshoptimizer/vertexfilter.cpp
new file mode 100644
index 0000000000..e7ad2c9d39
--- /dev/null
+++ b/thirdparty/meshoptimizer/vertexfilter.cpp
@@ -0,0 +1,825 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <math.h>
+
+// The block below auto-detects SIMD ISA that can be used on the target platform
+#ifndef MESHOPTIMIZER_NO_SIMD
+
+// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings
+#if defined(__SSE2__)
+#define SIMD_SSE
+#endif
+
+// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2
+#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64))
+#define SIMD_SSE
+#endif
+
+// GCC/clang define these when NEON support is available
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#define SIMD_NEON
+#endif
+
+// On MSVC, we assume that ARM builds always target NEON-capable devices
+#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
+#define SIMD_NEON
+#endif
+
+// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD
+#if defined(__wasm_simd128__)
+#define SIMD_WASM
+#endif
+
+#endif // !MESHOPTIMIZER_NO_SIMD
+
+#ifdef SIMD_SSE
+#include <emmintrin.h>
+#include <stdint.h>
+#endif
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifdef SIMD_NEON
+#if defined(_MSC_VER) && defined(_M_ARM64)
+#include <arm64_neon.h>
+#else
+#include <arm_neon.h>
+#endif
+#endif
+
+#ifdef SIMD_WASM
+#include <wasm_simd128.h>
+#endif
+
+#ifdef SIMD_WASM
+#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11)
+#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15)
+#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6)
+#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7)
+#endif
+
+namespace meshopt
+{
+
+#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM)
+template <typename T>
+static void decodeFilterOct(T* data, size_t count)
+{
+ const float max = float((1 << (sizeof(T) * 8 - 1)) - 1);
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+ float x = float(data[i * 4 + 0]);
+ float y = float(data[i * 4 + 1]);
+ float z = float(data[i * 4 + 2]) - fabsf(x) - fabsf(y);
+
+ // fixup octahedral coordinates for z<0
+ float t = (z >= 0.f) ? 0.f : z;
+
+ x += (x >= 0.f) ? t : -t;
+ y += (y >= 0.f) ? t : -t;
+
+ // compute normal length & scale
+ float l = sqrtf(x * x + y * y + z * z);
+ float s = max / l;
+
+ // rounded signed float->int
+ int xf = int(x * s + (x >= 0.f ? 0.5f : -0.5f));
+ int yf = int(y * s + (y >= 0.f ? 0.5f : -0.5f));
+ int zf = int(z * s + (z >= 0.f ? 0.5f : -0.5f));
+
+ data[i * 4 + 0] = T(xf);
+ data[i * 4 + 1] = T(yf);
+ data[i * 4 + 2] = T(zf);
+ }
+}
+
+static void decodeFilterQuat(short* data, size_t count)
+{
+ const float scale = 1.f / sqrtf(2.f);
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ // recover scale from the high byte of the component
+ int sf = data[i * 4 + 3] | 3;
+ float ss = scale / float(sf);
+
+ // convert x/y/z to [-1..1] (scaled...)
+ float x = float(data[i * 4 + 0]) * ss;
+ float y = float(data[i * 4 + 1]) * ss;
+ float z = float(data[i * 4 + 2]) * ss;
+
+ // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+ float ww = 1.f - x * x - y * y - z * z;
+ float w = sqrtf(ww >= 0.f ? ww : 0.f);
+
+ // rounded signed float->int
+ int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f));
+ int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f));
+ int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f));
+ int wf = int(w * 32767.f + 0.5f);
+
+ int qc = data[i * 4 + 3] & 3;
+
+ // output order is dictated by input index
+ data[i * 4 + ((qc + 1) & 3)] = short(xf);
+ data[i * 4 + ((qc + 2) & 3)] = short(yf);
+ data[i * 4 + ((qc + 3) & 3)] = short(zf);
+ data[i * 4 + ((qc + 0) & 3)] = short(wf);
+ }
+}
+
+static void decodeFilterExp(unsigned int* data, size_t count)
+{
+ for (size_t i = 0; i < count; ++i)
+ {
+ unsigned int v = data[i];
+
+ // decode mantissa and exponent
+ int m = int(v << 8) >> 8;
+ int e = int(v) >> 24;
+
+ union
+ {
+ float f;
+ unsigned int ui;
+ } u;
+
+ // optimized version of ldexp(float(m), e)
+ u.ui = unsigned(e + 127) << 23;
+ u.f = u.f * float(m);
+
+ data[i] = u.ui;
+ }
+}
+#endif
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+inline uint64_t rotateleft64(uint64_t v, int x)
+{
+#if defined(_MSC_VER) && !defined(__clang__)
+ return _rotl64(v, x);
+// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for
+// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions
+#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11)
+ return __builtin_rotateleft64(v, x);
+#else
+ return (v << (x & 63)) | (v >> ((64 - x) & 63));
+#endif
+}
+#endif
+
+#ifdef SIMD_SSE
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+ const __m128 sign = _mm_set1_ps(-0.f);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ __m128i n4 = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4]));
+
+ // sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+ __m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24);
+ __m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24);
+
+ // unpack z; note that z is unsigned so we technically don't need to sign extend it
+ __m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24);
+
+ // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+ __m128 x = _mm_cvtepi32_ps(xf);
+ __m128 y = _mm_cvtepi32_ps(yf);
+ __m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+ // fixup octahedral coordinates for z<0
+ __m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+ x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+ y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+ // compute normal length & scale
+ __m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+ __m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll));
+
+ // rounded signed float->int
+ __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+ __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+ __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+ // combine xr/yr/zr into final value
+ __m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000));
+ res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff)));
+ res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8));
+ res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16));
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res);
+ }
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+ const __m128 sign = _mm_set1_ps(-0.f);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ __m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+ __m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+ // gather both x/y 16-bit pairs in each 32-bit lane
+ __m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+
+ // sign-extends each of x,y in [x y] with arithmetic shifts
+ __m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16);
+ __m128i yf = _mm_srai_epi32(n4, 16);
+
+ // unpack z; note that z is unsigned so we don't need to sign extend it
+ __m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+ __m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff));
+
+ // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+ __m128 x = _mm_cvtepi32_ps(xf);
+ __m128 y = _mm_cvtepi32_ps(yf);
+ __m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y)));
+
+ // fixup octahedral coordinates for z<0
+ __m128 t = _mm_min_ps(z, _mm_setzero_ps());
+
+ x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign)));
+ y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign)));
+
+ // compute normal length & scale
+ __m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)));
+ __m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll));
+
+ // rounded signed float->int
+ __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+ __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+ __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+
+ // mix x/z and y/0 to make 16-bit unpack easier
+ __m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+ __m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff));
+
+ // pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+ __m128i res_0 = _mm_unpacklo_epi16(xzr, y0r);
+ __m128i res_1 = _mm_unpackhi_epi16(xzr, y0r);
+
+ // patch in .w
+ res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000)));
+ res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000)));
+
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1);
+ }
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+ const float scale = 1.f / sqrtf(2.f);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ __m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4]));
+ __m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4]));
+
+ // gather both x/y 16-bit pairs in each 32-bit lane
+ __m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0)));
+ __m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1)));
+
+ // sign-extends each of x,y in [x y] with arithmetic shifts
+ __m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16);
+ __m128i yf = _mm_srai_epi32(q4_xy, 16);
+ __m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16);
+ __m128i cf = _mm_srai_epi32(q4_zc, 16);
+
+ // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+ __m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3));
+ __m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf));
+
+ // convert x/y/z to [-1..1] (scaled...)
+ __m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss);
+ __m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss);
+ __m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss);
+
+ // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+ __m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))));
+ __m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps()));
+
+ __m128 s = _mm_set1_ps(32767.f);
+
+ // rounded signed float->int
+ __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s));
+ __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s));
+ __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s));
+ __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s));
+
+ // mix x/z and w/y to make 16-bit unpack easier
+ __m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16));
+ __m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16));
+
+ // pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+ __m128i res_0 = _mm_unpacklo_epi16(wyr, xzr);
+ __m128i res_1 = _mm_unpackhi_epi16(wyr, xzr);
+
+ // store results to stack so that we can rotate using scalar instructions
+ uint64_t res[4];
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0);
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1);
+
+ // rotate and store
+ uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+ out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4);
+ out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4);
+ out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4);
+ out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4);
+ }
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+ for (size_t i = 0; i < count; i += 4)
+ {
+ __m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i]));
+
+ // decode exponent into 2^x directly
+ __m128i ef = _mm_srai_epi32(v, 24);
+ __m128i es = _mm_slli_epi32(_mm_add_epi32(ef, _mm_set1_epi32(127)), 23);
+
+ // decode 24-bit mantissa into floating-point value
+ __m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8);
+ __m128 m = _mm_cvtepi32_ps(mf);
+
+ __m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m);
+
+ _mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r);
+ }
+}
+#endif
+
+#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64)
+inline float32x4_t vsqrtq_f32(float32x4_t x)
+{
+ float32x4_t r = vrsqrteq_f32(x);
+ r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate
+ return vmulq_f32(r, x);
+}
+
+inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y)
+{
+ float32x4_t r = vrecpeq_f32(y);
+ r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate
+ return vmulq_f32(x, r);
+}
+#endif
+
+#ifdef SIMD_NEON
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+ const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]));
+
+ // sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+ int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24);
+ int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24);
+
+ // unpack z; note that z is unsigned so we technically don't need to sign extend it
+ int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24);
+
+ // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+ float32x4_t x = vcvtq_f32_s32(xf);
+ float32x4_t y = vcvtq_f32_s32(yf);
+ float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+ // fixup octahedral coordinates for z<0
+ float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+ x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+ y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+ // compute normal length & scale
+ float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+ float32x4_t rl = vrsqrteq_f32(ll);
+ float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl);
+
+ // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+ // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+ const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+ int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+ int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+ int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+ // combine xr/yr/zr into final value
+ int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000));
+ res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff)));
+ res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8));
+ res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16));
+
+ vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res);
+ }
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+ const int32x4_t sign = vdupq_n_s32(0x80000000);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+ int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+ // gather both x/y 16-bit pairs in each 32-bit lane
+ int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0];
+
+ // sign-extends each of x,y in [x y] with arithmetic shifts
+ int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16);
+ int32x4_t yf = vshrq_n_s32(n4, 16);
+
+ // unpack z; note that z is unsigned so we don't need to sign extend it
+ int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1];
+ int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff));
+
+ // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+ float32x4_t x = vcvtq_f32_s32(xf);
+ float32x4_t y = vcvtq_f32_s32(yf);
+ float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y)));
+
+ // fixup octahedral coordinates for z<0
+ float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f));
+
+ x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign))));
+ y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign))));
+
+ // compute normal length & scale
+ float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)));
+ float32x4_t rl = vrsqrteq_f32(ll);
+ rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate
+ float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl);
+
+ // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+ // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+ const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+ int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+ int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+ int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+
+ // mix x/z and y/0 to make 16-bit unpack easier
+ int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+ int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff));
+
+ // pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+ int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]);
+ int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]);
+
+ // patch in .w
+ res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0);
+ res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1);
+
+ vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0);
+ vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1);
+ }
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+ const float scale = 1.f / sqrtf(2.f);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]));
+ int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]));
+
+ // gather both x/y 16-bit pairs in each 32-bit lane
+ int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0];
+ int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1];
+
+ // sign-extends each of x,y in [x y] with arithmetic shifts
+ int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16);
+ int32x4_t yf = vshrq_n_s32(q4_xy, 16);
+ int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16);
+ int32x4_t cf = vshrq_n_s32(q4_zc, 16);
+
+ // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+ int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3));
+ float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf));
+
+ // convert x/y/z to [-1..1] (scaled...)
+ float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss);
+ float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss);
+ float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss);
+
+ // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+ float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))));
+ float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f)));
+
+ float32x4_t s = vdupq_n_f32(32767.f);
+
+ // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+ // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+ const float32x4_t fsnap = vdupq_n_f32(3 << 22);
+
+ int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap));
+ int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap));
+ int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap));
+ int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap));
+
+ // mix x/z and w/y to make 16-bit unpack easier
+ int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16));
+ int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16));
+
+ // pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+ int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]);
+ int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]);
+
+ // rotate and store
+ uint64_t* out = (uint64_t*)&data[i * 4];
+
+ out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4);
+ out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4);
+ out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4);
+ out[3] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4);
+ }
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+ for (size_t i = 0; i < count; i += 4)
+ {
+ int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i]));
+
+ // decode exponent into 2^x directly
+ int32x4_t ef = vshrq_n_s32(v, 24);
+ int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23);
+
+ // decode 24-bit mantissa into floating-point value
+ int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8);
+ float32x4_t m = vcvtq_f32_s32(mf);
+
+ float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m);
+
+ vst1q_f32(reinterpret_cast<float*>(&data[i]), r);
+ }
+}
+#endif
+
+#ifdef SIMD_WASM
+static void decodeFilterOctSimd(signed char* data, size_t count)
+{
+ const v128_t sign = wasm_f32x4_splat(-0.f);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ v128_t n4 = wasm_v128_load(&data[i * 4]);
+
+ // sign-extends each of x,y in [x y ? ?] with arithmetic shifts
+ v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24);
+ v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24);
+
+ // unpack z; note that z is unsigned so we technically don't need to sign extend it
+ v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24);
+
+ // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+ v128_t x = wasm_f32x4_convert_i32x4(xf);
+ v128_t y = wasm_f32x4_convert_i32x4(yf);
+ v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+ // fixup octahedral coordinates for z<0
+ // note: i32x4_min with 0 is equvalent to f32x4_min
+ v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+ x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+ y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+ // compute normal length & scale
+ v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+ v128_t s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll));
+
+ // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+ // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction
+ const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+ v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+ v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+ v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+ // combine xr/yr/zr into final value
+ v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000));
+ res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff)));
+ res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8));
+ res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16));
+
+ wasm_v128_store(&data[i * 4], res);
+ }
+}
+
+static void decodeFilterOctSimd(short* data, size_t count)
+{
+ const v128_t sign = wasm_f32x4_splat(-0.f);
+ const v128_t zmask = wasm_i32x4_splat(0x7fff);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+ v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+ // gather both x/y 16-bit pairs in each 32-bit lane
+ v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1);
+
+ // sign-extends each of x,y in [x y] with arithmetic shifts
+ v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16);
+ v128_t yf = wasm_i32x4_shr(n4, 16);
+
+ // unpack z; note that z is unsigned so we don't need to sign extend it
+ v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1);
+ v128_t zf = wasm_v128_and(z4, zmask);
+
+ // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count
+ v128_t x = wasm_f32x4_convert_i32x4(xf);
+ v128_t y = wasm_f32x4_convert_i32x4(yf);
+ v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y)));
+
+ // fixup octahedral coordinates for z<0
+ // note: i32x4_min with 0 is equvalent to f32x4_min
+ v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0));
+
+ x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign)));
+ y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign)));
+
+ // compute normal length & scale
+ v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)));
+ v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll));
+
+ // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+ // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+ const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+ v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+ v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+ v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+
+ // mix x/z and y/0 to make 16-bit unpack easier
+ v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+ v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff));
+
+ // pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w
+ v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r);
+ v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r);
+
+ // patch in .w
+ res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000)));
+ res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000)));
+
+ wasm_v128_store(&data[(i + 0) * 4], res_0);
+ wasm_v128_store(&data[(i + 2) * 4], res_1);
+ }
+}
+
+static void decodeFilterQuatSimd(short* data, size_t count)
+{
+ const float scale = 1.f / sqrtf(2.f);
+
+ for (size_t i = 0; i < count; i += 4)
+ {
+ v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]);
+ v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]);
+
+ // gather both x/y 16-bit pairs in each 32-bit lane
+ v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1);
+ v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1);
+
+ // sign-extends each of x,y in [x y] with arithmetic shifts
+ v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16);
+ v128_t yf = wasm_i32x4_shr(q4_xy, 16);
+ v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16);
+ v128_t cf = wasm_i32x4_shr(q4_zc, 16);
+
+ // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f)
+ v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3));
+ v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf));
+
+ // convert x/y/z to [-1..1] (scaled...)
+ v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss);
+ v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss);
+ v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss);
+
+ // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors
+ // note: i32x4_max with 0 is equivalent to f32x4_max
+ v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))));
+ v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0)));
+
+ v128_t s = wasm_f32x4_splat(32767.f);
+
+ // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value
+ // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction
+ const v128_t fsnap = wasm_f32x4_splat(3 << 22);
+
+ v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap);
+ v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap);
+ v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap);
+ v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap);
+
+ // mix x/z and w/y to make 16-bit unpack easier
+ v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16));
+ v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16));
+
+ // pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0)
+ v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr);
+ v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr);
+
+ // compute component index shifted left by 4 (and moved into i32x4 slot)
+ // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449
+ volatile v128_t cm = wasm_i32x4_shl(cf, 4);
+
+ // rotate and store
+ uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]);
+
+ out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0));
+ out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1));
+ out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2));
+ out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3));
+ }
+}
+
+static void decodeFilterExpSimd(unsigned int* data, size_t count)
+{
+ for (size_t i = 0; i < count; i += 4)
+ {
+ v128_t v = wasm_v128_load(&data[i]);
+
+ // decode exponent into 2^x directly
+ v128_t ef = wasm_i32x4_shr(v, 24);
+ v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23);
+
+ // decode 24-bit mantissa into floating-point value
+ v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8);
+ v128_t m = wasm_f32x4_convert_i32x4(mf);
+
+ v128_t r = wasm_f32x4_mul(es, m);
+
+ wasm_v128_store(&data[i], r);
+ }
+}
+#endif
+
+} // namespace meshopt
+
+void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size)
+{
+ using namespace meshopt;
+
+ assert(vertex_count % 4 == 0);
+ assert(vertex_size == 4 || vertex_size == 8);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+ if (vertex_size == 4)
+ decodeFilterOctSimd(static_cast<signed char*>(buffer), vertex_count);
+ else
+ decodeFilterOctSimd(static_cast<short*>(buffer), vertex_count);
+#else
+ if (vertex_size == 4)
+ decodeFilterOct(static_cast<signed char*>(buffer), vertex_count);
+ else
+ decodeFilterOct(static_cast<short*>(buffer), vertex_count);
+#endif
+}
+
+void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size)
+{
+ using namespace meshopt;
+
+ assert(vertex_count % 4 == 0);
+ assert(vertex_size == 8);
+ (void)vertex_size;
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+ decodeFilterQuatSimd(static_cast<short*>(buffer), vertex_count);
+#else
+ decodeFilterQuat(static_cast<short*>(buffer), vertex_count);
+#endif
+}
+
+void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size)
+{
+ using namespace meshopt;
+
+ assert(vertex_count % 4 == 0);
+ assert(vertex_size % 4 == 0);
+
+#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM)
+ decodeFilterExpSimd(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
+#else
+ decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4));
+#endif
+}
+
+#undef SIMD_SSE
+#undef SIMD_NEON
+#undef SIMD_WASM
diff --git a/thirdparty/meshoptimizer/vfetchanalyzer.cpp b/thirdparty/meshoptimizer/vfetchanalyzer.cpp
new file mode 100644
index 0000000000..51dca873f8
--- /dev/null
+++ b/thirdparty/meshoptimizer/vfetchanalyzer.cpp
@@ -0,0 +1,58 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size)
+{
+ assert(index_count % 3 == 0);
+ assert(vertex_size > 0 && vertex_size <= 256);
+
+ meshopt_Allocator allocator;
+
+ meshopt_VertexFetchStatistics result = {};
+
+ unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count);
+ memset(vertex_visited, 0, vertex_count);
+
+ const size_t kCacheLine = 64;
+ const size_t kCacheSize = 128 * 1024;
+
+ // simple direct mapped cache; on typical mesh data this is close to 4-way cache, and this model is a gross approximation anyway
+ size_t cache[kCacheSize / kCacheLine] = {};
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ vertex_visited[index] = 1;
+
+ size_t start_address = index * vertex_size;
+ size_t end_address = start_address + vertex_size;
+
+ size_t start_tag = start_address / kCacheLine;
+ size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine;
+
+ assert(start_tag < end_tag);
+
+ for (size_t tag = start_tag; tag < end_tag; ++tag)
+ {
+ size_t line = tag % (sizeof(cache) / sizeof(cache[0]));
+
+ // we store +1 since cache is filled with 0 by default
+ result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine;
+ cache[line] = tag + 1;
+ }
+ }
+
+ size_t unique_vertex_count = 0;
+
+ for (size_t i = 0; i < vertex_count; ++i)
+ unique_vertex_count += vertex_visited[i];
+
+ result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size);
+
+ return result;
+}
diff --git a/thirdparty/meshoptimizer/vfetchoptimizer.cpp b/thirdparty/meshoptimizer/vfetchoptimizer.cpp
new file mode 100644
index 0000000000..465d6df5ca
--- /dev/null
+++ b/thirdparty/meshoptimizer/vfetchoptimizer.cpp
@@ -0,0 +1,74 @@
+// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details
+#include "meshoptimizer.h"
+
+#include <assert.h>
+#include <string.h>
+
+size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count)
+{
+ assert(index_count % 3 == 0);
+
+ memset(destination, -1, vertex_count * sizeof(unsigned int));
+
+ unsigned int next_vertex = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ if (destination[index] == ~0u)
+ {
+ destination[index] = next_vertex++;
+ }
+ }
+
+ assert(next_vertex <= vertex_count);
+
+ return next_vertex;
+}
+
+size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
+{
+ assert(index_count % 3 == 0);
+ assert(vertex_size > 0 && vertex_size <= 256);
+
+ meshopt_Allocator allocator;
+
+ // support in-place optimization
+ if (destination == vertices)
+ {
+ unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size);
+ memcpy(vertices_copy, vertices, vertex_count * vertex_size);
+ vertices = vertices_copy;
+ }
+
+ // build vertex remap table
+ unsigned int* vertex_remap = allocator.allocate<unsigned int>(vertex_count);
+ memset(vertex_remap, -1, vertex_count * sizeof(unsigned int));
+
+ unsigned int next_vertex = 0;
+
+ for (size_t i = 0; i < index_count; ++i)
+ {
+ unsigned int index = indices[i];
+ assert(index < vertex_count);
+
+ unsigned int& remap = vertex_remap[index];
+
+ if (remap == ~0u) // vertex was not added to destination VB
+ {
+ // add vertex
+ memcpy(static_cast<unsigned char*>(destination) + next_vertex * vertex_size, static_cast<const unsigned char*>(vertices) + index * vertex_size, vertex_size);
+
+ remap = next_vertex++;
+ }
+
+ // modify indices in place
+ indices[i] = remap;
+ }
+
+ assert(next_vertex <= vertex_count);
+
+ return next_vertex;
+}
diff --git a/thirdparty/minimp3/LICENSE b/thirdparty/minimp3/LICENSE
new file mode 100644
index 0000000000..2c4afabdb6
--- /dev/null
+++ b/thirdparty/minimp3/LICENSE
@@ -0,0 +1,117 @@
+CC0 1.0 Universal
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator and
+subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for the
+purpose of contributing to a commons of creative, cultural and scientific
+works ("Commons") that the public can reliably and without fear of later
+claims of infringement build upon, modify, incorporate in other works, reuse
+and redistribute as freely as possible in any form whatsoever and for any
+purposes, including without limitation commercial purposes. These owners may
+contribute to the Commons to promote the ideal of a free culture and the
+further production of creative, cultural and scientific works, or to gain
+reputation or greater distribution for their Work in part through the use and
+efforts of others.
+
+For these and/or other purposes and motivations, and without any expectation
+of additional consideration or compensation, the person associating CC0 with a
+Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
+and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
+and publicly distribute the Work under its terms, with knowledge of his or her
+Copyright and Related Rights in the Work and the meaning and intended legal
+effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not limited
+to, the following:
+
+ i. the right to reproduce, adapt, distribute, perform, display, communicate,
+ and translate a Work;
+
+ ii. moral rights retained by the original author(s) and/or performer(s);
+
+ iii. publicity and privacy rights pertaining to a person's image or likeness
+ depicted in a Work;
+
+ iv. rights protecting against unfair competition in regards to a Work,
+ subject to the limitations in paragraph 4(a), below;
+
+ v. rights protecting the extraction, dissemination, use and reuse of data in
+ a Work;
+
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+ European Parliament and of the Council of 11 March 1996 on the legal
+ protection of databases, and under any national implementation thereof,
+ including any amended or successor version of such directive); and
+
+ vii. other similar, equivalent or corresponding rights throughout the world
+ based on applicable law or treaty, and any national implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention of,
+applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
+unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
+and Related Rights and associated claims and causes of action, whether now
+known or unknown (including existing as well as future claims and causes of
+action), in the Work (i) in all territories worldwide, (ii) for the maximum
+duration provided by applicable law or treaty (including future time
+extensions), (iii) in any current or future medium and for any number of
+copies, and (iv) for any purpose whatsoever, including without limitation
+commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
+the Waiver for the benefit of each member of the public at large and to the
+detriment of Affirmer's heirs and successors, fully intending that such Waiver
+shall not be subject to revocation, rescission, cancellation, termination, or
+any other legal or equitable action to disrupt the quiet enjoyment of the Work
+by the public as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason be
+judged legally invalid or ineffective under applicable law, then the Waiver
+shall be preserved to the maximum extent permitted taking into account
+Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
+is so judged Affirmer hereby grants to each affected person a royalty-free,
+non transferable, non sublicensable, non exclusive, irrevocable and
+unconditional license to exercise Affirmer's Copyright and Related Rights in
+the Work (i) in all territories worldwide, (ii) for the maximum duration
+provided by applicable law or treaty (including future time extensions), (iii)
+in any current or future medium and for any number of copies, and (iv) for any
+purpose whatsoever, including without limitation commercial, advertising or
+promotional purposes (the "License"). The License shall be deemed effective as
+of the date CC0 was applied by Affirmer to the Work. Should any part of the
+License for any reason be judged legally invalid or ineffective under
+applicable law, such partial invalidity or ineffectiveness shall not
+invalidate the remainder of the License, and in such case Affirmer hereby
+affirms that he or she will not (i) exercise any of his or her remaining
+Copyright and Related Rights in the Work or (ii) assert any associated claims
+and causes of action with respect to the Work, in either case contrary to
+Affirmer's express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+ surrendered, licensed or otherwise affected by this document.
+
+ b. Affirmer offers the Work as-is and makes no representations or warranties
+ of any kind concerning the Work, express, implied, statutory or otherwise,
+ including without limitation warranties of title, merchantability, fitness
+ for a particular purpose, non infringement, or the absence of latent or
+ other defects, accuracy, or the present or absence of errors, whether or not
+ discoverable, all to the greatest extent permissible under applicable law.
+
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+ that may apply to the Work or any use thereof, including without limitation
+ any person's Copyright and Related Rights in the Work. Further, Affirmer
+ disclaims responsibility for obtaining any necessary consents, permissions
+ or other rights required for any use of the Work.
+
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+ party to this document and has no duty or obligation with respect to this
+ CC0 or use of the Work.
+
+For more information, please see
+<http://creativecommons.org/publicdomain/zero/1.0/>
+
diff --git a/thirdparty/minimp3/minimp3.h b/thirdparty/minimp3/minimp3.h
new file mode 100644
index 0000000000..796cbc1f8e
--- /dev/null
+++ b/thirdparty/minimp3/minimp3.h
@@ -0,0 +1,1855 @@
+#ifndef MINIMP3_H
+#define MINIMP3_H
+/*
+ https://github.com/lieff/minimp3
+ To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide.
+ This software is distributed without any warranty.
+ See <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#include <stdint.h>
+
+#define MINIMP3_MAX_SAMPLES_PER_FRAME (1152*2)
+
+typedef struct
+{
+ int frame_bytes, frame_offset, channels, hz, layer, bitrate_kbps;
+} mp3dec_frame_info_t;
+
+typedef struct
+{
+ float mdct_overlap[2][9*32], qmf_state[15*2*32];
+ int reserv, free_format_bytes;
+ unsigned char header[4], reserv_buf[511];
+} mp3dec_t;
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+void mp3dec_init(mp3dec_t *dec);
+#ifndef MINIMP3_FLOAT_OUTPUT
+typedef int16_t mp3d_sample_t;
+#else /* MINIMP3_FLOAT_OUTPUT */
+typedef float mp3d_sample_t;
+void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples);
+#endif /* MINIMP3_FLOAT_OUTPUT */
+int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* MINIMP3_H */
+#if defined(MINIMP3_IMPLEMENTATION) && !defined(_MINIMP3_IMPLEMENTATION_GUARD)
+#define _MINIMP3_IMPLEMENTATION_GUARD
+
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX_FREE_FORMAT_FRAME_SIZE 2304 /* more than ISO spec's */
+#ifndef MAX_FRAME_SYNC_MATCHES
+#define MAX_FRAME_SYNC_MATCHES 10
+#endif /* MAX_FRAME_SYNC_MATCHES */
+
+#define MAX_L3_FRAME_PAYLOAD_BYTES MAX_FREE_FORMAT_FRAME_SIZE /* MUST be >= 320000/8/32000*1152 = 1440 */
+
+#define MAX_BITRESERVOIR_BYTES 511
+#define SHORT_BLOCK_TYPE 2
+#define STOP_BLOCK_TYPE 3
+#define MODE_MONO 3
+#define MODE_JOINT_STEREO 1
+#define HDR_SIZE 4
+#define HDR_IS_MONO(h) (((h[3]) & 0xC0) == 0xC0)
+#define HDR_IS_MS_STEREO(h) (((h[3]) & 0xE0) == 0x60)
+#define HDR_IS_FREE_FORMAT(h) (((h[2]) & 0xF0) == 0)
+#define HDR_IS_CRC(h) (!((h[1]) & 1))
+#define HDR_TEST_PADDING(h) ((h[2]) & 0x2)
+#define HDR_TEST_MPEG1(h) ((h[1]) & 0x8)
+#define HDR_TEST_NOT_MPEG25(h) ((h[1]) & 0x10)
+#define HDR_TEST_I_STEREO(h) ((h[3]) & 0x10)
+#define HDR_TEST_MS_STEREO(h) ((h[3]) & 0x20)
+#define HDR_GET_STEREO_MODE(h) (((h[3]) >> 6) & 3)
+#define HDR_GET_STEREO_MODE_EXT(h) (((h[3]) >> 4) & 3)
+#define HDR_GET_LAYER(h) (((h[1]) >> 1) & 3)
+#define HDR_GET_BITRATE(h) ((h[2]) >> 4)
+#define HDR_GET_SAMPLE_RATE(h) (((h[2]) >> 2) & 3)
+#define HDR_GET_MY_SAMPLE_RATE(h) (HDR_GET_SAMPLE_RATE(h) + (((h[1] >> 3) & 1) + ((h[1] >> 4) & 1))*3)
+#define HDR_IS_FRAME_576(h) ((h[1] & 14) == 2)
+#define HDR_IS_LAYER_1(h) ((h[1] & 6) == 6)
+
+#define BITS_DEQUANTIZER_OUT -1
+#define MAX_SCF (255 + BITS_DEQUANTIZER_OUT*4 - 210)
+#define MAX_SCFI ((MAX_SCF + 3) & ~3)
+
+#define MINIMP3_MIN(a, b) ((a) > (b) ? (b) : (a))
+#define MINIMP3_MAX(a, b) ((a) < (b) ? (b) : (a))
+
+#if !defined(MINIMP3_NO_SIMD)
+
+#if !defined(MINIMP3_ONLY_SIMD) && (defined(_M_X64) || defined(__x86_64__) || defined(__aarch64__) || defined(_M_ARM64))
+/* x64 always have SSE2, arm64 always have neon, no need for generic code */
+#define MINIMP3_ONLY_SIMD
+#endif /* SIMD checks... */
+
+#if (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) || ((defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__))
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif /* defined(_MSC_VER) */
+#include <immintrin.h>
+#define HAVE_SSE 1
+#define HAVE_SIMD 1
+#define VSTORE _mm_storeu_ps
+#define VLD _mm_loadu_ps
+#define VSET _mm_set1_ps
+#define VADD _mm_add_ps
+#define VSUB _mm_sub_ps
+#define VMUL _mm_mul_ps
+#define VMAC(a, x, y) _mm_add_ps(a, _mm_mul_ps(x, y))
+#define VMSB(a, x, y) _mm_sub_ps(a, _mm_mul_ps(x, y))
+#define VMUL_S(x, s) _mm_mul_ps(x, _mm_set1_ps(s))
+#define VREV(x) _mm_shuffle_ps(x, x, _MM_SHUFFLE(0, 1, 2, 3))
+typedef __m128 f4;
+#if defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD)
+#define minimp3_cpuid __cpuid
+#else /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */
+static __inline__ __attribute__((always_inline)) void minimp3_cpuid(int CPUInfo[], const int InfoType)
+{
+#if defined(__PIC__)
+ __asm__ __volatile__(
+#if defined(__x86_64__)
+ "push %%rbx\n"
+ "cpuid\n"
+ "xchgl %%ebx, %1\n"
+ "pop %%rbx\n"
+#else /* defined(__x86_64__) */
+ "xchgl %%ebx, %1\n"
+ "cpuid\n"
+ "xchgl %%ebx, %1\n"
+#endif /* defined(__x86_64__) */
+ : "=a" (CPUInfo[0]), "=r" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
+ : "a" (InfoType));
+#else /* defined(__PIC__) */
+ __asm__ __volatile__(
+ "cpuid"
+ : "=a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
+ : "a" (InfoType));
+#endif /* defined(__PIC__)*/
+}
+#endif /* defined(_MSC_VER) || defined(MINIMP3_ONLY_SIMD) */
+static int have_simd(void)
+{
+#ifdef MINIMP3_ONLY_SIMD
+ return 1;
+#else /* MINIMP3_ONLY_SIMD */
+ static int g_have_simd;
+ int CPUInfo[4];
+#ifdef MINIMP3_TEST
+ static int g_counter;
+ if (g_counter++ > 100)
+ return 0;
+#endif /* MINIMP3_TEST */
+ if (g_have_simd)
+ goto end;
+ minimp3_cpuid(CPUInfo, 0);
+ g_have_simd = 1;
+ if (CPUInfo[0] > 0)
+ {
+ minimp3_cpuid(CPUInfo, 1);
+ g_have_simd = (CPUInfo[3] & (1 << 26)) + 1; /* SSE2 */
+ }
+end:
+ return g_have_simd - 1;
+#endif /* MINIMP3_ONLY_SIMD */
+}
+#elif defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM64)
+#include <arm_neon.h>
+#define HAVE_SSE 0
+#define HAVE_SIMD 1
+#define VSTORE vst1q_f32
+#define VLD vld1q_f32
+#define VSET vmovq_n_f32
+#define VADD vaddq_f32
+#define VSUB vsubq_f32
+#define VMUL vmulq_f32
+#define VMAC(a, x, y) vmlaq_f32(a, x, y)
+#define VMSB(a, x, y) vmlsq_f32(a, x, y)
+#define VMUL_S(x, s) vmulq_f32(x, vmovq_n_f32(s))
+#define VREV(x) vcombine_f32(vget_high_f32(vrev64q_f32(x)), vget_low_f32(vrev64q_f32(x)))
+typedef float32x4_t f4;
+static int have_simd()
+{ /* TODO: detect neon for !MINIMP3_ONLY_SIMD */
+ return 1;
+}
+#else /* SIMD checks... */
+#define HAVE_SSE 0
+#define HAVE_SIMD 0
+#ifdef MINIMP3_ONLY_SIMD
+#error MINIMP3_ONLY_SIMD used, but SSE/NEON not enabled
+#endif /* MINIMP3_ONLY_SIMD */
+#endif /* SIMD checks... */
+#else /* !defined(MINIMP3_NO_SIMD) */
+#define HAVE_SIMD 0
+#endif /* !defined(MINIMP3_NO_SIMD) */
+
+#if defined(__ARM_ARCH) && (__ARM_ARCH >= 6) && !defined(__aarch64__) && !defined(_M_ARM64)
+#define HAVE_ARMV6 1
+static __inline__ __attribute__((always_inline)) int32_t minimp3_clip_int16_arm(int32_t a)
+{
+ int32_t x = 0;
+ __asm__ ("ssat %0, #16, %1" : "=r"(x) : "r"(a));
+ return x;
+}
+#else
+#define HAVE_ARMV6 0
+#endif
+
+typedef struct
+{
+ const uint8_t *buf;
+ int pos, limit;
+} bs_t;
+
+typedef struct
+{
+ float scf[3*64];
+ uint8_t total_bands, stereo_bands, bitalloc[64], scfcod[64];
+} L12_scale_info;
+
+typedef struct
+{
+ uint8_t tab_offset, code_tab_width, band_count;
+} L12_subband_alloc_t;
+
+typedef struct
+{
+ const uint8_t *sfbtab;
+ uint16_t part_23_length, big_values, scalefac_compress;
+ uint8_t global_gain, block_type, mixed_block_flag, n_long_sfb, n_short_sfb;
+ uint8_t table_select[3], region_count[3], subblock_gain[3];
+ uint8_t preflag, scalefac_scale, count1_table, scfsi;
+} L3_gr_info_t;
+
+typedef struct
+{
+ bs_t bs;
+ uint8_t maindata[MAX_BITRESERVOIR_BYTES + MAX_L3_FRAME_PAYLOAD_BYTES];
+ L3_gr_info_t gr_info[4];
+ float grbuf[2][576], scf[40], syn[18 + 15][2*32];
+ uint8_t ist_pos[2][39];
+} mp3dec_scratch_t;
+
+static void bs_init(bs_t *bs, const uint8_t *data, int bytes)
+{
+ bs->buf = data;
+ bs->pos = 0;
+ bs->limit = bytes*8;
+}
+
+static uint32_t get_bits(bs_t *bs, int n)
+{
+ uint32_t next, cache = 0, s = bs->pos & 7;
+ int shl = n + s;
+ const uint8_t *p = bs->buf + (bs->pos >> 3);
+ if ((bs->pos += n) > bs->limit)
+ return 0;
+ next = *p++ & (255 >> s);
+ while ((shl -= 8) > 0)
+ {
+ cache |= next << shl;
+ next = *p++;
+ }
+ return cache | (next >> -shl);
+}
+
+static int hdr_valid(const uint8_t *h)
+{
+ return h[0] == 0xff &&
+ ((h[1] & 0xF0) == 0xf0 || (h[1] & 0xFE) == 0xe2) &&
+ (HDR_GET_LAYER(h) != 0) &&
+ (HDR_GET_BITRATE(h) != 15) &&
+ (HDR_GET_SAMPLE_RATE(h) != 3);
+}
+
+static int hdr_compare(const uint8_t *h1, const uint8_t *h2)
+{
+ return hdr_valid(h2) &&
+ ((h1[1] ^ h2[1]) & 0xFE) == 0 &&
+ ((h1[2] ^ h2[2]) & 0x0C) == 0 &&
+ !(HDR_IS_FREE_FORMAT(h1) ^ HDR_IS_FREE_FORMAT(h2));
+}
+
+static unsigned hdr_bitrate_kbps(const uint8_t *h)
+{
+ static const uint8_t halfrate[2][3][15] = {
+ { { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,4,8,12,16,20,24,28,32,40,48,56,64,72,80 }, { 0,16,24,28,32,40,48,56,64,72,80,88,96,112,128 } },
+ { { 0,16,20,24,28,32,40,48,56,64,80,96,112,128,160 }, { 0,16,24,28,32,40,48,56,64,80,96,112,128,160,192 }, { 0,16,32,48,64,80,96,112,128,144,160,176,192,208,224 } },
+ };
+ return 2*halfrate[!!HDR_TEST_MPEG1(h)][HDR_GET_LAYER(h) - 1][HDR_GET_BITRATE(h)];
+}
+
+static unsigned hdr_sample_rate_hz(const uint8_t *h)
+{
+ static const unsigned g_hz[3] = { 44100, 48000, 32000 };
+ return g_hz[HDR_GET_SAMPLE_RATE(h)] >> (int)!HDR_TEST_MPEG1(h) >> (int)!HDR_TEST_NOT_MPEG25(h);
+}
+
+static unsigned hdr_frame_samples(const uint8_t *h)
+{
+ return HDR_IS_LAYER_1(h) ? 384 : (1152 >> (int)HDR_IS_FRAME_576(h));
+}
+
+static int hdr_frame_bytes(const uint8_t *h, int free_format_size)
+{
+ int frame_bytes = hdr_frame_samples(h)*hdr_bitrate_kbps(h)*125/hdr_sample_rate_hz(h);
+ if (HDR_IS_LAYER_1(h))
+ {
+ frame_bytes &= ~3; /* slot align */
+ }
+ return frame_bytes ? frame_bytes : free_format_size;
+}
+
+static int hdr_padding(const uint8_t *h)
+{
+ return HDR_TEST_PADDING(h) ? (HDR_IS_LAYER_1(h) ? 4 : 1) : 0;
+}
+
+#ifndef MINIMP3_ONLY_MP3
+static const L12_subband_alloc_t *L12_subband_alloc_table(const uint8_t *hdr, L12_scale_info *sci)
+{
+ const L12_subband_alloc_t *alloc;
+ int mode = HDR_GET_STEREO_MODE(hdr);
+ int nbands, stereo_bands = (mode == MODE_MONO) ? 0 : (mode == MODE_JOINT_STEREO) ? (HDR_GET_STEREO_MODE_EXT(hdr) << 2) + 4 : 32;
+
+ if (HDR_IS_LAYER_1(hdr))
+ {
+ static const L12_subband_alloc_t g_alloc_L1[] = { { 76, 4, 32 } };
+ alloc = g_alloc_L1;
+ nbands = 32;
+ } else if (!HDR_TEST_MPEG1(hdr))
+ {
+ static const L12_subband_alloc_t g_alloc_L2M2[] = { { 60, 4, 4 }, { 44, 3, 7 }, { 44, 2, 19 } };
+ alloc = g_alloc_L2M2;
+ nbands = 30;
+ } else
+ {
+ static const L12_subband_alloc_t g_alloc_L2M1[] = { { 0, 4, 3 }, { 16, 4, 8 }, { 32, 3, 12 }, { 40, 2, 7 } };
+ int sample_rate_idx = HDR_GET_SAMPLE_RATE(hdr);
+ unsigned kbps = hdr_bitrate_kbps(hdr) >> (int)(mode != MODE_MONO);
+ if (!kbps) /* free-format */
+ {
+ kbps = 192;
+ }
+
+ alloc = g_alloc_L2M1;
+ nbands = 27;
+ if (kbps < 56)
+ {
+ static const L12_subband_alloc_t g_alloc_L2M1_lowrate[] = { { 44, 4, 2 }, { 44, 3, 10 } };
+ alloc = g_alloc_L2M1_lowrate;
+ nbands = sample_rate_idx == 2 ? 12 : 8;
+ } else if (kbps >= 96 && sample_rate_idx != 1)
+ {
+ nbands = 30;
+ }
+ }
+
+ sci->total_bands = (uint8_t)nbands;
+ sci->stereo_bands = (uint8_t)MINIMP3_MIN(stereo_bands, nbands);
+
+ return alloc;
+}
+
+static void L12_read_scalefactors(bs_t *bs, uint8_t *pba, uint8_t *scfcod, int bands, float *scf)
+{
+ static const float g_deq_L12[18*3] = {
+#define DQ(x) 9.53674316e-07f/x, 7.56931807e-07f/x, 6.00777173e-07f/x
+ DQ(3),DQ(7),DQ(15),DQ(31),DQ(63),DQ(127),DQ(255),DQ(511),DQ(1023),DQ(2047),DQ(4095),DQ(8191),DQ(16383),DQ(32767),DQ(65535),DQ(3),DQ(5),DQ(9)
+ };
+ int i, m;
+ for (i = 0; i < bands; i++)
+ {
+ float s = 0;
+ int ba = *pba++;
+ int mask = ba ? 4 + ((19 >> scfcod[i]) & 3) : 0;
+ for (m = 4; m; m >>= 1)
+ {
+ if (mask & m)
+ {
+ int b = get_bits(bs, 6);
+ s = g_deq_L12[ba*3 - 6 + b % 3]*(1 << 21 >> b/3);
+ }
+ *scf++ = s;
+ }
+ }
+}
+
+static void L12_read_scale_info(const uint8_t *hdr, bs_t *bs, L12_scale_info *sci)
+{
+ static const uint8_t g_bitalloc_code_tab[] = {
+ 0,17, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16,
+ 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,16,
+ 0,17,18, 3,19,4,5,16,
+ 0,17,18,16,
+ 0,17,18,19, 4,5,6, 7,8, 9,10,11,12,13,14,15,
+ 0,17,18, 3,19,4,5, 6,7, 8, 9,10,11,12,13,14,
+ 0, 2, 3, 4, 5,6,7, 8,9,10,11,12,13,14,15,16
+ };
+ const L12_subband_alloc_t *subband_alloc = L12_subband_alloc_table(hdr, sci);
+
+ int i, k = 0, ba_bits = 0;
+ const uint8_t *ba_code_tab = g_bitalloc_code_tab;
+
+ for (i = 0; i < sci->total_bands; i++)
+ {
+ uint8_t ba;
+ if (i == k)
+ {
+ k += subband_alloc->band_count;
+ ba_bits = subband_alloc->code_tab_width;
+ ba_code_tab = g_bitalloc_code_tab + subband_alloc->tab_offset;
+ subband_alloc++;
+ }
+ ba = ba_code_tab[get_bits(bs, ba_bits)];
+ sci->bitalloc[2*i] = ba;
+ if (i < sci->stereo_bands)
+ {
+ ba = ba_code_tab[get_bits(bs, ba_bits)];
+ }
+ sci->bitalloc[2*i + 1] = sci->stereo_bands ? ba : 0;
+ }
+
+ for (i = 0; i < 2*sci->total_bands; i++)
+ {
+ sci->scfcod[i] = sci->bitalloc[i] ? HDR_IS_LAYER_1(hdr) ? 2 : get_bits(bs, 2) : 6;
+ }
+
+ L12_read_scalefactors(bs, sci->bitalloc, sci->scfcod, sci->total_bands*2, sci->scf);
+
+ for (i = sci->stereo_bands; i < sci->total_bands; i++)
+ {
+ sci->bitalloc[2*i + 1] = 0;
+ }
+}
+
+static int L12_dequantize_granule(float *grbuf, bs_t *bs, L12_scale_info *sci, int group_size)
+{
+ int i, j, k, choff = 576;
+ for (j = 0; j < 4; j++)
+ {
+ float *dst = grbuf + group_size*j;
+ for (i = 0; i < 2*sci->total_bands; i++)
+ {
+ int ba = sci->bitalloc[i];
+ if (ba != 0)
+ {
+ if (ba < 17)
+ {
+ int half = (1 << (ba - 1)) - 1;
+ for (k = 0; k < group_size; k++)
+ {
+ dst[k] = (float)((int)get_bits(bs, ba) - half);
+ }
+ } else
+ {
+ unsigned mod = (2 << (ba - 17)) + 1; /* 3, 5, 9 */
+ unsigned code = get_bits(bs, mod + 2 - (mod >> 3)); /* 5, 7, 10 */
+ for (k = 0; k < group_size; k++, code /= mod)
+ {
+ dst[k] = (float)((int)(code % mod - mod/2));
+ }
+ }
+ }
+ dst += choff;
+ choff = 18 - choff;
+ }
+ }
+ return group_size*4;
+}
+
+static void L12_apply_scf_384(L12_scale_info *sci, const float *scf, float *dst)
+{
+ int i, k;
+ memcpy(dst + 576 + sci->stereo_bands*18, dst + sci->stereo_bands*18, (sci->total_bands - sci->stereo_bands)*18*sizeof(float));
+ for (i = 0; i < sci->total_bands; i++, dst += 18, scf += 6)
+ {
+ for (k = 0; k < 12; k++)
+ {
+ dst[k + 0] *= scf[0];
+ dst[k + 576] *= scf[3];
+ }
+ }
+}
+#endif /* MINIMP3_ONLY_MP3 */
+
+static int L3_read_side_info(bs_t *bs, L3_gr_info_t *gr, const uint8_t *hdr)
+{
+ static const uint8_t g_scf_long[8][23] = {
+ { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
+ { 12,12,12,12,12,12,16,20,24,28,32,40,48,56,64,76,90,2,2,2,2,2,0 },
+ { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
+ { 6,6,6,6,6,6,8,10,12,14,16,18,22,26,32,38,46,54,62,70,76,36,0 },
+ { 6,6,6,6,6,6,8,10,12,14,16,20,24,28,32,38,46,52,60,68,58,54,0 },
+ { 4,4,4,4,4,4,6,6,8,8,10,12,16,20,24,28,34,42,50,54,76,158,0 },
+ { 4,4,4,4,4,4,6,6,6,8,10,12,16,18,22,28,34,40,46,54,54,192,0 },
+ { 4,4,4,4,4,4,6,6,8,10,12,16,20,24,30,38,46,56,68,84,102,26,0 }
+ };
+ static const uint8_t g_scf_short[8][40] = {
+ { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
+ { 8,8,8,8,8,8,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 },
+ { 4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 },
+ { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 },
+ { 4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
+ { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 },
+ { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 },
+ { 4,4,4,4,4,4,4,4,4,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 }
+ };
+ static const uint8_t g_scf_mixed[8][40] = {
+ { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
+ { 12,12,12,4,4,4,8,8,8,12,12,12,16,16,16,20,20,20,24,24,24,28,28,28,36,36,36,2,2,2,2,2,2,2,2,2,26,26,26,0 },
+ { 6,6,6,6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,14,14,14,18,18,18,26,26,26,32,32,32,42,42,42,18,18,18,0 },
+ { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,32,32,32,44,44,44,12,12,12,0 },
+ { 6,6,6,6,6,6,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,24,24,24,30,30,30,40,40,40,18,18,18,0 },
+ { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,10,10,10,12,12,12,14,14,14,18,18,18,22,22,22,30,30,30,56,56,56,0 },
+ { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,6,6,6,10,10,10,12,12,12,14,14,14,16,16,16,20,20,20,26,26,26,66,66,66,0 },
+ { 4,4,4,4,4,4,6,6,4,4,4,6,6,6,8,8,8,12,12,12,16,16,16,20,20,20,26,26,26,34,34,34,42,42,42,12,12,12,0 }
+ };
+
+ unsigned tables, scfsi = 0;
+ int main_data_begin, part_23_sum = 0;
+ int sr_idx = HDR_GET_MY_SAMPLE_RATE(hdr); sr_idx -= (sr_idx != 0);
+ int gr_count = HDR_IS_MONO(hdr) ? 1 : 2;
+
+ if (HDR_TEST_MPEG1(hdr))
+ {
+ gr_count *= 2;
+ main_data_begin = get_bits(bs, 9);
+ scfsi = get_bits(bs, 7 + gr_count);
+ } else
+ {
+ main_data_begin = get_bits(bs, 8 + gr_count) >> gr_count;
+ }
+
+ do
+ {
+ if (HDR_IS_MONO(hdr))
+ {
+ scfsi <<= 4;
+ }
+ gr->part_23_length = (uint16_t)get_bits(bs, 12);
+ part_23_sum += gr->part_23_length;
+ gr->big_values = (uint16_t)get_bits(bs, 9);
+ if (gr->big_values > 288)
+ {
+ return -1;
+ }
+ gr->global_gain = (uint8_t)get_bits(bs, 8);
+ gr->scalefac_compress = (uint16_t)get_bits(bs, HDR_TEST_MPEG1(hdr) ? 4 : 9);
+ gr->sfbtab = g_scf_long[sr_idx];
+ gr->n_long_sfb = 22;
+ gr->n_short_sfb = 0;
+ if (get_bits(bs, 1))
+ {
+ gr->block_type = (uint8_t)get_bits(bs, 2);
+ if (!gr->block_type)
+ {
+ return -1;
+ }
+ gr->mixed_block_flag = (uint8_t)get_bits(bs, 1);
+ gr->region_count[0] = 7;
+ gr->region_count[1] = 255;
+ if (gr->block_type == SHORT_BLOCK_TYPE)
+ {
+ scfsi &= 0x0F0F;
+ if (!gr->mixed_block_flag)
+ {
+ gr->region_count[0] = 8;
+ gr->sfbtab = g_scf_short[sr_idx];
+ gr->n_long_sfb = 0;
+ gr->n_short_sfb = 39;
+ } else
+ {
+ gr->sfbtab = g_scf_mixed[sr_idx];
+ gr->n_long_sfb = HDR_TEST_MPEG1(hdr) ? 8 : 6;
+ gr->n_short_sfb = 30;
+ }
+ }
+ tables = get_bits(bs, 10);
+ tables <<= 5;
+ gr->subblock_gain[0] = (uint8_t)get_bits(bs, 3);
+ gr->subblock_gain[1] = (uint8_t)get_bits(bs, 3);
+ gr->subblock_gain[2] = (uint8_t)get_bits(bs, 3);
+ } else
+ {
+ gr->block_type = 0;
+ gr->mixed_block_flag = 0;
+ tables = get_bits(bs, 15);
+ gr->region_count[0] = (uint8_t)get_bits(bs, 4);
+ gr->region_count[1] = (uint8_t)get_bits(bs, 3);
+ gr->region_count[2] = 255;
+ }
+ gr->table_select[0] = (uint8_t)(tables >> 10);
+ gr->table_select[1] = (uint8_t)((tables >> 5) & 31);
+ gr->table_select[2] = (uint8_t)((tables) & 31);
+ gr->preflag = HDR_TEST_MPEG1(hdr) ? get_bits(bs, 1) : (gr->scalefac_compress >= 500);
+ gr->scalefac_scale = (uint8_t)get_bits(bs, 1);
+ gr->count1_table = (uint8_t)get_bits(bs, 1);
+ gr->scfsi = (uint8_t)((scfsi >> 12) & 15);
+ scfsi <<= 4;
+ gr++;
+ } while(--gr_count);
+
+ if (part_23_sum + bs->pos > bs->limit + main_data_begin*8)
+ {
+ return -1;
+ }
+
+ return main_data_begin;
+}
+
+static void L3_read_scalefactors(uint8_t *scf, uint8_t *ist_pos, const uint8_t *scf_size, const uint8_t *scf_count, bs_t *bitbuf, int scfsi)
+{
+ int i, k;
+ for (i = 0; i < 4 && scf_count[i]; i++, scfsi *= 2)
+ {
+ int cnt = scf_count[i];
+ if (scfsi & 8)
+ {
+ memcpy(scf, ist_pos, cnt);
+ } else
+ {
+ int bits = scf_size[i];
+ if (!bits)
+ {
+ memset(scf, 0, cnt);
+ memset(ist_pos, 0, cnt);
+ } else
+ {
+ int max_scf = (scfsi < 0) ? (1 << bits) - 1 : -1;
+ for (k = 0; k < cnt; k++)
+ {
+ int s = get_bits(bitbuf, bits);
+ ist_pos[k] = (s == max_scf ? -1 : s);
+ scf[k] = s;
+ }
+ }
+ }
+ ist_pos += cnt;
+ scf += cnt;
+ }
+ scf[0] = scf[1] = scf[2] = 0;
+}
+
+static float L3_ldexp_q2(float y, int exp_q2)
+{
+ static const float g_expfrac[4] = { 9.31322575e-10f,7.83145814e-10f,6.58544508e-10f,5.53767716e-10f };
+ int e;
+ do
+ {
+ e = MINIMP3_MIN(30*4, exp_q2);
+ y *= g_expfrac[e & 3]*(1 << 30 >> (e >> 2));
+ } while ((exp_q2 -= e) > 0);
+ return y;
+}
+
+static void L3_decode_scalefactors(const uint8_t *hdr, uint8_t *ist_pos, bs_t *bs, const L3_gr_info_t *gr, float *scf, int ch)
+{
+ static const uint8_t g_scf_partitions[3][28] = {
+ { 6,5,5, 5,6,5,5,5,6,5, 7,3,11,10,0,0, 7, 7, 7,0, 6, 6,6,3, 8, 8,5,0 },
+ { 8,9,6,12,6,9,9,9,6,9,12,6,15,18,0,0, 6,15,12,0, 6,12,9,6, 6,18,9,0 },
+ { 9,9,6,12,9,9,9,9,9,9,12,6,18,18,0,0,12,12,12,0,12, 9,9,6,15,12,9,0 }
+ };
+ const uint8_t *scf_partition = g_scf_partitions[!!gr->n_short_sfb + !gr->n_long_sfb];
+ uint8_t scf_size[4], iscf[40];
+ int i, scf_shift = gr->scalefac_scale + 1, gain_exp, scfsi = gr->scfsi;
+ float gain;
+
+ if (HDR_TEST_MPEG1(hdr))
+ {
+ static const uint8_t g_scfc_decode[16] = { 0,1,2,3, 12,5,6,7, 9,10,11,13, 14,15,18,19 };
+ int part = g_scfc_decode[gr->scalefac_compress];
+ scf_size[1] = scf_size[0] = (uint8_t)(part >> 2);
+ scf_size[3] = scf_size[2] = (uint8_t)(part & 3);
+ } else
+ {
+ static const uint8_t g_mod[6*4] = { 5,5,4,4,5,5,4,1,4,3,1,1,5,6,6,1,4,4,4,1,4,3,1,1 };
+ int k, modprod, sfc, ist = HDR_TEST_I_STEREO(hdr) && ch;
+ sfc = gr->scalefac_compress >> ist;
+ for (k = ist*3*4; sfc >= 0; sfc -= modprod, k += 4)
+ {
+ for (modprod = 1, i = 3; i >= 0; i--)
+ {
+ scf_size[i] = (uint8_t)(sfc / modprod % g_mod[k + i]);
+ modprod *= g_mod[k + i];
+ }
+ }
+ scf_partition += k;
+ scfsi = -16;
+ }
+ L3_read_scalefactors(iscf, ist_pos, scf_size, scf_partition, bs, scfsi);
+
+ if (gr->n_short_sfb)
+ {
+ int sh = 3 - scf_shift;
+ for (i = 0; i < gr->n_short_sfb; i += 3)
+ {
+ iscf[gr->n_long_sfb + i + 0] += gr->subblock_gain[0] << sh;
+ iscf[gr->n_long_sfb + i + 1] += gr->subblock_gain[1] << sh;
+ iscf[gr->n_long_sfb + i + 2] += gr->subblock_gain[2] << sh;
+ }
+ } else if (gr->preflag)
+ {
+ static const uint8_t g_preamp[10] = { 1,1,1,1,2,2,3,3,3,2 };
+ for (i = 0; i < 10; i++)
+ {
+ iscf[11 + i] += g_preamp[i];
+ }
+ }
+
+ gain_exp = gr->global_gain + BITS_DEQUANTIZER_OUT*4 - 210 - (HDR_IS_MS_STEREO(hdr) ? 2 : 0);
+ gain = L3_ldexp_q2(1 << (MAX_SCFI/4), MAX_SCFI - gain_exp);
+ for (i = 0; i < (int)(gr->n_long_sfb + gr->n_short_sfb); i++)
+ {
+ scf[i] = L3_ldexp_q2(gain, iscf[i] << scf_shift);
+ }
+}
+
+static const float g_pow43[129 + 16] = {
+ 0,-1,-2.519842f,-4.326749f,-6.349604f,-8.549880f,-10.902724f,-13.390518f,-16.000000f,-18.720754f,-21.544347f,-24.463781f,-27.473142f,-30.567351f,-33.741992f,-36.993181f,
+ 0,1,2.519842f,4.326749f,6.349604f,8.549880f,10.902724f,13.390518f,16.000000f,18.720754f,21.544347f,24.463781f,27.473142f,30.567351f,33.741992f,36.993181f,40.317474f,43.711787f,47.173345f,50.699631f,54.288352f,57.937408f,61.644865f,65.408941f,69.227979f,73.100443f,77.024898f,81.000000f,85.024491f,89.097188f,93.216975f,97.382800f,101.593667f,105.848633f,110.146801f,114.487321f,118.869381f,123.292209f,127.755065f,132.257246f,136.798076f,141.376907f,145.993119f,150.646117f,155.335327f,160.060199f,164.820202f,169.614826f,174.443577f,179.305980f,184.201575f,189.129918f,194.090580f,199.083145f,204.107210f,209.162385f,214.248292f,219.364564f,224.510845f,229.686789f,234.892058f,240.126328f,245.389280f,250.680604f,256.000000f,261.347174f,266.721841f,272.123723f,277.552547f,283.008049f,288.489971f,293.998060f,299.532071f,305.091761f,310.676898f,316.287249f,321.922592f,327.582707f,333.267377f,338.976394f,344.709550f,350.466646f,356.247482f,362.051866f,367.879608f,373.730522f,379.604427f,385.501143f,391.420496f,397.362314f,403.326427f,409.312672f,415.320884f,421.350905f,427.402579f,433.475750f,439.570269f,445.685987f,451.822757f,457.980436f,464.158883f,470.357960f,476.577530f,482.817459f,489.077615f,495.357868f,501.658090f,507.978156f,514.317941f,520.677324f,527.056184f,533.454404f,539.871867f,546.308458f,552.764065f,559.238575f,565.731879f,572.243870f,578.774440f,585.323483f,591.890898f,598.476581f,605.080431f,611.702349f,618.342238f,625.000000f,631.675540f,638.368763f,645.079578f
+};
+
+static float L3_pow_43(int x)
+{
+ float frac;
+ int sign, mult = 256;
+
+ if (x < 129)
+ {
+ return g_pow43[16 + x];
+ }
+
+ if (x < 1024)
+ {
+ mult = 16;
+ x <<= 3;
+ }
+
+ sign = 2*x & 64;
+ frac = (float)((x & 63) - sign) / ((x & ~63) + sign);
+ return g_pow43[16 + ((x + sign) >> 6)]*(1.f + frac*((4.f/3) + frac*(2.f/9)))*mult;
+}
+
+static void L3_huffman(float *dst, bs_t *bs, const L3_gr_info_t *gr_info, const float *scf, int layer3gr_limit)
+{
+ static const int16_t tabs[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 785,785,785,785,784,784,784,784,513,513,513,513,513,513,513,513,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,
+ -255,1313,1298,1282,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,290,288,
+ -255,1313,1298,1282,769,769,769,769,529,529,529,529,529,529,529,529,528,528,528,528,528,528,528,528,512,512,512,512,512,512,512,512,290,288,
+ -253,-318,-351,-367,785,785,785,785,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,819,818,547,547,275,275,275,275,561,560,515,546,289,274,288,258,
+ -254,-287,1329,1299,1314,1312,1057,1057,1042,1042,1026,1026,784,784,784,784,529,529,529,529,529,529,529,529,769,769,769,769,768,768,768,768,563,560,306,306,291,259,
+ -252,-413,-477,-542,1298,-575,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-383,-399,1107,1092,1106,1061,849,849,789,789,1104,1091,773,773,1076,1075,341,340,325,309,834,804,577,577,532,532,516,516,832,818,803,816,561,561,531,531,515,546,289,289,288,258,
+ -252,-429,-493,-559,1057,1057,1042,1042,529,529,529,529,529,529,529,529,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,-382,1077,-415,1106,1061,1104,849,849,789,789,1091,1076,1029,1075,834,834,597,581,340,340,339,324,804,833,532,532,832,772,818,803,817,787,816,771,290,290,290,290,288,258,
+ -253,-349,-414,-447,-463,1329,1299,-479,1314,1312,1057,1057,1042,1042,1026,1026,785,785,785,785,784,784,784,784,769,769,769,769,768,768,768,768,-319,851,821,-335,836,850,805,849,341,340,325,336,533,533,579,579,564,564,773,832,578,548,563,516,321,276,306,291,304,259,
+ -251,-572,-733,-830,-863,-879,1041,1041,784,784,784,784,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,1396,1351,1381,1366,1395,1335,1380,-559,1334,1138,1138,1063,1063,1350,1392,1031,1031,1062,1062,1364,1363,1120,1120,1333,1348,881,881,881,881,375,374,359,373,343,358,341,325,791,791,1123,1122,-703,1105,1045,-719,865,865,790,790,774,774,1104,1029,338,293,323,308,-799,-815,833,788,772,818,803,816,322,292,307,320,561,531,515,546,289,274,288,258,
+ -251,-525,-605,-685,-765,-831,-846,1298,1057,1057,1312,1282,785,785,785,785,784,784,784,784,769,769,769,769,512,512,512,512,512,512,512,512,1399,1398,1383,1367,1382,1396,1351,-511,1381,1366,1139,1139,1079,1079,1124,1124,1364,1349,1363,1333,882,882,882,882,807,807,807,807,1094,1094,1136,1136,373,341,535,535,881,775,867,822,774,-591,324,338,-671,849,550,550,866,864,609,609,293,336,534,534,789,835,773,-751,834,804,308,307,833,788,832,772,562,562,547,547,305,275,560,515,290,290,
+ -252,-397,-477,-557,-622,-653,-719,-735,-750,1329,1299,1314,1057,1057,1042,1042,1312,1282,1024,1024,785,785,785,785,784,784,784,784,769,769,769,769,-383,1127,1141,1111,1126,1140,1095,1110,869,869,883,883,1079,1109,882,882,375,374,807,868,838,881,791,-463,867,822,368,263,852,837,836,-543,610,610,550,550,352,336,534,534,865,774,851,821,850,805,593,533,579,564,773,832,578,578,548,548,577,577,307,276,306,291,516,560,259,259,
+ -250,-2107,-2507,-2764,-2909,-2974,-3007,-3023,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-767,-1052,-1213,-1277,-1358,-1405,-1469,-1535,-1550,-1582,-1614,-1647,-1662,-1694,-1726,-1759,-1774,-1807,-1822,-1854,-1886,1565,-1919,-1935,-1951,-1967,1731,1730,1580,1717,-1983,1729,1564,-1999,1548,-2015,-2031,1715,1595,-2047,1714,-2063,1610,-2079,1609,-2095,1323,1323,1457,1457,1307,1307,1712,1547,1641,1700,1699,1594,1685,1625,1442,1442,1322,1322,-780,-973,-910,1279,1278,1277,1262,1276,1261,1275,1215,1260,1229,-959,974,974,989,989,-943,735,478,478,495,463,506,414,-1039,1003,958,1017,927,942,987,957,431,476,1272,1167,1228,-1183,1256,-1199,895,895,941,941,1242,1227,1212,1135,1014,1014,490,489,503,487,910,1013,985,925,863,894,970,955,1012,847,-1343,831,755,755,984,909,428,366,754,559,-1391,752,486,457,924,997,698,698,983,893,740,740,908,877,739,739,667,667,953,938,497,287,271,271,683,606,590,712,726,574,302,302,738,736,481,286,526,725,605,711,636,724,696,651,589,681,666,710,364,467,573,695,466,466,301,465,379,379,709,604,665,679,316,316,634,633,436,436,464,269,424,394,452,332,438,363,347,408,393,448,331,422,362,407,392,421,346,406,391,376,375,359,1441,1306,-2367,1290,-2383,1337,-2399,-2415,1426,1321,-2431,1411,1336,-2447,-2463,-2479,1169,1169,1049,1049,1424,1289,1412,1352,1319,-2495,1154,1154,1064,1064,1153,1153,416,390,360,404,403,389,344,374,373,343,358,372,327,357,342,311,356,326,1395,1394,1137,1137,1047,1047,1365,1392,1287,1379,1334,1364,1349,1378,1318,1363,792,792,792,792,1152,1152,1032,1032,1121,1121,1046,1046,1120,1120,1030,1030,-2895,1106,1061,1104,849,849,789,789,1091,1076,1029,1090,1060,1075,833,833,309,324,532,532,832,772,818,803,561,561,531,560,515,546,289,274,288,258,
+ -250,-1179,-1579,-1836,-1996,-2124,-2253,-2333,-2413,-2477,-2542,-2574,-2607,-2622,-2655,1314,1313,1298,1312,1282,785,785,785,785,1040,1040,1025,1025,768,768,768,768,-766,-798,-830,-862,-895,-911,-927,-943,-959,-975,-991,-1007,-1023,-1039,-1055,-1070,1724,1647,-1103,-1119,1631,1767,1662,1738,1708,1723,-1135,1780,1615,1779,1599,1677,1646,1778,1583,-1151,1777,1567,1737,1692,1765,1722,1707,1630,1751,1661,1764,1614,1736,1676,1763,1750,1645,1598,1721,1691,1762,1706,1582,1761,1566,-1167,1749,1629,767,766,751,765,494,494,735,764,719,749,734,763,447,447,748,718,477,506,431,491,446,476,461,505,415,430,475,445,504,399,460,489,414,503,383,474,429,459,502,502,746,752,488,398,501,473,413,472,486,271,480,270,-1439,-1455,1357,-1471,-1487,-1503,1341,1325,-1519,1489,1463,1403,1309,-1535,1372,1448,1418,1476,1356,1462,1387,-1551,1475,1340,1447,1402,1386,-1567,1068,1068,1474,1461,455,380,468,440,395,425,410,454,364,467,466,464,453,269,409,448,268,432,1371,1473,1432,1417,1308,1460,1355,1446,1459,1431,1083,1083,1401,1416,1458,1445,1067,1067,1370,1457,1051,1051,1291,1430,1385,1444,1354,1415,1400,1443,1082,1082,1173,1113,1186,1066,1185,1050,-1967,1158,1128,1172,1097,1171,1081,-1983,1157,1112,416,266,375,400,1170,1142,1127,1065,793,793,1169,1033,1156,1096,1141,1111,1155,1080,1126,1140,898,898,808,808,897,897,792,792,1095,1152,1032,1125,1110,1139,1079,1124,882,807,838,881,853,791,-2319,867,368,263,822,852,837,866,806,865,-2399,851,352,262,534,534,821,836,594,594,549,549,593,593,533,533,848,773,579,579,564,578,548,563,276,276,577,576,306,291,516,560,305,305,275,259,
+ -251,-892,-2058,-2620,-2828,-2957,-3023,-3039,1041,1041,1040,1040,769,769,769,769,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,256,-511,-527,-543,-559,1530,-575,-591,1528,1527,1407,1526,1391,1023,1023,1023,1023,1525,1375,1268,1268,1103,1103,1087,1087,1039,1039,1523,-604,815,815,815,815,510,495,509,479,508,463,507,447,431,505,415,399,-734,-782,1262,-815,1259,1244,-831,1258,1228,-847,-863,1196,-879,1253,987,987,748,-767,493,493,462,477,414,414,686,669,478,446,461,445,474,429,487,458,412,471,1266,1264,1009,1009,799,799,-1019,-1276,-1452,-1581,-1677,-1757,-1821,-1886,-1933,-1997,1257,1257,1483,1468,1512,1422,1497,1406,1467,1496,1421,1510,1134,1134,1225,1225,1466,1451,1374,1405,1252,1252,1358,1480,1164,1164,1251,1251,1238,1238,1389,1465,-1407,1054,1101,-1423,1207,-1439,830,830,1248,1038,1237,1117,1223,1148,1236,1208,411,426,395,410,379,269,1193,1222,1132,1235,1221,1116,976,976,1192,1162,1177,1220,1131,1191,963,963,-1647,961,780,-1663,558,558,994,993,437,408,393,407,829,978,813,797,947,-1743,721,721,377,392,844,950,828,890,706,706,812,859,796,960,948,843,934,874,571,571,-1919,690,555,689,421,346,539,539,944,779,918,873,932,842,903,888,570,570,931,917,674,674,-2575,1562,-2591,1609,-2607,1654,1322,1322,1441,1441,1696,1546,1683,1593,1669,1624,1426,1426,1321,1321,1639,1680,1425,1425,1305,1305,1545,1668,1608,1623,1667,1592,1638,1666,1320,1320,1652,1607,1409,1409,1304,1304,1288,1288,1664,1637,1395,1395,1335,1335,1622,1636,1394,1394,1319,1319,1606,1621,1392,1392,1137,1137,1137,1137,345,390,360,375,404,373,1047,-2751,-2767,-2783,1062,1121,1046,-2799,1077,-2815,1106,1061,789,789,1105,1104,263,355,310,340,325,354,352,262,339,324,1091,1076,1029,1090,1060,1075,833,833,788,788,1088,1028,818,818,803,803,561,561,531,531,816,771,546,546,289,274,288,258,
+ -253,-317,-381,-446,-478,-509,1279,1279,-811,-1179,-1451,-1756,-1900,-2028,-2189,-2253,-2333,-2414,-2445,-2511,-2526,1313,1298,-2559,1041,1041,1040,1040,1025,1025,1024,1024,1022,1007,1021,991,1020,975,1019,959,687,687,1018,1017,671,671,655,655,1016,1015,639,639,758,758,623,623,757,607,756,591,755,575,754,559,543,543,1009,783,-575,-621,-685,-749,496,-590,750,749,734,748,974,989,1003,958,988,973,1002,942,987,957,972,1001,926,986,941,971,956,1000,910,985,925,999,894,970,-1071,-1087,-1102,1390,-1135,1436,1509,1451,1374,-1151,1405,1358,1480,1420,-1167,1507,1494,1389,1342,1465,1435,1450,1326,1505,1310,1493,1373,1479,1404,1492,1464,1419,428,443,472,397,736,526,464,464,486,457,442,471,484,482,1357,1449,1434,1478,1388,1491,1341,1490,1325,1489,1463,1403,1309,1477,1372,1448,1418,1433,1476,1356,1462,1387,-1439,1475,1340,1447,1402,1474,1324,1461,1371,1473,269,448,1432,1417,1308,1460,-1711,1459,-1727,1441,1099,1099,1446,1386,1431,1401,-1743,1289,1083,1083,1160,1160,1458,1445,1067,1067,1370,1457,1307,1430,1129,1129,1098,1098,268,432,267,416,266,400,-1887,1144,1187,1082,1173,1113,1186,1066,1050,1158,1128,1143,1172,1097,1171,1081,420,391,1157,1112,1170,1142,1127,1065,1169,1049,1156,1096,1141,1111,1155,1080,1126,1154,1064,1153,1140,1095,1048,-2159,1125,1110,1137,-2175,823,823,1139,1138,807,807,384,264,368,263,868,838,853,791,867,822,852,837,866,806,865,790,-2319,851,821,836,352,262,850,805,849,-2399,533,533,835,820,336,261,578,548,563,577,532,532,832,772,562,562,547,547,305,275,560,515,290,290,288,258 };
+ static const uint8_t tab32[] = { 130,162,193,209,44,28,76,140,9,9,9,9,9,9,9,9,190,254,222,238,126,94,157,157,109,61,173,205 };
+ static const uint8_t tab33[] = { 252,236,220,204,188,172,156,140,124,108,92,76,60,44,28,12 };
+ static const int16_t tabindex[2*16] = { 0,32,64,98,0,132,180,218,292,364,426,538,648,746,0,1126,1460,1460,1460,1460,1460,1460,1460,1460,1842,1842,1842,1842,1842,1842,1842,1842 };
+ static const uint8_t g_linbits[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,6,8,10,13,4,5,6,7,8,9,11,13 };
+
+#define PEEK_BITS(n) (bs_cache >> (32 - n))
+#define FLUSH_BITS(n) { bs_cache <<= (n); bs_sh += (n); }
+#define CHECK_BITS while (bs_sh >= 0) { bs_cache |= (uint32_t)*bs_next_ptr++ << bs_sh; bs_sh -= 8; }
+#define BSPOS ((bs_next_ptr - bs->buf)*8 - 24 + bs_sh)
+
+ float one = 0.0f;
+ int ireg = 0, big_val_cnt = gr_info->big_values;
+ const uint8_t *sfb = gr_info->sfbtab;
+ const uint8_t *bs_next_ptr = bs->buf + bs->pos/8;
+ uint32_t bs_cache = (((bs_next_ptr[0]*256u + bs_next_ptr[1])*256u + bs_next_ptr[2])*256u + bs_next_ptr[3]) << (bs->pos & 7);
+ int pairs_to_decode, np, bs_sh = (bs->pos & 7) - 8;
+ bs_next_ptr += 4;
+
+ while (big_val_cnt > 0)
+ {
+ int tab_num = gr_info->table_select[ireg];
+ int sfb_cnt = gr_info->region_count[ireg++];
+ const int16_t *codebook = tabs + tabindex[tab_num];
+ int linbits = g_linbits[tab_num];
+ if (linbits)
+ {
+ do
+ {
+ np = *sfb++ / 2;
+ pairs_to_decode = MINIMP3_MIN(big_val_cnt, np);
+ one = *scf++;
+ do
+ {
+ int j, w = 5;
+ int leaf = codebook[PEEK_BITS(w)];
+ while (leaf < 0)
+ {
+ FLUSH_BITS(w);
+ w = leaf & 7;
+ leaf = codebook[PEEK_BITS(w) - (leaf >> 3)];
+ }
+ FLUSH_BITS(leaf >> 8);
+
+ for (j = 0; j < 2; j++, dst++, leaf >>= 4)
+ {
+ int lsb = leaf & 0x0F;
+ if (lsb == 15)
+ {
+ lsb += PEEK_BITS(linbits);
+ FLUSH_BITS(linbits);
+ CHECK_BITS;
+ *dst = one*L3_pow_43(lsb)*((int32_t)bs_cache < 0 ? -1: 1);
+ } else
+ {
+ *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one;
+ }
+ FLUSH_BITS(lsb ? 1 : 0);
+ }
+ CHECK_BITS;
+ } while (--pairs_to_decode);
+ } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0);
+ } else
+ {
+ do
+ {
+ np = *sfb++ / 2;
+ pairs_to_decode = MINIMP3_MIN(big_val_cnt, np);
+ one = *scf++;
+ do
+ {
+ int j, w = 5;
+ int leaf = codebook[PEEK_BITS(w)];
+ while (leaf < 0)
+ {
+ FLUSH_BITS(w);
+ w = leaf & 7;
+ leaf = codebook[PEEK_BITS(w) - (leaf >> 3)];
+ }
+ FLUSH_BITS(leaf >> 8);
+
+ for (j = 0; j < 2; j++, dst++, leaf >>= 4)
+ {
+ int lsb = leaf & 0x0F;
+ *dst = g_pow43[16 + lsb - 16*(bs_cache >> 31)]*one;
+ FLUSH_BITS(lsb ? 1 : 0);
+ }
+ CHECK_BITS;
+ } while (--pairs_to_decode);
+ } while ((big_val_cnt -= np) > 0 && --sfb_cnt >= 0);
+ }
+ }
+
+ for (np = 1 - big_val_cnt;; dst += 4)
+ {
+ const uint8_t *codebook_count1 = (gr_info->count1_table) ? tab33 : tab32;
+ int leaf = codebook_count1[PEEK_BITS(4)];
+ if (!(leaf & 8))
+ {
+ leaf = codebook_count1[(leaf >> 3) + (bs_cache << 4 >> (32 - (leaf & 3)))];
+ }
+ FLUSH_BITS(leaf & 7);
+ if (BSPOS > layer3gr_limit)
+ {
+ break;
+ }
+#define RELOAD_SCALEFACTOR if (!--np) { np = *sfb++/2; if (!np) break; one = *scf++; }
+#define DEQ_COUNT1(s) if (leaf & (128 >> s)) { dst[s] = ((int32_t)bs_cache < 0) ? -one : one; FLUSH_BITS(1) }
+ RELOAD_SCALEFACTOR;
+ DEQ_COUNT1(0);
+ DEQ_COUNT1(1);
+ RELOAD_SCALEFACTOR;
+ DEQ_COUNT1(2);
+ DEQ_COUNT1(3);
+ CHECK_BITS;
+ }
+
+ bs->pos = layer3gr_limit;
+}
+
+static void L3_midside_stereo(float *left, int n)
+{
+ int i = 0;
+ float *right = left + 576;
+#if HAVE_SIMD
+ if (have_simd()) for (; i < n - 3; i += 4)
+ {
+ f4 vl = VLD(left + i);
+ f4 vr = VLD(right + i);
+ VSTORE(left + i, VADD(vl, vr));
+ VSTORE(right + i, VSUB(vl, vr));
+ }
+#endif /* HAVE_SIMD */
+ for (; i < n; i++)
+ {
+ float a = left[i];
+ float b = right[i];
+ left[i] = a + b;
+ right[i] = a - b;
+ }
+}
+
+static void L3_intensity_stereo_band(float *left, int n, float kl, float kr)
+{
+ int i;
+ for (i = 0; i < n; i++)
+ {
+ left[i + 576] = left[i]*kr;
+ left[i] = left[i]*kl;
+ }
+}
+
+static void L3_stereo_top_band(const float *right, const uint8_t *sfb, int nbands, int max_band[3])
+{
+ int i, k;
+
+ max_band[0] = max_band[1] = max_band[2] = -1;
+
+ for (i = 0; i < nbands; i++)
+ {
+ for (k = 0; k < sfb[i]; k += 2)
+ {
+ if (right[k] != 0 || right[k + 1] != 0)
+ {
+ max_band[i % 3] = i;
+ break;
+ }
+ }
+ right += sfb[i];
+ }
+}
+
+static void L3_stereo_process(float *left, const uint8_t *ist_pos, const uint8_t *sfb, const uint8_t *hdr, int max_band[3], int mpeg2_sh)
+{
+ static const float g_pan[7*2] = { 0,1,0.21132487f,0.78867513f,0.36602540f,0.63397460f,0.5f,0.5f,0.63397460f,0.36602540f,0.78867513f,0.21132487f,1,0 };
+ unsigned i, max_pos = HDR_TEST_MPEG1(hdr) ? 7 : 64;
+
+ for (i = 0; sfb[i]; i++)
+ {
+ unsigned ipos = ist_pos[i];
+ if ((int)i > max_band[i % 3] && ipos < max_pos)
+ {
+ float kl, kr, s = HDR_TEST_MS_STEREO(hdr) ? 1.41421356f : 1;
+ if (HDR_TEST_MPEG1(hdr))
+ {
+ kl = g_pan[2*ipos];
+ kr = g_pan[2*ipos + 1];
+ } else
+ {
+ kl = 1;
+ kr = L3_ldexp_q2(1, (ipos + 1) >> 1 << mpeg2_sh);
+ if (ipos & 1)
+ {
+ kl = kr;
+ kr = 1;
+ }
+ }
+ L3_intensity_stereo_band(left, sfb[i], kl*s, kr*s);
+ } else if (HDR_TEST_MS_STEREO(hdr))
+ {
+ L3_midside_stereo(left, sfb[i]);
+ }
+ left += sfb[i];
+ }
+}
+
+static void L3_intensity_stereo(float *left, uint8_t *ist_pos, const L3_gr_info_t *gr, const uint8_t *hdr)
+{
+ int max_band[3], n_sfb = gr->n_long_sfb + gr->n_short_sfb;
+ int i, max_blocks = gr->n_short_sfb ? 3 : 1;
+
+ L3_stereo_top_band(left + 576, gr->sfbtab, n_sfb, max_band);
+ if (gr->n_long_sfb)
+ {
+ max_band[0] = max_band[1] = max_band[2] = MINIMP3_MAX(MINIMP3_MAX(max_band[0], max_band[1]), max_band[2]);
+ }
+ for (i = 0; i < max_blocks; i++)
+ {
+ int default_pos = HDR_TEST_MPEG1(hdr) ? 3 : 0;
+ int itop = n_sfb - max_blocks + i;
+ int prev = itop - max_blocks;
+ ist_pos[itop] = max_band[i] >= prev ? default_pos : ist_pos[prev];
+ }
+ L3_stereo_process(left, ist_pos, gr->sfbtab, hdr, max_band, gr[1].scalefac_compress & 1);
+}
+
+static void L3_reorder(float *grbuf, float *scratch, const uint8_t *sfb)
+{
+ int i, len;
+ float *src = grbuf, *dst = scratch;
+
+ for (;0 != (len = *sfb); sfb += 3, src += 2*len)
+ {
+ for (i = 0; i < len; i++, src++)
+ {
+ *dst++ = src[0*len];
+ *dst++ = src[1*len];
+ *dst++ = src[2*len];
+ }
+ }
+ memcpy(grbuf, scratch, (dst - scratch)*sizeof(float));
+}
+
+static void L3_antialias(float *grbuf, int nbands)
+{
+ static const float g_aa[2][8] = {
+ {0.85749293f,0.88174200f,0.94962865f,0.98331459f,0.99551782f,0.99916056f,0.99989920f,0.99999316f},
+ {0.51449576f,0.47173197f,0.31337745f,0.18191320f,0.09457419f,0.04096558f,0.01419856f,0.00369997f}
+ };
+
+ for (; nbands > 0; nbands--, grbuf += 18)
+ {
+ int i = 0;
+#if HAVE_SIMD
+ if (have_simd()) for (; i < 8; i += 4)
+ {
+ f4 vu = VLD(grbuf + 18 + i);
+ f4 vd = VLD(grbuf + 14 - i);
+ f4 vc0 = VLD(g_aa[0] + i);
+ f4 vc1 = VLD(g_aa[1] + i);
+ vd = VREV(vd);
+ VSTORE(grbuf + 18 + i, VSUB(VMUL(vu, vc0), VMUL(vd, vc1)));
+ vd = VADD(VMUL(vu, vc1), VMUL(vd, vc0));
+ VSTORE(grbuf + 14 - i, VREV(vd));
+ }
+#endif /* HAVE_SIMD */
+#ifndef MINIMP3_ONLY_SIMD
+ for(; i < 8; i++)
+ {
+ float u = grbuf[18 + i];
+ float d = grbuf[17 - i];
+ grbuf[18 + i] = u*g_aa[0][i] - d*g_aa[1][i];
+ grbuf[17 - i] = u*g_aa[1][i] + d*g_aa[0][i];
+ }
+#endif /* MINIMP3_ONLY_SIMD */
+ }
+}
+
+static void L3_dct3_9(float *y)
+{
+ float s0, s1, s2, s3, s4, s5, s6, s7, s8, t0, t2, t4;
+
+ s0 = y[0]; s2 = y[2]; s4 = y[4]; s6 = y[6]; s8 = y[8];
+ t0 = s0 + s6*0.5f;
+ s0 -= s6;
+ t4 = (s4 + s2)*0.93969262f;
+ t2 = (s8 + s2)*0.76604444f;
+ s6 = (s4 - s8)*0.17364818f;
+ s4 += s8 - s2;
+
+ s2 = s0 - s4*0.5f;
+ y[4] = s4 + s0;
+ s8 = t0 - t2 + s6;
+ s0 = t0 - t4 + t2;
+ s4 = t0 + t4 - s6;
+
+ s1 = y[1]; s3 = y[3]; s5 = y[5]; s7 = y[7];
+
+ s3 *= 0.86602540f;
+ t0 = (s5 + s1)*0.98480775f;
+ t4 = (s5 - s7)*0.34202014f;
+ t2 = (s1 + s7)*0.64278761f;
+ s1 = (s1 - s5 - s7)*0.86602540f;
+
+ s5 = t0 - s3 - t2;
+ s7 = t4 - s3 - t0;
+ s3 = t4 + s3 - t2;
+
+ y[0] = s4 - s7;
+ y[1] = s2 + s1;
+ y[2] = s0 - s3;
+ y[3] = s8 + s5;
+ y[5] = s8 - s5;
+ y[6] = s0 + s3;
+ y[7] = s2 - s1;
+ y[8] = s4 + s7;
+}
+
+static void L3_imdct36(float *grbuf, float *overlap, const float *window, int nbands)
+{
+ int i, j;
+ static const float g_twid9[18] = {
+ 0.73727734f,0.79335334f,0.84339145f,0.88701083f,0.92387953f,0.95371695f,0.97629601f,0.99144486f,0.99904822f,0.67559021f,0.60876143f,0.53729961f,0.46174861f,0.38268343f,0.30070580f,0.21643961f,0.13052619f,0.04361938f
+ };
+
+ for (j = 0; j < nbands; j++, grbuf += 18, overlap += 9)
+ {
+ float co[9], si[9];
+ co[0] = -grbuf[0];
+ si[0] = grbuf[17];
+ for (i = 0; i < 4; i++)
+ {
+ si[8 - 2*i] = grbuf[4*i + 1] - grbuf[4*i + 2];
+ co[1 + 2*i] = grbuf[4*i + 1] + grbuf[4*i + 2];
+ si[7 - 2*i] = grbuf[4*i + 4] - grbuf[4*i + 3];
+ co[2 + 2*i] = -(grbuf[4*i + 3] + grbuf[4*i + 4]);
+ }
+ L3_dct3_9(co);
+ L3_dct3_9(si);
+
+ si[1] = -si[1];
+ si[3] = -si[3];
+ si[5] = -si[5];
+ si[7] = -si[7];
+
+ i = 0;
+
+#if HAVE_SIMD
+ if (have_simd()) for (; i < 8; i += 4)
+ {
+ f4 vovl = VLD(overlap + i);
+ f4 vc = VLD(co + i);
+ f4 vs = VLD(si + i);
+ f4 vr0 = VLD(g_twid9 + i);
+ f4 vr1 = VLD(g_twid9 + 9 + i);
+ f4 vw0 = VLD(window + i);
+ f4 vw1 = VLD(window + 9 + i);
+ f4 vsum = VADD(VMUL(vc, vr1), VMUL(vs, vr0));
+ VSTORE(overlap + i, VSUB(VMUL(vc, vr0), VMUL(vs, vr1)));
+ VSTORE(grbuf + i, VSUB(VMUL(vovl, vw0), VMUL(vsum, vw1)));
+ vsum = VADD(VMUL(vovl, vw1), VMUL(vsum, vw0));
+ VSTORE(grbuf + 14 - i, VREV(vsum));
+ }
+#endif /* HAVE_SIMD */
+ for (; i < 9; i++)
+ {
+ float ovl = overlap[i];
+ float sum = co[i]*g_twid9[9 + i] + si[i]*g_twid9[0 + i];
+ overlap[i] = co[i]*g_twid9[0 + i] - si[i]*g_twid9[9 + i];
+ grbuf[i] = ovl*window[0 + i] - sum*window[9 + i];
+ grbuf[17 - i] = ovl*window[9 + i] + sum*window[0 + i];
+ }
+ }
+}
+
+static void L3_idct3(float x0, float x1, float x2, float *dst)
+{
+ float m1 = x1*0.86602540f;
+ float a1 = x0 - x2*0.5f;
+ dst[1] = x0 + x2;
+ dst[0] = a1 + m1;
+ dst[2] = a1 - m1;
+}
+
+static void L3_imdct12(float *x, float *dst, float *overlap)
+{
+ static const float g_twid3[6] = { 0.79335334f,0.92387953f,0.99144486f, 0.60876143f,0.38268343f,0.13052619f };
+ float co[3], si[3];
+ int i;
+
+ L3_idct3(-x[0], x[6] + x[3], x[12] + x[9], co);
+ L3_idct3(x[15], x[12] - x[9], x[6] - x[3], si);
+ si[1] = -si[1];
+
+ for (i = 0; i < 3; i++)
+ {
+ float ovl = overlap[i];
+ float sum = co[i]*g_twid3[3 + i] + si[i]*g_twid3[0 + i];
+ overlap[i] = co[i]*g_twid3[0 + i] - si[i]*g_twid3[3 + i];
+ dst[i] = ovl*g_twid3[2 - i] - sum*g_twid3[5 - i];
+ dst[5 - i] = ovl*g_twid3[5 - i] + sum*g_twid3[2 - i];
+ }
+}
+
+static void L3_imdct_short(float *grbuf, float *overlap, int nbands)
+{
+ for (;nbands > 0; nbands--, overlap += 9, grbuf += 18)
+ {
+ float tmp[18];
+ memcpy(tmp, grbuf, sizeof(tmp));
+ memcpy(grbuf, overlap, 6*sizeof(float));
+ L3_imdct12(tmp, grbuf + 6, overlap + 6);
+ L3_imdct12(tmp + 1, grbuf + 12, overlap + 6);
+ L3_imdct12(tmp + 2, overlap, overlap + 6);
+ }
+}
+
+static void L3_change_sign(float *grbuf)
+{
+ int b, i;
+ for (b = 0, grbuf += 18; b < 32; b += 2, grbuf += 36)
+ for (i = 1; i < 18; i += 2)
+ grbuf[i] = -grbuf[i];
+}
+
+static void L3_imdct_gr(float *grbuf, float *overlap, unsigned block_type, unsigned n_long_bands)
+{
+ static const float g_mdct_window[2][18] = {
+ { 0.99904822f,0.99144486f,0.97629601f,0.95371695f,0.92387953f,0.88701083f,0.84339145f,0.79335334f,0.73727734f,0.04361938f,0.13052619f,0.21643961f,0.30070580f,0.38268343f,0.46174861f,0.53729961f,0.60876143f,0.67559021f },
+ { 1,1,1,1,1,1,0.99144486f,0.92387953f,0.79335334f,0,0,0,0,0,0,0.13052619f,0.38268343f,0.60876143f }
+ };
+ if (n_long_bands)
+ {
+ L3_imdct36(grbuf, overlap, g_mdct_window[0], n_long_bands);
+ grbuf += 18*n_long_bands;
+ overlap += 9*n_long_bands;
+ }
+ if (block_type == SHORT_BLOCK_TYPE)
+ L3_imdct_short(grbuf, overlap, 32 - n_long_bands);
+ else
+ L3_imdct36(grbuf, overlap, g_mdct_window[block_type == STOP_BLOCK_TYPE], 32 - n_long_bands);
+}
+
+static void L3_save_reservoir(mp3dec_t *h, mp3dec_scratch_t *s)
+{
+ int pos = (s->bs.pos + 7)/8u;
+ int remains = s->bs.limit/8u - pos;
+ if (remains > MAX_BITRESERVOIR_BYTES)
+ {
+ pos += remains - MAX_BITRESERVOIR_BYTES;
+ remains = MAX_BITRESERVOIR_BYTES;
+ }
+ if (remains > 0)
+ {
+ memmove(h->reserv_buf, s->maindata + pos, remains);
+ }
+ h->reserv = remains;
+}
+
+static int L3_restore_reservoir(mp3dec_t *h, bs_t *bs, mp3dec_scratch_t *s, int main_data_begin)
+{
+ int frame_bytes = (bs->limit - bs->pos)/8;
+ int bytes_have = MINIMP3_MIN(h->reserv, main_data_begin);
+ memcpy(s->maindata, h->reserv_buf + MINIMP3_MAX(0, h->reserv - main_data_begin), MINIMP3_MIN(h->reserv, main_data_begin));
+ memcpy(s->maindata + bytes_have, bs->buf + bs->pos/8, frame_bytes);
+ bs_init(&s->bs, s->maindata, bytes_have + frame_bytes);
+ return h->reserv >= main_data_begin;
+}
+
+static void L3_decode(mp3dec_t *h, mp3dec_scratch_t *s, L3_gr_info_t *gr_info, int nch)
+{
+ int ch;
+
+ for (ch = 0; ch < nch; ch++)
+ {
+ int layer3gr_limit = s->bs.pos + gr_info[ch].part_23_length;
+ L3_decode_scalefactors(h->header, s->ist_pos[ch], &s->bs, gr_info + ch, s->scf, ch);
+ L3_huffman(s->grbuf[ch], &s->bs, gr_info + ch, s->scf, layer3gr_limit);
+ }
+
+ if (HDR_TEST_I_STEREO(h->header))
+ {
+ L3_intensity_stereo(s->grbuf[0], s->ist_pos[1], gr_info, h->header);
+ } else if (HDR_IS_MS_STEREO(h->header))
+ {
+ L3_midside_stereo(s->grbuf[0], 576);
+ }
+
+ for (ch = 0; ch < nch; ch++, gr_info++)
+ {
+ int aa_bands = 31;
+ int n_long_bands = (gr_info->mixed_block_flag ? 2 : 0) << (int)(HDR_GET_MY_SAMPLE_RATE(h->header) == 2);
+
+ if (gr_info->n_short_sfb)
+ {
+ aa_bands = n_long_bands - 1;
+ L3_reorder(s->grbuf[ch] + n_long_bands*18, s->syn[0], gr_info->sfbtab + gr_info->n_long_sfb);
+ }
+
+ L3_antialias(s->grbuf[ch], aa_bands);
+ L3_imdct_gr(s->grbuf[ch], h->mdct_overlap[ch], gr_info->block_type, n_long_bands);
+ L3_change_sign(s->grbuf[ch]);
+ }
+}
+
+static void mp3d_DCT_II(float *grbuf, int n)
+{
+ static const float g_sec[24] = {
+ 10.19000816f,0.50060302f,0.50241929f,3.40760851f,0.50547093f,0.52249861f,2.05778098f,0.51544732f,0.56694406f,1.48416460f,0.53104258f,0.64682180f,1.16943991f,0.55310392f,0.78815460f,0.97256821f,0.58293498f,1.06067765f,0.83934963f,0.62250412f,1.72244716f,0.74453628f,0.67480832f,5.10114861f
+ };
+ int i, k = 0;
+#if HAVE_SIMD
+ if (have_simd()) for (; k < n; k += 4)
+ {
+ f4 t[4][8], *x;
+ float *y = grbuf + k;
+
+ for (x = t[0], i = 0; i < 8; i++, x++)
+ {
+ f4 x0 = VLD(&y[i*18]);
+ f4 x1 = VLD(&y[(15 - i)*18]);
+ f4 x2 = VLD(&y[(16 + i)*18]);
+ f4 x3 = VLD(&y[(31 - i)*18]);
+ f4 t0 = VADD(x0, x3);
+ f4 t1 = VADD(x1, x2);
+ f4 t2 = VMUL_S(VSUB(x1, x2), g_sec[3*i + 0]);
+ f4 t3 = VMUL_S(VSUB(x0, x3), g_sec[3*i + 1]);
+ x[0] = VADD(t0, t1);
+ x[8] = VMUL_S(VSUB(t0, t1), g_sec[3*i + 2]);
+ x[16] = VADD(t3, t2);
+ x[24] = VMUL_S(VSUB(t3, t2), g_sec[3*i + 2]);
+ }
+ for (x = t[0], i = 0; i < 4; i++, x += 8)
+ {
+ f4 x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt;
+ xt = VSUB(x0, x7); x0 = VADD(x0, x7);
+ x7 = VSUB(x1, x6); x1 = VADD(x1, x6);
+ x6 = VSUB(x2, x5); x2 = VADD(x2, x5);
+ x5 = VSUB(x3, x4); x3 = VADD(x3, x4);
+ x4 = VSUB(x0, x3); x0 = VADD(x0, x3);
+ x3 = VSUB(x1, x2); x1 = VADD(x1, x2);
+ x[0] = VADD(x0, x1);
+ x[4] = VMUL_S(VSUB(x0, x1), 0.70710677f);
+ x5 = VADD(x5, x6);
+ x6 = VMUL_S(VADD(x6, x7), 0.70710677f);
+ x7 = VADD(x7, xt);
+ x3 = VMUL_S(VADD(x3, x4), 0.70710677f);
+ x5 = VSUB(x5, VMUL_S(x7, 0.198912367f)); /* rotate by PI/8 */
+ x7 = VADD(x7, VMUL_S(x5, 0.382683432f));
+ x5 = VSUB(x5, VMUL_S(x7, 0.198912367f));
+ x0 = VSUB(xt, x6); xt = VADD(xt, x6);
+ x[1] = VMUL_S(VADD(xt, x7), 0.50979561f);
+ x[2] = VMUL_S(VADD(x4, x3), 0.54119611f);
+ x[3] = VMUL_S(VSUB(x0, x5), 0.60134488f);
+ x[5] = VMUL_S(VADD(x0, x5), 0.89997619f);
+ x[6] = VMUL_S(VSUB(x4, x3), 1.30656302f);
+ x[7] = VMUL_S(VSUB(xt, x7), 2.56291556f);
+ }
+
+ if (k > n - 3)
+ {
+#if HAVE_SSE
+#define VSAVE2(i, v) _mm_storel_pi((__m64 *)(void*)&y[i*18], v)
+#else /* HAVE_SSE */
+#define VSAVE2(i, v) vst1_f32((float32_t *)&y[i*18], vget_low_f32(v))
+#endif /* HAVE_SSE */
+ for (i = 0; i < 7; i++, y += 4*18)
+ {
+ f4 s = VADD(t[3][i], t[3][i + 1]);
+ VSAVE2(0, t[0][i]);
+ VSAVE2(1, VADD(t[2][i], s));
+ VSAVE2(2, VADD(t[1][i], t[1][i + 1]));
+ VSAVE2(3, VADD(t[2][1 + i], s));
+ }
+ VSAVE2(0, t[0][7]);
+ VSAVE2(1, VADD(t[2][7], t[3][7]));
+ VSAVE2(2, t[1][7]);
+ VSAVE2(3, t[3][7]);
+ } else
+ {
+#define VSAVE4(i, v) VSTORE(&y[i*18], v)
+ for (i = 0; i < 7; i++, y += 4*18)
+ {
+ f4 s = VADD(t[3][i], t[3][i + 1]);
+ VSAVE4(0, t[0][i]);
+ VSAVE4(1, VADD(t[2][i], s));
+ VSAVE4(2, VADD(t[1][i], t[1][i + 1]));
+ VSAVE4(3, VADD(t[2][1 + i], s));
+ }
+ VSAVE4(0, t[0][7]);
+ VSAVE4(1, VADD(t[2][7], t[3][7]));
+ VSAVE4(2, t[1][7]);
+ VSAVE4(3, t[3][7]);
+ }
+ } else
+#endif /* HAVE_SIMD */
+#ifdef MINIMP3_ONLY_SIMD
+ {}
+#else /* MINIMP3_ONLY_SIMD */
+ for (; k < n; k++)
+ {
+ float t[4][8], *x, *y = grbuf + k;
+
+ for (x = t[0], i = 0; i < 8; i++, x++)
+ {
+ float x0 = y[i*18];
+ float x1 = y[(15 - i)*18];
+ float x2 = y[(16 + i)*18];
+ float x3 = y[(31 - i)*18];
+ float t0 = x0 + x3;
+ float t1 = x1 + x2;
+ float t2 = (x1 - x2)*g_sec[3*i + 0];
+ float t3 = (x0 - x3)*g_sec[3*i + 1];
+ x[0] = t0 + t1;
+ x[8] = (t0 - t1)*g_sec[3*i + 2];
+ x[16] = t3 + t2;
+ x[24] = (t3 - t2)*g_sec[3*i + 2];
+ }
+ for (x = t[0], i = 0; i < 4; i++, x += 8)
+ {
+ float x0 = x[0], x1 = x[1], x2 = x[2], x3 = x[3], x4 = x[4], x5 = x[5], x6 = x[6], x7 = x[7], xt;
+ xt = x0 - x7; x0 += x7;
+ x7 = x1 - x6; x1 += x6;
+ x6 = x2 - x5; x2 += x5;
+ x5 = x3 - x4; x3 += x4;
+ x4 = x0 - x3; x0 += x3;
+ x3 = x1 - x2; x1 += x2;
+ x[0] = x0 + x1;
+ x[4] = (x0 - x1)*0.70710677f;
+ x5 = x5 + x6;
+ x6 = (x6 + x7)*0.70710677f;
+ x7 = x7 + xt;
+ x3 = (x3 + x4)*0.70710677f;
+ x5 -= x7*0.198912367f; /* rotate by PI/8 */
+ x7 += x5*0.382683432f;
+ x5 -= x7*0.198912367f;
+ x0 = xt - x6; xt += x6;
+ x[1] = (xt + x7)*0.50979561f;
+ x[2] = (x4 + x3)*0.54119611f;
+ x[3] = (x0 - x5)*0.60134488f;
+ x[5] = (x0 + x5)*0.89997619f;
+ x[6] = (x4 - x3)*1.30656302f;
+ x[7] = (xt - x7)*2.56291556f;
+
+ }
+ for (i = 0; i < 7; i++, y += 4*18)
+ {
+ y[0*18] = t[0][i];
+ y[1*18] = t[2][i] + t[3][i] + t[3][i + 1];
+ y[2*18] = t[1][i] + t[1][i + 1];
+ y[3*18] = t[2][i + 1] + t[3][i] + t[3][i + 1];
+ }
+ y[0*18] = t[0][7];
+ y[1*18] = t[2][7] + t[3][7];
+ y[2*18] = t[1][7];
+ y[3*18] = t[3][7];
+ }
+#endif /* MINIMP3_ONLY_SIMD */
+}
+
+#ifndef MINIMP3_FLOAT_OUTPUT
+static int16_t mp3d_scale_pcm(float sample)
+{
+#if HAVE_ARMV6
+ int32_t s32 = (int32_t)(sample + .5f);
+ s32 -= (s32 < 0);
+ int16_t s = (int16_t)minimp3_clip_int16_arm(s32);
+#else
+ if (sample >= 32766.5) return (int16_t) 32767;
+ if (sample <= -32767.5) return (int16_t)-32768;
+ int16_t s = (int16_t)(sample + .5f);
+ s -= (s < 0); /* away from zero, to be compliant */
+#endif
+ return s;
+}
+#else /* MINIMP3_FLOAT_OUTPUT */
+static float mp3d_scale_pcm(float sample)
+{
+ return sample*(1.f/32768.f);
+}
+#endif /* MINIMP3_FLOAT_OUTPUT */
+
+static void mp3d_synth_pair(mp3d_sample_t *pcm, int nch, const float *z)
+{
+ float a;
+ a = (z[14*64] - z[ 0]) * 29;
+ a += (z[ 1*64] + z[13*64]) * 213;
+ a += (z[12*64] - z[ 2*64]) * 459;
+ a += (z[ 3*64] + z[11*64]) * 2037;
+ a += (z[10*64] - z[ 4*64]) * 5153;
+ a += (z[ 5*64] + z[ 9*64]) * 6574;
+ a += (z[ 8*64] - z[ 6*64]) * 37489;
+ a += z[ 7*64] * 75038;
+ pcm[0] = mp3d_scale_pcm(a);
+
+ z += 2;
+ a = z[14*64] * 104;
+ a += z[12*64] * 1567;
+ a += z[10*64] * 9727;
+ a += z[ 8*64] * 64019;
+ a += z[ 6*64] * -9975;
+ a += z[ 4*64] * -45;
+ a += z[ 2*64] * 146;
+ a += z[ 0*64] * -5;
+ pcm[16*nch] = mp3d_scale_pcm(a);
+}
+
+static void mp3d_synth(float *xl, mp3d_sample_t *dstl, int nch, float *lins)
+{
+ int i;
+ float *xr = xl + 576*(nch - 1);
+ mp3d_sample_t *dstr = dstl + (nch - 1);
+
+ static const float g_win[] = {
+ -1,26,-31,208,218,401,-519,2063,2000,4788,-5517,7134,5959,35640,-39336,74992,
+ -1,24,-35,202,222,347,-581,2080,1952,4425,-5879,7640,5288,33791,-41176,74856,
+ -1,21,-38,196,225,294,-645,2087,1893,4063,-6237,8092,4561,31947,-43006,74630,
+ -1,19,-41,190,227,244,-711,2085,1822,3705,-6589,8492,3776,30112,-44821,74313,
+ -1,17,-45,183,228,197,-779,2075,1739,3351,-6935,8840,2935,28289,-46617,73908,
+ -1,16,-49,176,228,153,-848,2057,1644,3004,-7271,9139,2037,26482,-48390,73415,
+ -2,14,-53,169,227,111,-919,2032,1535,2663,-7597,9389,1082,24694,-50137,72835,
+ -2,13,-58,161,224,72,-991,2001,1414,2330,-7910,9592,70,22929,-51853,72169,
+ -2,11,-63,154,221,36,-1064,1962,1280,2006,-8209,9750,-998,21189,-53534,71420,
+ -2,10,-68,147,215,2,-1137,1919,1131,1692,-8491,9863,-2122,19478,-55178,70590,
+ -3,9,-73,139,208,-29,-1210,1870,970,1388,-8755,9935,-3300,17799,-56778,69679,
+ -3,8,-79,132,200,-57,-1283,1817,794,1095,-8998,9966,-4533,16155,-58333,68692,
+ -4,7,-85,125,189,-83,-1356,1759,605,814,-9219,9959,-5818,14548,-59838,67629,
+ -4,7,-91,117,177,-106,-1428,1698,402,545,-9416,9916,-7154,12980,-61289,66494,
+ -5,6,-97,111,163,-127,-1498,1634,185,288,-9585,9838,-8540,11455,-62684,65290
+ };
+ float *zlin = lins + 15*64;
+ const float *w = g_win;
+
+ zlin[4*15] = xl[18*16];
+ zlin[4*15 + 1] = xr[18*16];
+ zlin[4*15 + 2] = xl[0];
+ zlin[4*15 + 3] = xr[0];
+
+ zlin[4*31] = xl[1 + 18*16];
+ zlin[4*31 + 1] = xr[1 + 18*16];
+ zlin[4*31 + 2] = xl[1];
+ zlin[4*31 + 3] = xr[1];
+
+ mp3d_synth_pair(dstr, nch, lins + 4*15 + 1);
+ mp3d_synth_pair(dstr + 32*nch, nch, lins + 4*15 + 64 + 1);
+ mp3d_synth_pair(dstl, nch, lins + 4*15);
+ mp3d_synth_pair(dstl + 32*nch, nch, lins + 4*15 + 64);
+
+#if HAVE_SIMD
+ if (have_simd()) for (i = 14; i >= 0; i--)
+ {
+#define VLOAD(k) f4 w0 = VSET(*w++); f4 w1 = VSET(*w++); f4 vz = VLD(&zlin[4*i - 64*k]); f4 vy = VLD(&zlin[4*i - 64*(15 - k)]);
+#define V0(k) { VLOAD(k) b = VADD(VMUL(vz, w1), VMUL(vy, w0)) ; a = VSUB(VMUL(vz, w0), VMUL(vy, w1)); }
+#define V1(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vz, w0), VMUL(vy, w1))); }
+#define V2(k) { VLOAD(k) b = VADD(b, VADD(VMUL(vz, w1), VMUL(vy, w0))); a = VADD(a, VSUB(VMUL(vy, w1), VMUL(vz, w0))); }
+ f4 a, b;
+ zlin[4*i] = xl[18*(31 - i)];
+ zlin[4*i + 1] = xr[18*(31 - i)];
+ zlin[4*i + 2] = xl[1 + 18*(31 - i)];
+ zlin[4*i + 3] = xr[1 + 18*(31 - i)];
+ zlin[4*i + 64] = xl[1 + 18*(1 + i)];
+ zlin[4*i + 64 + 1] = xr[1 + 18*(1 + i)];
+ zlin[4*i - 64 + 2] = xl[18*(1 + i)];
+ zlin[4*i - 64 + 3] = xr[18*(1 + i)];
+
+ V0(0) V2(1) V1(2) V2(3) V1(4) V2(5) V1(6) V2(7)
+
+ {
+#ifndef MINIMP3_FLOAT_OUTPUT
+#if HAVE_SSE
+ static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
+ static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
+ __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)),
+ _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min)));
+ dstr[(15 - i)*nch] = _mm_extract_epi16(pcm8, 1);
+ dstr[(17 + i)*nch] = _mm_extract_epi16(pcm8, 5);
+ dstl[(15 - i)*nch] = _mm_extract_epi16(pcm8, 0);
+ dstl[(17 + i)*nch] = _mm_extract_epi16(pcm8, 4);
+ dstr[(47 - i)*nch] = _mm_extract_epi16(pcm8, 3);
+ dstr[(49 + i)*nch] = _mm_extract_epi16(pcm8, 7);
+ dstl[(47 - i)*nch] = _mm_extract_epi16(pcm8, 2);
+ dstl[(49 + i)*nch] = _mm_extract_epi16(pcm8, 6);
+#else /* HAVE_SSE */
+ int16x4_t pcma, pcmb;
+ a = VADD(a, VSET(0.5f));
+ b = VADD(b, VSET(0.5f));
+ pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0)))));
+ pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0)))));
+ vst1_lane_s16(dstr + (15 - i)*nch, pcma, 1);
+ vst1_lane_s16(dstr + (17 + i)*nch, pcmb, 1);
+ vst1_lane_s16(dstl + (15 - i)*nch, pcma, 0);
+ vst1_lane_s16(dstl + (17 + i)*nch, pcmb, 0);
+ vst1_lane_s16(dstr + (47 - i)*nch, pcma, 3);
+ vst1_lane_s16(dstr + (49 + i)*nch, pcmb, 3);
+ vst1_lane_s16(dstl + (47 - i)*nch, pcma, 2);
+ vst1_lane_s16(dstl + (49 + i)*nch, pcmb, 2);
+#endif /* HAVE_SSE */
+
+#else /* MINIMP3_FLOAT_OUTPUT */
+
+ static const f4 g_scale = { 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f, 1.0f/32768.0f };
+ a = VMUL(a, g_scale);
+ b = VMUL(b, g_scale);
+#if HAVE_SSE
+ _mm_store_ss(dstr + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 1, 1, 1)));
+ _mm_store_ss(dstr + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(1, 1, 1, 1)));
+ _mm_store_ss(dstl + (15 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 0, 0, 0)));
+ _mm_store_ss(dstl + (17 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(0, 0, 0, 0)));
+ _mm_store_ss(dstr + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 3, 3, 3)));
+ _mm_store_ss(dstr + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 3, 3, 3)));
+ _mm_store_ss(dstl + (47 - i)*nch, _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)));
+ _mm_store_ss(dstl + (49 + i)*nch, _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 2, 2, 2)));
+#else /* HAVE_SSE */
+ vst1q_lane_f32(dstr + (15 - i)*nch, a, 1);
+ vst1q_lane_f32(dstr + (17 + i)*nch, b, 1);
+ vst1q_lane_f32(dstl + (15 - i)*nch, a, 0);
+ vst1q_lane_f32(dstl + (17 + i)*nch, b, 0);
+ vst1q_lane_f32(dstr + (47 - i)*nch, a, 3);
+ vst1q_lane_f32(dstr + (49 + i)*nch, b, 3);
+ vst1q_lane_f32(dstl + (47 - i)*nch, a, 2);
+ vst1q_lane_f32(dstl + (49 + i)*nch, b, 2);
+#endif /* HAVE_SSE */
+#endif /* MINIMP3_FLOAT_OUTPUT */
+ }
+ } else
+#endif /* HAVE_SIMD */
+#ifdef MINIMP3_ONLY_SIMD
+ {}
+#else /* MINIMP3_ONLY_SIMD */
+ for (i = 14; i >= 0; i--)
+ {
+#define LOAD(k) float w0 = *w++; float w1 = *w++; float *vz = &zlin[4*i - k*64]; float *vy = &zlin[4*i - (15 - k)*64];
+#define S0(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] = vz[j]*w1 + vy[j]*w0, a[j] = vz[j]*w0 - vy[j]*w1; }
+#define S1(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vz[j]*w0 - vy[j]*w1; }
+#define S2(k) { int j; LOAD(k); for (j = 0; j < 4; j++) b[j] += vz[j]*w1 + vy[j]*w0, a[j] += vy[j]*w1 - vz[j]*w0; }
+ float a[4], b[4];
+
+ zlin[4*i] = xl[18*(31 - i)];
+ zlin[4*i + 1] = xr[18*(31 - i)];
+ zlin[4*i + 2] = xl[1 + 18*(31 - i)];
+ zlin[4*i + 3] = xr[1 + 18*(31 - i)];
+ zlin[4*(i + 16)] = xl[1 + 18*(1 + i)];
+ zlin[4*(i + 16) + 1] = xr[1 + 18*(1 + i)];
+ zlin[4*(i - 16) + 2] = xl[18*(1 + i)];
+ zlin[4*(i - 16) + 3] = xr[18*(1 + i)];
+
+ S0(0) S2(1) S1(2) S2(3) S1(4) S2(5) S1(6) S2(7)
+
+ dstr[(15 - i)*nch] = mp3d_scale_pcm(a[1]);
+ dstr[(17 + i)*nch] = mp3d_scale_pcm(b[1]);
+ dstl[(15 - i)*nch] = mp3d_scale_pcm(a[0]);
+ dstl[(17 + i)*nch] = mp3d_scale_pcm(b[0]);
+ dstr[(47 - i)*nch] = mp3d_scale_pcm(a[3]);
+ dstr[(49 + i)*nch] = mp3d_scale_pcm(b[3]);
+ dstl[(47 - i)*nch] = mp3d_scale_pcm(a[2]);
+ dstl[(49 + i)*nch] = mp3d_scale_pcm(b[2]);
+ }
+#endif /* MINIMP3_ONLY_SIMD */
+}
+
+static void mp3d_synth_granule(float *qmf_state, float *grbuf, int nbands, int nch, mp3d_sample_t *pcm, float *lins)
+{
+ int i;
+ for (i = 0; i < nch; i++)
+ {
+ mp3d_DCT_II(grbuf + 576*i, nbands);
+ }
+
+ memcpy(lins, qmf_state, sizeof(float)*15*64);
+
+ for (i = 0; i < nbands; i += 2)
+ {
+ mp3d_synth(grbuf + i, pcm + 32*nch*i, nch, lins + i*64);
+ }
+#ifndef MINIMP3_NONSTANDARD_BUT_LOGICAL
+ if (nch == 1)
+ {
+ for (i = 0; i < 15*64; i += 2)
+ {
+ qmf_state[i] = lins[nbands*64 + i];
+ }
+ } else
+#endif /* MINIMP3_NONSTANDARD_BUT_LOGICAL */
+ {
+ memcpy(qmf_state, lins + nbands*64, sizeof(float)*15*64);
+ }
+}
+
+static int mp3d_match_frame(const uint8_t *hdr, int mp3_bytes, int frame_bytes)
+{
+ int i, nmatch;
+ for (i = 0, nmatch = 0; nmatch < MAX_FRAME_SYNC_MATCHES; nmatch++)
+ {
+ i += hdr_frame_bytes(hdr + i, frame_bytes) + hdr_padding(hdr + i);
+ if (i + HDR_SIZE > mp3_bytes)
+ return nmatch > 0;
+ if (!hdr_compare(hdr, hdr + i))
+ return 0;
+ }
+ return 1;
+}
+
+static int mp3d_find_frame(const uint8_t *mp3, int mp3_bytes, int *free_format_bytes, int *ptr_frame_bytes)
+{
+ int i, k;
+ for (i = 0; i < mp3_bytes - HDR_SIZE; i++, mp3++)
+ {
+ if (hdr_valid(mp3))
+ {
+ int frame_bytes = hdr_frame_bytes(mp3, *free_format_bytes);
+ int frame_and_padding = frame_bytes + hdr_padding(mp3);
+
+ for (k = HDR_SIZE; !frame_bytes && k < MAX_FREE_FORMAT_FRAME_SIZE && i + 2*k < mp3_bytes - HDR_SIZE; k++)
+ {
+ if (hdr_compare(mp3, mp3 + k))
+ {
+ int fb = k - hdr_padding(mp3);
+ int nextfb = fb + hdr_padding(mp3 + k);
+ if (i + k + nextfb + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + k + nextfb))
+ continue;
+ frame_and_padding = k;
+ frame_bytes = fb;
+ *free_format_bytes = fb;
+ }
+ }
+ if ((frame_bytes && i + frame_and_padding <= mp3_bytes &&
+ mp3d_match_frame(mp3, mp3_bytes - i, frame_bytes)) ||
+ (!i && frame_and_padding == mp3_bytes))
+ {
+ *ptr_frame_bytes = frame_and_padding;
+ return i;
+ }
+ *free_format_bytes = 0;
+ }
+ }
+ *ptr_frame_bytes = 0;
+ return mp3_bytes;
+}
+
+void mp3dec_init(mp3dec_t *dec)
+{
+ dec->header[0] = 0;
+}
+
+int mp3dec_decode_frame(mp3dec_t *dec, const uint8_t *mp3, int mp3_bytes, mp3d_sample_t *pcm, mp3dec_frame_info_t *info)
+{
+ int i = 0, igr, frame_size = 0, success = 1;
+ const uint8_t *hdr;
+ bs_t bs_frame[1];
+ mp3dec_scratch_t scratch;
+
+ if (mp3_bytes > 4 && dec->header[0] == 0xff && hdr_compare(dec->header, mp3))
+ {
+ frame_size = hdr_frame_bytes(mp3, dec->free_format_bytes) + hdr_padding(mp3);
+ if (frame_size != mp3_bytes && (frame_size + HDR_SIZE > mp3_bytes || !hdr_compare(mp3, mp3 + frame_size)))
+ {
+ frame_size = 0;
+ }
+ }
+ if (!frame_size)
+ {
+ memset(dec, 0, sizeof(mp3dec_t));
+ i = mp3d_find_frame(mp3, mp3_bytes, &dec->free_format_bytes, &frame_size);
+ if (!frame_size || i + frame_size > mp3_bytes)
+ {
+ info->frame_bytes = i;
+ return 0;
+ }
+ }
+
+ hdr = mp3 + i;
+ memcpy(dec->header, hdr, HDR_SIZE);
+ info->frame_bytes = i + frame_size;
+ info->frame_offset = i;
+ info->channels = HDR_IS_MONO(hdr) ? 1 : 2;
+ info->hz = hdr_sample_rate_hz(hdr);
+ info->layer = 4 - HDR_GET_LAYER(hdr);
+ info->bitrate_kbps = hdr_bitrate_kbps(hdr);
+
+ if (!pcm)
+ {
+ return hdr_frame_samples(hdr);
+ }
+
+ bs_init(bs_frame, hdr + HDR_SIZE, frame_size - HDR_SIZE);
+ if (HDR_IS_CRC(hdr))
+ {
+ get_bits(bs_frame, 16);
+ }
+
+ if (info->layer == 3)
+ {
+ int main_data_begin = L3_read_side_info(bs_frame, scratch.gr_info, hdr);
+ if (main_data_begin < 0 || bs_frame->pos > bs_frame->limit)
+ {
+ mp3dec_init(dec);
+ return 0;
+ }
+ success = L3_restore_reservoir(dec, bs_frame, &scratch, main_data_begin);
+ if (success)
+ {
+ for (igr = 0; igr < (HDR_TEST_MPEG1(hdr) ? 2 : 1); igr++, pcm += 576*info->channels)
+ {
+ memset(scratch.grbuf[0], 0, 576*2*sizeof(float));
+ L3_decode(dec, &scratch, scratch.gr_info + igr*info->channels, info->channels);
+ mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 18, info->channels, pcm, scratch.syn[0]);
+ }
+ }
+ L3_save_reservoir(dec, &scratch);
+ } else
+ {
+#ifdef MINIMP3_ONLY_MP3
+ return 0;
+#else /* MINIMP3_ONLY_MP3 */
+ L12_scale_info sci[1];
+ L12_read_scale_info(hdr, bs_frame, sci);
+
+ memset(scratch.grbuf[0], 0, 576*2*sizeof(float));
+ for (i = 0, igr = 0; igr < 3; igr++)
+ {
+ if (12 == (i += L12_dequantize_granule(scratch.grbuf[0] + i, bs_frame, sci, info->layer | 1)))
+ {
+ i = 0;
+ L12_apply_scf_384(sci, sci->scf + igr, scratch.grbuf[0]);
+ mp3d_synth_granule(dec->qmf_state, scratch.grbuf[0], 12, info->channels, pcm, scratch.syn[0]);
+ memset(scratch.grbuf[0], 0, 576*2*sizeof(float));
+ pcm += 384*info->channels;
+ }
+ if (bs_frame->pos > bs_frame->limit)
+ {
+ mp3dec_init(dec);
+ return 0;
+ }
+ }
+#endif /* MINIMP3_ONLY_MP3 */
+ }
+ return success*hdr_frame_samples(dec->header);
+}
+
+#ifdef MINIMP3_FLOAT_OUTPUT
+void mp3dec_f32_to_s16(const float *in, int16_t *out, int num_samples)
+{
+ int i = 0;
+#if HAVE_SIMD
+ int aligned_count = num_samples & ~7;
+ for(; i < aligned_count; i += 8)
+ {
+ static const f4 g_scale = { 32768.0f, 32768.0f, 32768.0f, 32768.0f };
+ f4 a = VMUL(VLD(&in[i ]), g_scale);
+ f4 b = VMUL(VLD(&in[i+4]), g_scale);
+#if HAVE_SSE
+ static const f4 g_max = { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
+ static const f4 g_min = { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
+ __m128i pcm8 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(a, g_max), g_min)),
+ _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(b, g_max), g_min)));
+ out[i ] = _mm_extract_epi16(pcm8, 0);
+ out[i+1] = _mm_extract_epi16(pcm8, 1);
+ out[i+2] = _mm_extract_epi16(pcm8, 2);
+ out[i+3] = _mm_extract_epi16(pcm8, 3);
+ out[i+4] = _mm_extract_epi16(pcm8, 4);
+ out[i+5] = _mm_extract_epi16(pcm8, 5);
+ out[i+6] = _mm_extract_epi16(pcm8, 6);
+ out[i+7] = _mm_extract_epi16(pcm8, 7);
+#else /* HAVE_SSE */
+ int16x4_t pcma, pcmb;
+ a = VADD(a, VSET(0.5f));
+ b = VADD(b, VSET(0.5f));
+ pcma = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(a), vreinterpretq_s32_u32(vcltq_f32(a, VSET(0)))));
+ pcmb = vqmovn_s32(vqaddq_s32(vcvtq_s32_f32(b), vreinterpretq_s32_u32(vcltq_f32(b, VSET(0)))));
+ vst1_lane_s16(out+i , pcma, 0);
+ vst1_lane_s16(out+i+1, pcma, 1);
+ vst1_lane_s16(out+i+2, pcma, 2);
+ vst1_lane_s16(out+i+3, pcma, 3);
+ vst1_lane_s16(out+i+4, pcmb, 0);
+ vst1_lane_s16(out+i+5, pcmb, 1);
+ vst1_lane_s16(out+i+6, pcmb, 2);
+ vst1_lane_s16(out+i+7, pcmb, 3);
+#endif /* HAVE_SSE */
+ }
+#endif /* HAVE_SIMD */
+ for(; i < num_samples; i++)
+ {
+ float sample = in[i] * 32768.0f;
+ if (sample >= 32766.5)
+ out[i] = (int16_t) 32767;
+ else if (sample <= -32767.5)
+ out[i] = (int16_t)-32768;
+ else
+ {
+ int16_t s = (int16_t)(sample + .5f);
+ s -= (s < 0); /* away from zero, to be compliant */
+ out[i] = s;
+ }
+ }
+}
+#endif /* MINIMP3_FLOAT_OUTPUT */
+#endif /* MINIMP3_IMPLEMENTATION && !_MINIMP3_IMPLEMENTATION_GUARD */
diff --git a/thirdparty/minimp3/minimp3_ex.h b/thirdparty/minimp3/minimp3_ex.h
new file mode 100644
index 0000000000..e29dd15b2e
--- /dev/null
+++ b/thirdparty/minimp3/minimp3_ex.h
@@ -0,0 +1,1394 @@
+#ifndef MINIMP3_EXT_H
+#define MINIMP3_EXT_H
+/*
+ https://github.com/lieff/minimp3
+ To the extent possible under law, the author(s) have dedicated all copyright and related and neighboring rights to this software to the public domain worldwide.
+ This software is distributed without any warranty.
+ See <http://creativecommons.org/publicdomain/zero/1.0/>.
+*/
+#include "minimp3.h"
+
+/* flags for mp3dec_ex_open_* functions */
+#define MP3D_SEEK_TO_BYTE 0 /* mp3dec_ex_seek seeks to byte in stream */
+#define MP3D_SEEK_TO_SAMPLE 1 /* mp3dec_ex_seek precisely seeks to sample using index (created during duration calculation scan or when mp3dec_ex_seek called) */
+#define MP3D_DO_NOT_SCAN 2 /* do not scan whole stream for duration if vbrtag not found, mp3dec_ex_t::samples will be filled only if mp3dec_ex_t::vbr_tag_found == 1 */
+#ifdef MINIMP3_ALLOW_MONO_STEREO_TRANSITION
+#define MP3D_ALLOW_MONO_STEREO_TRANSITION 4
+#define MP3D_FLAGS_MASK 7
+#else
+#define MP3D_FLAGS_MASK 3
+#endif
+
+/* compile-time config */
+#define MINIMP3_PREDECODE_FRAMES 2 /* frames to pre-decode and skip after seek (to fill internal structures) */
+/*#define MINIMP3_SEEK_IDX_LINEAR_SEARCH*/ /* define to use linear index search instead of binary search on seek */
+#define MINIMP3_IO_SIZE (128*1024) /* io buffer size for streaming functions, must be greater than MINIMP3_BUF_SIZE */
+#define MINIMP3_BUF_SIZE (16*1024) /* buffer which can hold minimum 10 consecutive mp3 frames (~16KB) worst case */
+/*#define MINIMP3_SCAN_LIMIT (256*1024)*/ /* how many bytes will be scanned to search first valid mp3 frame, to prevent stall on large non-mp3 files */
+#define MINIMP3_ENABLE_RING 0 /* WIP enable hardware magic ring buffer if available, to make less input buffer memmove(s) in callback IO mode */
+
+/* return error codes */
+#define MP3D_E_PARAM -1
+#define MP3D_E_MEMORY -2
+#define MP3D_E_IOERROR -3
+#define MP3D_E_USER -4 /* can be used to stop processing from callbacks without indicating specific error */
+#define MP3D_E_DECODE -5 /* decode error which can't be safely skipped, such as sample rate, layer and channels change */
+
+typedef struct
+{
+ mp3d_sample_t *buffer;
+ size_t samples; /* channels included, byte size = samples*sizeof(mp3d_sample_t) */
+ int channels, hz, layer, avg_bitrate_kbps;
+} mp3dec_file_info_t;
+
+typedef struct
+{
+ const uint8_t *buffer;
+ size_t size;
+} mp3dec_map_info_t;
+
+typedef struct
+{
+ uint64_t sample;
+ uint64_t offset;
+} mp3dec_frame_t;
+
+typedef struct
+{
+ mp3dec_frame_t *frames;
+ size_t num_frames, capacity;
+} mp3dec_index_t;
+
+typedef size_t (*MP3D_READ_CB)(void *buf, size_t size, void *user_data);
+typedef int (*MP3D_SEEK_CB)(uint64_t position, void *user_data);
+
+typedef struct
+{
+ MP3D_READ_CB read;
+ void *read_data;
+ MP3D_SEEK_CB seek;
+ void *seek_data;
+} mp3dec_io_t;
+
+typedef struct
+{
+ mp3dec_t mp3d;
+ mp3dec_map_info_t file;
+ mp3dec_io_t *io;
+ mp3dec_index_t index;
+ uint64_t offset, samples, detected_samples, cur_sample, start_offset, end_offset;
+ mp3dec_frame_info_t info;
+ mp3d_sample_t buffer[MINIMP3_MAX_SAMPLES_PER_FRAME];
+ size_t input_consumed, input_filled;
+ int is_file, flags, vbr_tag_found, indexes_built;
+ int free_format_bytes;
+ int buffer_samples, buffer_consumed, to_skip, start_delay;
+ int last_error;
+} mp3dec_ex_t;
+
+typedef int (*MP3D_ITERATE_CB)(void *user_data, const uint8_t *frame, int frame_size, int free_format_bytes, size_t buf_size, uint64_t offset, mp3dec_frame_info_t *info);
+typedef int (*MP3D_PROGRESS_CB)(void *user_data, size_t file_size, uint64_t offset, mp3dec_frame_info_t *info);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* detect mp3/mpa format */
+int mp3dec_detect_buf(const uint8_t *buf, size_t buf_size);
+int mp3dec_detect_cb(mp3dec_io_t *io, uint8_t *buf, size_t buf_size);
+/* decode whole buffer block */
+int mp3dec_load_buf(mp3dec_t *dec, const uint8_t *buf, size_t buf_size, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data);
+int mp3dec_load_cb(mp3dec_t *dec, mp3dec_io_t *io, uint8_t *buf, size_t buf_size, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data);
+/* iterate through frames */
+int mp3dec_iterate_buf(const uint8_t *buf, size_t buf_size, MP3D_ITERATE_CB callback, void *user_data);
+int mp3dec_iterate_cb(mp3dec_io_t *io, uint8_t *buf, size_t buf_size, MP3D_ITERATE_CB callback, void *user_data);
+/* streaming decoder with seeking capability */
+int mp3dec_ex_open_buf(mp3dec_ex_t *dec, const uint8_t *buf, size_t buf_size, int flags);
+int mp3dec_ex_open_cb(mp3dec_ex_t *dec, mp3dec_io_t *io, int flags);
+void mp3dec_ex_close(mp3dec_ex_t *dec);
+int mp3dec_ex_seek(mp3dec_ex_t *dec, uint64_t position);
+size_t mp3dec_ex_read_frame(mp3dec_ex_t *dec, mp3d_sample_t **buf, mp3dec_frame_info_t *frame_info, size_t max_samples);
+size_t mp3dec_ex_read(mp3dec_ex_t *dec, mp3d_sample_t *buf, size_t samples);
+#ifndef MINIMP3_NO_STDIO
+/* stdio versions of file detect, load, iterate and stream */
+int mp3dec_detect(const char *file_name);
+int mp3dec_load(mp3dec_t *dec, const char *file_name, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data);
+int mp3dec_iterate(const char *file_name, MP3D_ITERATE_CB callback, void *user_data);
+int mp3dec_ex_open(mp3dec_ex_t *dec, const char *file_name, int flags);
+#ifdef _WIN32
+int mp3dec_detect_w(const wchar_t *file_name);
+int mp3dec_load_w(mp3dec_t *dec, const wchar_t *file_name, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data);
+int mp3dec_iterate_w(const wchar_t *file_name, MP3D_ITERATE_CB callback, void *user_data);
+int mp3dec_ex_open_w(mp3dec_ex_t *dec, const wchar_t *file_name, int flags);
+#endif
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /*MINIMP3_EXT_H*/
+
+#ifdef MINIMP3_IMPLEMENTATION
+#include <limits.h>
+
+static void mp3dec_skip_id3v1(const uint8_t *buf, size_t *pbuf_size)
+{
+ size_t buf_size = *pbuf_size;
+#ifndef MINIMP3_NOSKIP_ID3V1
+ if (buf_size >= 128 && !memcmp(buf + buf_size - 128, "TAG", 3))
+ {
+ buf_size -= 128;
+ if (buf_size >= 227 && !memcmp(buf + buf_size - 227, "TAG+", 4))
+ buf_size -= 227;
+ }
+#endif
+#ifndef MINIMP3_NOSKIP_APEV2
+ if (buf_size > 32 && !memcmp(buf + buf_size - 32, "APETAGEX", 8))
+ {
+ buf_size -= 32;
+ const uint8_t *tag = buf + buf_size + 8 + 4;
+ uint32_t tag_size = (uint32_t)(tag[3] << 24) | (tag[2] << 16) | (tag[1] << 8) | tag[0];
+ if (buf_size >= tag_size)
+ buf_size -= tag_size;
+ }
+#endif
+ *pbuf_size = buf_size;
+}
+
+static size_t mp3dec_skip_id3v2(const uint8_t *buf, size_t buf_size)
+{
+#define MINIMP3_ID3_DETECT_SIZE 10
+#ifndef MINIMP3_NOSKIP_ID3V2
+ if (buf_size >= MINIMP3_ID3_DETECT_SIZE && !memcmp(buf, "ID3", 3) && !((buf[5] & 15) || (buf[6] & 0x80) || (buf[7] & 0x80) || (buf[8] & 0x80) || (buf[9] & 0x80)))
+ {
+ size_t id3v2size = (((buf[6] & 0x7f) << 21) | ((buf[7] & 0x7f) << 14) | ((buf[8] & 0x7f) << 7) | (buf[9] & 0x7f)) + 10;
+ if ((buf[5] & 16))
+ id3v2size += 10; /* footer */
+ return id3v2size;
+ }
+#endif
+ return 0;
+}
+
+static void mp3dec_skip_id3(const uint8_t **pbuf, size_t *pbuf_size)
+{
+ uint8_t *buf = (uint8_t *)(*pbuf);
+ size_t buf_size = *pbuf_size;
+ size_t id3v2size = mp3dec_skip_id3v2(buf, buf_size);
+ if (id3v2size)
+ {
+ if (id3v2size >= buf_size)
+ id3v2size = buf_size;
+ buf += id3v2size;
+ buf_size -= id3v2size;
+ }
+ mp3dec_skip_id3v1(buf, &buf_size);
+ *pbuf = (const uint8_t *)buf;
+ *pbuf_size = buf_size;
+}
+
+static int mp3dec_check_vbrtag(const uint8_t *frame, int frame_size, uint32_t *frames, int *delay, int *padding)
+{
+ static const char g_xing_tag[4] = { 'X', 'i', 'n', 'g' };
+ static const char g_info_tag[4] = { 'I', 'n', 'f', 'o' };
+#define FRAMES_FLAG 1
+#define BYTES_FLAG 2
+#define TOC_FLAG 4
+#define VBR_SCALE_FLAG 8
+ /* Side info offsets after header:
+ / Mono Stereo
+ / MPEG1 17 32
+ / MPEG2 & 2.5 9 17*/
+ bs_t bs[1];
+ L3_gr_info_t gr_info[4];
+ bs_init(bs, frame + HDR_SIZE, frame_size - HDR_SIZE);
+ if (HDR_IS_CRC(frame))
+ get_bits(bs, 16);
+ if (L3_read_side_info(bs, gr_info, frame) < 0)
+ return 0; /* side info corrupted */
+
+ const uint8_t *tag = frame + HDR_SIZE + bs->pos/8;
+ if (memcmp(g_xing_tag, tag, 4) && memcmp(g_info_tag, tag, 4))
+ return 0;
+ int flags = tag[7];
+ if (!((flags & FRAMES_FLAG)))
+ return -1;
+ tag += 8;
+ *frames = (uint32_t)(tag[0] << 24) | (tag[1] << 16) | (tag[2] << 8) | tag[3];
+ tag += 4;
+ if (flags & BYTES_FLAG)
+ tag += 4;
+ if (flags & TOC_FLAG)
+ tag += 100;
+ if (flags & VBR_SCALE_FLAG)
+ tag += 4;
+ *delay = *padding = 0;
+ if (*tag)
+ { /* extension, LAME, Lavc, etc. Should be the same structure. */
+ tag += 21;
+ if (tag - frame + 14 >= frame_size)
+ return 0;
+ *delay = ((tag[0] << 4) | (tag[1] >> 4)) + (528 + 1);
+ *padding = (((tag[1] & 0xF) << 8) | tag[2]) - (528 + 1);
+ }
+ return 1;
+}
+
+int mp3dec_detect_buf(const uint8_t *buf, size_t buf_size)
+{
+ return mp3dec_detect_cb(0, (uint8_t *)buf, buf_size);
+}
+
+int mp3dec_detect_cb(mp3dec_io_t *io, uint8_t *buf, size_t buf_size)
+{
+ if (!buf || (size_t)-1 == buf_size || (io && buf_size < MINIMP3_BUF_SIZE))
+ return MP3D_E_PARAM;
+ size_t filled = buf_size;
+ if (io)
+ {
+ if (io->seek(0, io->seek_data))
+ return MP3D_E_IOERROR;
+ filled = io->read(buf, MINIMP3_ID3_DETECT_SIZE, io->read_data);
+ if (filled > MINIMP3_ID3_DETECT_SIZE)
+ return MP3D_E_IOERROR;
+ }
+ if (filled < MINIMP3_ID3_DETECT_SIZE)
+ return MP3D_E_USER; /* too small, can't be mp3/mpa */
+ if (mp3dec_skip_id3v2(buf, filled))
+ return 0; /* id3v2 tag is enough evidence */
+ if (io)
+ {
+ size_t readed = io->read(buf + MINIMP3_ID3_DETECT_SIZE, buf_size - MINIMP3_ID3_DETECT_SIZE, io->read_data);
+ if (readed > (buf_size - MINIMP3_ID3_DETECT_SIZE))
+ return MP3D_E_IOERROR;
+ filled += readed;
+ if (filled < MINIMP3_BUF_SIZE)
+ mp3dec_skip_id3v1(buf, &filled);
+ } else
+ {
+ mp3dec_skip_id3v1(buf, &filled);
+ if (filled > MINIMP3_BUF_SIZE)
+ filled = MINIMP3_BUF_SIZE;
+ }
+ int free_format_bytes, frame_size;
+ mp3d_find_frame(buf, filled, &free_format_bytes, &frame_size);
+ if (frame_size)
+ return 0; /* MAX_FRAME_SYNC_MATCHES consecutive frames found */
+ return MP3D_E_USER;
+}
+
+int mp3dec_load_buf(mp3dec_t *dec, const uint8_t *buf, size_t buf_size, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{
+ return mp3dec_load_cb(dec, 0, (uint8_t *)buf, buf_size, info, progress_cb, user_data);
+}
+
+int mp3dec_load_cb(mp3dec_t *dec, mp3dec_io_t *io, uint8_t *buf, size_t buf_size, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{
+ if (!dec || !buf || !info || (size_t)-1 == buf_size || (io && buf_size < MINIMP3_BUF_SIZE))
+ return MP3D_E_PARAM;
+ uint64_t detected_samples = 0;
+ size_t orig_buf_size = buf_size;
+ int to_skip = 0;
+ mp3dec_frame_info_t frame_info;
+ memset(info, 0, sizeof(*info));
+ memset(&frame_info, 0, sizeof(frame_info));
+
+ /* skip id3 */
+ size_t filled = 0, consumed = 0;
+ int eof = 0, ret = 0;
+ if (io)
+ {
+ if (io->seek(0, io->seek_data))
+ return MP3D_E_IOERROR;
+ filled = io->read(buf, MINIMP3_ID3_DETECT_SIZE, io->read_data);
+ if (filled > MINIMP3_ID3_DETECT_SIZE)
+ return MP3D_E_IOERROR;
+ if (MINIMP3_ID3_DETECT_SIZE != filled)
+ return 0;
+ size_t id3v2size = mp3dec_skip_id3v2(buf, filled);
+ if (id3v2size)
+ {
+ if (io->seek(id3v2size, io->seek_data))
+ return MP3D_E_IOERROR;
+ filled = io->read(buf, buf_size, io->read_data);
+ if (filled > buf_size)
+ return MP3D_E_IOERROR;
+ } else
+ {
+ size_t readed = io->read(buf + MINIMP3_ID3_DETECT_SIZE, buf_size - MINIMP3_ID3_DETECT_SIZE, io->read_data);
+ if (readed > (buf_size - MINIMP3_ID3_DETECT_SIZE))
+ return MP3D_E_IOERROR;
+ filled += readed;
+ }
+ if (filled < MINIMP3_BUF_SIZE)
+ mp3dec_skip_id3v1(buf, &filled);
+ } else
+ {
+ mp3dec_skip_id3((const uint8_t **)&buf, &buf_size);
+ if (!buf_size)
+ return 0;
+ }
+ /* try to make allocation size assumption by first frame or vbr tag */
+ mp3dec_init(dec);
+ int samples;
+ do
+ {
+ uint32_t frames;
+ int i, delay, padding, free_format_bytes = 0, frame_size = 0;
+ const uint8_t *hdr;
+ if (io)
+ {
+ if (!eof && filled - consumed < MINIMP3_BUF_SIZE)
+ { /* keep minimum 10 consecutive mp3 frames (~16KB) worst case */
+ memmove(buf, buf + consumed, filled - consumed);
+ filled -= consumed;
+ consumed = 0;
+ size_t readed = io->read(buf + filled, buf_size - filled, io->read_data);
+ if (readed > (buf_size - filled))
+ return MP3D_E_IOERROR;
+ if (readed != (buf_size - filled))
+ eof = 1;
+ filled += readed;
+ if (eof)
+ mp3dec_skip_id3v1(buf, &filled);
+ }
+ i = mp3d_find_frame(buf + consumed, filled - consumed, &free_format_bytes, &frame_size);
+ consumed += i;
+ hdr = buf + consumed;
+ } else
+ {
+ i = mp3d_find_frame(buf, buf_size, &free_format_bytes, &frame_size);
+ buf += i;
+ buf_size -= i;
+ hdr = buf;
+ }
+ if (i && !frame_size)
+ continue;
+ if (!frame_size)
+ return 0;
+ frame_info.channels = HDR_IS_MONO(hdr) ? 1 : 2;
+ frame_info.hz = hdr_sample_rate_hz(hdr);
+ frame_info.layer = 4 - HDR_GET_LAYER(hdr);
+ frame_info.bitrate_kbps = hdr_bitrate_kbps(hdr);
+ frame_info.frame_bytes = frame_size;
+ samples = hdr_frame_samples(hdr)*frame_info.channels;
+ if (3 != frame_info.layer)
+ break;
+ int ret = mp3dec_check_vbrtag(hdr, frame_size, &frames, &delay, &padding);
+ if (ret > 0)
+ {
+ padding *= frame_info.channels;
+ to_skip = delay*frame_info.channels;
+ detected_samples = samples*(uint64_t)frames;
+ if (detected_samples >= (uint64_t)to_skip)
+ detected_samples -= to_skip;
+ if (padding > 0 && detected_samples >= (uint64_t)padding)
+ detected_samples -= padding;
+ if (!detected_samples)
+ return 0;
+ }
+ if (ret)
+ {
+ if (io)
+ {
+ consumed += frame_size;
+ } else
+ {
+ buf += frame_size;
+ buf_size -= frame_size;
+ }
+ }
+ break;
+ } while(1);
+ size_t allocated = MINIMP3_MAX_SAMPLES_PER_FRAME*sizeof(mp3d_sample_t);
+ if (detected_samples)
+ allocated += detected_samples*sizeof(mp3d_sample_t);
+ else
+ allocated += (buf_size/frame_info.frame_bytes)*samples*sizeof(mp3d_sample_t);
+ info->buffer = (mp3d_sample_t*)malloc(allocated);
+ if (!info->buffer)
+ return MP3D_E_MEMORY;
+ /* save info */
+ info->channels = frame_info.channels;
+ info->hz = frame_info.hz;
+ info->layer = frame_info.layer;
+ /* decode all frames */
+ size_t avg_bitrate_kbps = 0, frames = 0;
+ do
+ {
+ if ((allocated - info->samples*sizeof(mp3d_sample_t)) < MINIMP3_MAX_SAMPLES_PER_FRAME*sizeof(mp3d_sample_t))
+ {
+ allocated *= 2;
+ mp3d_sample_t *alloc_buf = (mp3d_sample_t*)realloc(info->buffer, allocated);
+ if (!alloc_buf)
+ return MP3D_E_MEMORY;
+ info->buffer = alloc_buf;
+ }
+ if (io)
+ {
+ if (!eof && filled - consumed < MINIMP3_BUF_SIZE)
+ { /* keep minimum 10 consecutive mp3 frames (~16KB) worst case */
+ memmove(buf, buf + consumed, filled - consumed);
+ filled -= consumed;
+ consumed = 0;
+ size_t readed = io->read(buf + filled, buf_size - filled, io->read_data);
+ if (readed != (buf_size - filled))
+ eof = 1;
+ filled += readed;
+ if (eof)
+ mp3dec_skip_id3v1(buf, &filled);
+ }
+ samples = mp3dec_decode_frame(dec, buf + consumed, filled - consumed, info->buffer + info->samples, &frame_info);
+ consumed += frame_info.frame_bytes;
+ } else
+ {
+ samples = mp3dec_decode_frame(dec, buf, MINIMP3_MIN(buf_size, (size_t)INT_MAX), info->buffer + info->samples, &frame_info);
+ buf += frame_info.frame_bytes;
+ buf_size -= frame_info.frame_bytes;
+ }
+ if (samples)
+ {
+ if (info->hz != frame_info.hz || info->layer != frame_info.layer)
+ {
+ ret = MP3D_E_DECODE;
+ break;
+ }
+ if (info->channels && info->channels != frame_info.channels)
+ {
+#ifdef MINIMP3_ALLOW_MONO_STEREO_TRANSITION
+ info->channels = 0; /* mark file with mono-stereo transition */
+#else
+ ret = MP3D_E_DECODE;
+ break;
+#endif
+ }
+ samples *= frame_info.channels;
+ if (to_skip)
+ {
+ size_t skip = MINIMP3_MIN(samples, to_skip);
+ to_skip -= skip;
+ samples -= skip;
+ memmove(info->buffer, info->buffer + skip, samples*sizeof(mp3d_sample_t));
+ }
+ info->samples += samples;
+ avg_bitrate_kbps += frame_info.bitrate_kbps;
+ frames++;
+ if (progress_cb)
+ {
+ ret = progress_cb(user_data, orig_buf_size, orig_buf_size - buf_size, &frame_info);
+ if (ret)
+ break;
+ }
+ }
+ } while (frame_info.frame_bytes);
+ if (detected_samples && info->samples > detected_samples)
+ info->samples = detected_samples; /* cut padding */
+ /* reallocate to normal buffer size */
+ if (allocated != info->samples*sizeof(mp3d_sample_t))
+ {
+ mp3d_sample_t *alloc_buf = (mp3d_sample_t*)realloc(info->buffer, info->samples*sizeof(mp3d_sample_t));
+ if (!alloc_buf && info->samples)
+ return MP3D_E_MEMORY;
+ info->buffer = alloc_buf;
+ }
+ if (frames)
+ info->avg_bitrate_kbps = avg_bitrate_kbps/frames;
+ return ret;
+}
+
+int mp3dec_iterate_buf(const uint8_t *buf, size_t buf_size, MP3D_ITERATE_CB callback, void *user_data)
+{
+ const uint8_t *orig_buf = buf;
+ if (!buf || (size_t)-1 == buf_size || !callback)
+ return MP3D_E_PARAM;
+ /* skip id3 */
+ mp3dec_skip_id3(&buf, &buf_size);
+ if (!buf_size)
+ return 0;
+ mp3dec_frame_info_t frame_info;
+ memset(&frame_info, 0, sizeof(frame_info));
+ do
+ {
+ int free_format_bytes = 0, frame_size = 0, ret;
+ int i = mp3d_find_frame(buf, buf_size, &free_format_bytes, &frame_size);
+ buf += i;
+ buf_size -= i;
+ if (i && !frame_size)
+ continue;
+ if (!frame_size)
+ break;
+ const uint8_t *hdr = buf;
+ frame_info.channels = HDR_IS_MONO(hdr) ? 1 : 2;
+ frame_info.hz = hdr_sample_rate_hz(hdr);
+ frame_info.layer = 4 - HDR_GET_LAYER(hdr);
+ frame_info.bitrate_kbps = hdr_bitrate_kbps(hdr);
+ frame_info.frame_bytes = frame_size;
+
+ if (callback)
+ {
+ if ((ret = callback(user_data, hdr, frame_size, free_format_bytes, buf_size, hdr - orig_buf, &frame_info)))
+ return ret;
+ }
+ buf += frame_size;
+ buf_size -= frame_size;
+ } while (1);
+ return 0;
+}
+
+int mp3dec_iterate_cb(mp3dec_io_t *io, uint8_t *buf, size_t buf_size, MP3D_ITERATE_CB callback, void *user_data)
+{
+ if (!io || !buf || (size_t)-1 == buf_size || buf_size < MINIMP3_BUF_SIZE || !callback)
+ return MP3D_E_PARAM;
+ size_t filled = io->read(buf, MINIMP3_ID3_DETECT_SIZE, io->read_data), consumed = 0;
+ uint64_t readed = 0;
+ mp3dec_frame_info_t frame_info;
+ int eof = 0;
+ memset(&frame_info, 0, sizeof(frame_info));
+ if (filled > MINIMP3_ID3_DETECT_SIZE)
+ return MP3D_E_IOERROR;
+ if (MINIMP3_ID3_DETECT_SIZE != filled)
+ return 0;
+ size_t id3v2size = mp3dec_skip_id3v2(buf, filled);
+ if (id3v2size)
+ {
+ if (io->seek(id3v2size, io->seek_data))
+ return MP3D_E_IOERROR;
+ filled = io->read(buf, buf_size, io->read_data);
+ if (filled > buf_size)
+ return MP3D_E_IOERROR;
+ readed += id3v2size;
+ } else
+ {
+ size_t readed = io->read(buf + MINIMP3_ID3_DETECT_SIZE, buf_size - MINIMP3_ID3_DETECT_SIZE, io->read_data);
+ if (readed > (buf_size - MINIMP3_ID3_DETECT_SIZE))
+ return MP3D_E_IOERROR;
+ filled += readed;
+ }
+ if (filled < MINIMP3_BUF_SIZE)
+ mp3dec_skip_id3v1(buf, &filled);
+ do
+ {
+ int free_format_bytes = 0, frame_size = 0, ret;
+ int i = mp3d_find_frame(buf + consumed, filled - consumed, &free_format_bytes, &frame_size);
+ if (i && !frame_size)
+ {
+ consumed += i;
+ continue;
+ }
+ if (!frame_size)
+ break;
+ const uint8_t *hdr = buf + consumed + i;
+ frame_info.channels = HDR_IS_MONO(hdr) ? 1 : 2;
+ frame_info.hz = hdr_sample_rate_hz(hdr);
+ frame_info.layer = 4 - HDR_GET_LAYER(hdr);
+ frame_info.bitrate_kbps = hdr_bitrate_kbps(hdr);
+ frame_info.frame_bytes = frame_size;
+
+ readed += i;
+ if (callback)
+ {
+ if ((ret = callback(user_data, hdr, frame_size, free_format_bytes, filled - consumed, readed, &frame_info)))
+ return ret;
+ }
+ readed += frame_size;
+ consumed += i + frame_size;
+ if (!eof && filled - consumed < MINIMP3_BUF_SIZE)
+ { /* keep minimum 10 consecutive mp3 frames (~16KB) worst case */
+ memmove(buf, buf + consumed, filled - consumed);
+ filled -= consumed;
+ consumed = 0;
+ size_t readed = io->read(buf + filled, buf_size - filled, io->read_data);
+ if (readed > (buf_size - filled))
+ return MP3D_E_IOERROR;
+ if (readed != (buf_size - filled))
+ eof = 1;
+ filled += readed;
+ if (eof)
+ mp3dec_skip_id3v1(buf, &filled);
+ }
+ } while (1);
+ return 0;
+}
+
+static int mp3dec_load_index(void *user_data, const uint8_t *frame, int frame_size, int free_format_bytes, size_t buf_size, uint64_t offset, mp3dec_frame_info_t *info)
+{
+ mp3dec_frame_t *idx_frame;
+ mp3dec_ex_t *dec = (mp3dec_ex_t *)user_data;
+ if (!dec->index.frames && !dec->start_offset)
+ { /* detect VBR tag and try to avoid full scan */
+ uint32_t frames;
+ int delay, padding;
+ dec->info = *info;
+ dec->start_offset = dec->offset = offset;
+ dec->end_offset = offset + buf_size;
+ dec->free_format_bytes = free_format_bytes; /* should not change */
+ if (3 == dec->info.layer)
+ {
+ int ret = mp3dec_check_vbrtag(frame, frame_size, &frames, &delay, &padding);
+ if (ret)
+ dec->start_offset = dec->offset = offset + frame_size;
+ if (ret > 0)
+ {
+ padding *= info->channels;
+ dec->start_delay = dec->to_skip = delay*info->channels;
+ dec->samples = hdr_frame_samples(frame)*info->channels*(uint64_t)frames;
+ if (dec->samples >= (uint64_t)dec->start_delay)
+ dec->samples -= dec->start_delay;
+ if (padding > 0 && dec->samples >= (uint64_t)padding)
+ dec->samples -= padding;
+ dec->detected_samples = dec->samples;
+ dec->vbr_tag_found = 1;
+ return MP3D_E_USER;
+ } else if (ret < 0)
+ return 0;
+ }
+ }
+ if (dec->flags & MP3D_DO_NOT_SCAN)
+ return MP3D_E_USER;
+ if (dec->index.num_frames + 1 > dec->index.capacity)
+ {
+ if (!dec->index.capacity)
+ dec->index.capacity = 4096;
+ else
+ dec->index.capacity *= 2;
+ mp3dec_frame_t *alloc_buf = (mp3dec_frame_t *)realloc((void*)dec->index.frames, sizeof(mp3dec_frame_t)*dec->index.capacity);
+ if (!alloc_buf)
+ return MP3D_E_MEMORY;
+ dec->index.frames = alloc_buf;
+ }
+ idx_frame = &dec->index.frames[dec->index.num_frames++];
+ idx_frame->offset = offset;
+ idx_frame->sample = dec->samples;
+ if (!dec->buffer_samples && dec->index.num_frames < 256)
+ { /* for some cutted mp3 frames, bit-reservoir not filled and decoding can't be started from first frames */
+ /* try to decode up to 255 first frames till samples starts to decode */
+ dec->buffer_samples = mp3dec_decode_frame(&dec->mp3d, frame, MINIMP3_MIN(buf_size, (size_t)INT_MAX), dec->buffer, info);
+ dec->samples += dec->buffer_samples*info->channels;
+ } else
+ dec->samples += hdr_frame_samples(frame)*info->channels;
+ return 0;
+}
+
+int mp3dec_ex_open_buf(mp3dec_ex_t *dec, const uint8_t *buf, size_t buf_size, int flags)
+{
+ if (!dec || !buf || (size_t)-1 == buf_size || (flags & (~MP3D_FLAGS_MASK)))
+ return MP3D_E_PARAM;
+ memset(dec, 0, sizeof(*dec));
+ dec->file.buffer = buf;
+ dec->file.size = buf_size;
+ dec->flags = flags;
+ mp3dec_init(&dec->mp3d);
+ int ret = mp3dec_iterate_buf(dec->file.buffer, dec->file.size, mp3dec_load_index, dec);
+ if (ret && MP3D_E_USER != ret)
+ return ret;
+ mp3dec_init(&dec->mp3d);
+ dec->buffer_samples = 0;
+ dec->indexes_built = !(dec->vbr_tag_found || (flags & MP3D_DO_NOT_SCAN));
+ dec->flags &= (~MP3D_DO_NOT_SCAN);
+ return 0;
+}
+
+#ifndef MINIMP3_SEEK_IDX_LINEAR_SEARCH
+static size_t mp3dec_idx_binary_search(mp3dec_index_t *idx, uint64_t position)
+{
+ size_t end = idx->num_frames, start = 0, index = 0;
+ while (start <= end)
+ {
+ size_t mid = (start + end) / 2;
+ if (idx->frames[mid].sample >= position)
+ { /* move left side. */
+ if (idx->frames[mid].sample == position)
+ return mid;
+ end = mid - 1;
+ } else
+ { /* move to right side */
+ index = mid;
+ start = mid + 1;
+ if (start == idx->num_frames)
+ break;
+ }
+ }
+ return index;
+}
+#endif
+
+int mp3dec_ex_seek(mp3dec_ex_t *dec, uint64_t position)
+{
+ size_t i;
+ if (!dec)
+ return MP3D_E_PARAM;
+ if (!(dec->flags & MP3D_SEEK_TO_SAMPLE))
+ {
+ if (dec->io)
+ {
+ dec->offset = position;
+ } else
+ {
+ dec->offset = MINIMP3_MIN(position, dec->file.size);
+ }
+ dec->cur_sample = 0;
+ goto do_exit;
+ }
+ dec->cur_sample = position;
+ position += dec->start_delay;
+ if (0 == position)
+ { /* optimize seek to zero, no index needed */
+seek_zero:
+ dec->offset = dec->start_offset;
+ dec->to_skip = 0;
+ goto do_exit;
+ }
+ if (!dec->indexes_built)
+ { /* no index created yet (vbr tag used to calculate track length or MP3D_DO_NOT_SCAN open flag used) */
+ dec->indexes_built = 1;
+ dec->samples = 0;
+ dec->buffer_samples = 0;
+ if (dec->io)
+ {
+ if (dec->io->seek(dec->start_offset, dec->io->seek_data))
+ return MP3D_E_IOERROR;
+ int ret = mp3dec_iterate_cb(dec->io, (uint8_t *)dec->file.buffer, dec->file.size, mp3dec_load_index, dec);
+ if (ret && MP3D_E_USER != ret)
+ return ret;
+ } else
+ {
+ int ret = mp3dec_iterate_buf(dec->file.buffer + dec->start_offset, dec->file.size - dec->start_offset, mp3dec_load_index, dec);
+ if (ret && MP3D_E_USER != ret)
+ return ret;
+ }
+ for (i = 0; i < dec->index.num_frames; i++)
+ dec->index.frames[i].offset += dec->start_offset;
+ dec->samples = dec->detected_samples;
+ }
+ if (!dec->index.frames)
+ goto seek_zero; /* no frames in file - seek to zero */
+#ifdef MINIMP3_SEEK_IDX_LINEAR_SEARCH
+ for (i = 0; i < dec->index.num_frames; i++)
+ {
+ if (dec->index.frames[i].sample >= position)
+ break;
+ }
+#else
+ i = mp3dec_idx_binary_search(&dec->index, position);
+#endif
+ if (i)
+ {
+ int to_fill_bytes = 511;
+ int skip_frames = MINIMP3_PREDECODE_FRAMES
+#ifdef MINIMP3_SEEK_IDX_LINEAR_SEARCH
+ + ((dec->index.frames[i].sample == position) ? 0 : 1)
+#endif
+ ;
+ i -= MINIMP3_MIN(i, (size_t)skip_frames);
+ if (3 == dec->info.layer)
+ {
+ while (i && to_fill_bytes)
+ { /* make sure bit-reservoir is filled when we start decoding */
+ bs_t bs[1];
+ L3_gr_info_t gr_info[4];
+ int frame_bytes, frame_size;
+ const uint8_t *hdr;
+ if (dec->io)
+ {
+ hdr = dec->file.buffer;
+ if (dec->io->seek(dec->index.frames[i - 1].offset, dec->io->seek_data))
+ return MP3D_E_IOERROR;
+ size_t readed = dec->io->read((uint8_t *)hdr, HDR_SIZE, dec->io->read_data);
+ if (readed != HDR_SIZE)
+ return MP3D_E_IOERROR;
+ frame_size = hdr_frame_bytes(hdr, dec->free_format_bytes) + hdr_padding(hdr);
+ readed = dec->io->read((uint8_t *)hdr + HDR_SIZE, frame_size - HDR_SIZE, dec->io->read_data);
+ if (readed != (size_t)(frame_size - HDR_SIZE))
+ return MP3D_E_IOERROR;
+ bs_init(bs, hdr + HDR_SIZE, frame_size - HDR_SIZE);
+ } else
+ {
+ hdr = dec->file.buffer + dec->index.frames[i - 1].offset;
+ frame_size = hdr_frame_bytes(hdr, dec->free_format_bytes) + hdr_padding(hdr);
+ bs_init(bs, hdr + HDR_SIZE, frame_size - HDR_SIZE);
+ }
+ if (HDR_IS_CRC(hdr))
+ get_bits(bs, 16);
+ i--;
+ if (L3_read_side_info(bs, gr_info, hdr) < 0)
+ break; /* frame not decodable, we can start from here */
+ frame_bytes = (bs->limit - bs->pos)/8;
+ to_fill_bytes -= MINIMP3_MIN(to_fill_bytes, frame_bytes);
+ }
+ }
+ }
+ dec->offset = dec->index.frames[i].offset;
+ dec->to_skip = position - dec->index.frames[i].sample;
+ while ((i + 1) < dec->index.num_frames && !dec->index.frames[i].sample && !dec->index.frames[i + 1].sample)
+ { /* skip not decodable first frames */
+ const uint8_t *hdr;
+ if (dec->io)
+ {
+ hdr = dec->file.buffer;
+ if (dec->io->seek(dec->index.frames[i].offset, dec->io->seek_data))
+ return MP3D_E_IOERROR;
+ size_t readed = dec->io->read((uint8_t *)hdr, HDR_SIZE, dec->io->read_data);
+ if (readed != HDR_SIZE)
+ return MP3D_E_IOERROR;
+ } else
+ hdr = dec->file.buffer + dec->index.frames[i].offset;
+ dec->to_skip += hdr_frame_samples(hdr)*dec->info.channels;
+ i++;
+ }
+do_exit:
+ if (dec->io)
+ {
+ if (dec->io->seek(dec->offset, dec->io->seek_data))
+ return MP3D_E_IOERROR;
+ }
+ dec->buffer_samples = 0;
+ dec->buffer_consumed = 0;
+ dec->input_consumed = 0;
+ dec->input_filled = 0;
+ dec->last_error = 0;
+ mp3dec_init(&dec->mp3d);
+ return 0;
+}
+
+size_t mp3dec_ex_read_frame(mp3dec_ex_t *dec, mp3d_sample_t **buf, mp3dec_frame_info_t *frame_info, size_t max_samples)
+{
+ if (!dec || !buf || !frame_info)
+ {
+ if (dec)
+ dec->last_error = MP3D_E_PARAM;
+ return 0;
+ }
+ if (dec->detected_samples && dec->cur_sample >= dec->detected_samples)
+ return 0; /* at end of stream */
+ if (dec->last_error)
+ return 0; /* error eof state, seek can reset it */
+ *buf = NULL;
+ uint64_t end_offset = dec->end_offset ? dec->end_offset : dec->file.size;
+ int eof = 0;
+ while (dec->buffer_consumed == dec->buffer_samples)
+ {
+ const uint8_t *dec_buf;
+ if (dec->io)
+ {
+ if (!eof && (dec->input_filled - dec->input_consumed) < MINIMP3_BUF_SIZE)
+ { /* keep minimum 10 consecutive mp3 frames (~16KB) worst case */
+ memmove((uint8_t*)dec->file.buffer, (uint8_t*)dec->file.buffer + dec->input_consumed, dec->input_filled - dec->input_consumed);
+ dec->input_filled -= dec->input_consumed;
+ dec->input_consumed = 0;
+ size_t readed = dec->io->read((uint8_t*)dec->file.buffer + dec->input_filled, dec->file.size - dec->input_filled, dec->io->read_data);
+ if (readed > (dec->file.size - dec->input_filled))
+ {
+ dec->last_error = MP3D_E_IOERROR;
+ readed = 0;
+ }
+ if (readed != (dec->file.size - dec->input_filled))
+ eof = 1;
+ dec->input_filled += readed;
+ if (eof)
+ mp3dec_skip_id3v1((uint8_t*)dec->file.buffer, &dec->input_filled);
+ }
+ dec_buf = dec->file.buffer + dec->input_consumed;
+ if (!(dec->input_filled - dec->input_consumed))
+ return 0;
+ dec->buffer_samples = mp3dec_decode_frame(&dec->mp3d, dec_buf, dec->input_filled - dec->input_consumed, dec->buffer, frame_info);
+ dec->input_consumed += frame_info->frame_bytes;
+ } else
+ {
+ dec_buf = dec->file.buffer + dec->offset;
+ uint64_t buf_size = end_offset - dec->offset;
+ if (!buf_size)
+ return 0;
+ dec->buffer_samples = mp3dec_decode_frame(&dec->mp3d, dec_buf, MINIMP3_MIN(buf_size, (uint64_t)INT_MAX), dec->buffer, frame_info);
+ }
+ dec->buffer_consumed = 0;
+ if (dec->info.hz != frame_info->hz || dec->info.layer != frame_info->layer)
+ {
+return_e_decode:
+ dec->last_error = MP3D_E_DECODE;
+ return 0;
+ }
+ if (dec->buffer_samples)
+ {
+ dec->buffer_samples *= frame_info->channels;
+ if (dec->to_skip)
+ {
+ size_t skip = MINIMP3_MIN(dec->buffer_samples, dec->to_skip);
+ dec->buffer_consumed += skip;
+ dec->to_skip -= skip;
+ }
+ if (
+#ifdef MINIMP3_ALLOW_MONO_STEREO_TRANSITION
+ !(dec->flags & MP3D_ALLOW_MONO_STEREO_TRANSITION) &&
+#endif
+ dec->buffer_consumed != dec->buffer_samples && dec->info.channels != frame_info->channels)
+ {
+ goto return_e_decode;
+ }
+ } else if (dec->to_skip)
+ { /* In mp3 decoding not always can start decode from any frame because of bit reservoir,
+ count skip samples for such frames */
+ int frame_samples = hdr_frame_samples(dec_buf)*frame_info->channels;
+ dec->to_skip -= MINIMP3_MIN(frame_samples, dec->to_skip);
+ }
+ dec->offset += frame_info->frame_bytes;
+ }
+ size_t out_samples = MINIMP3_MIN((size_t)(dec->buffer_samples - dec->buffer_consumed), max_samples);
+ if (dec->detected_samples)
+ { /* count decoded samples to properly cut padding */
+ if (dec->cur_sample + out_samples >= dec->detected_samples)
+ out_samples = dec->detected_samples - dec->cur_sample;
+ }
+ dec->cur_sample += out_samples;
+ *buf = dec->buffer + dec->buffer_consumed;
+ dec->buffer_consumed += out_samples;
+ return out_samples;
+}
+
+size_t mp3dec_ex_read(mp3dec_ex_t *dec, mp3d_sample_t *buf, size_t samples)
+{
+ if (!dec || !buf)
+ {
+ if (dec)
+ dec->last_error = MP3D_E_PARAM;
+ return 0;
+ }
+ mp3dec_frame_info_t frame_info;
+ memset(&frame_info, 0, sizeof(frame_info));
+ size_t samples_requested = samples;
+ while (samples)
+ {
+ mp3d_sample_t *buf_frame = NULL;
+ size_t read_samples = mp3dec_ex_read_frame(dec, &buf_frame, &frame_info, samples);
+ if (!read_samples)
+ {
+ break;
+ }
+ memcpy(buf, buf_frame, read_samples * sizeof(mp3d_sample_t));
+ buf += read_samples;
+ samples -= read_samples;
+ }
+ return samples_requested - samples;
+}
+
+int mp3dec_ex_open_cb(mp3dec_ex_t *dec, mp3dec_io_t *io, int flags)
+{
+ if (!dec || !io || (flags & (~MP3D_FLAGS_MASK)))
+ return MP3D_E_PARAM;
+ memset(dec, 0, sizeof(*dec));
+#ifdef MINIMP3_HAVE_RING
+ int ret;
+ if (ret = mp3dec_open_ring(&dec->file, MINIMP3_IO_SIZE))
+ return ret;
+#else
+ dec->file.size = MINIMP3_IO_SIZE;
+ dec->file.buffer = (const uint8_t*)malloc(dec->file.size);
+ if (!dec->file.buffer)
+ return MP3D_E_MEMORY;
+#endif
+ dec->flags = flags;
+ dec->io = io;
+ mp3dec_init(&dec->mp3d);
+ if (io->seek(0, io->seek_data))
+ return MP3D_E_IOERROR;
+ int ret = mp3dec_iterate_cb(io, (uint8_t *)dec->file.buffer, dec->file.size, mp3dec_load_index, dec);
+ if (ret && MP3D_E_USER != ret)
+ return ret;
+ if (dec->io->seek(dec->start_offset, dec->io->seek_data))
+ return MP3D_E_IOERROR;
+ mp3dec_init(&dec->mp3d);
+ dec->buffer_samples = 0;
+ dec->indexes_built = !(dec->vbr_tag_found || (flags & MP3D_DO_NOT_SCAN));
+ dec->flags &= (~MP3D_DO_NOT_SCAN);
+ return 0;
+}
+
+
+#ifndef MINIMP3_NO_STDIO
+
+#if defined(__linux__) || defined(__FreeBSD__)
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#if !defined(_GNU_SOURCE)
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#endif
+#if !defined(MAP_POPULATE) && defined(__linux__)
+#define MAP_POPULATE 0x08000
+#elif !defined(MAP_POPULATE)
+#define MAP_POPULATE 0
+#endif
+
+static void mp3dec_close_file(mp3dec_map_info_t *map_info)
+{
+ if (map_info->buffer && MAP_FAILED != map_info->buffer)
+ munmap((void *)map_info->buffer, map_info->size);
+ map_info->buffer = 0;
+ map_info->size = 0;
+}
+
+static int mp3dec_open_file(const char *file_name, mp3dec_map_info_t *map_info)
+{
+ if (!file_name)
+ return MP3D_E_PARAM;
+ int file;
+ struct stat st;
+ memset(map_info, 0, sizeof(*map_info));
+retry_open:
+ file = open(file_name, O_RDONLY);
+ if (file < 0 && (errno == EAGAIN || errno == EINTR))
+ goto retry_open;
+ if (file < 0 || fstat(file, &st) < 0)
+ {
+ close(file);
+ return MP3D_E_IOERROR;
+ }
+
+ map_info->size = st.st_size;
+retry_mmap:
+ map_info->buffer = (const uint8_t *)mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE | MAP_POPULATE, file, 0);
+ if (MAP_FAILED == map_info->buffer && (errno == EAGAIN || errno == EINTR))
+ goto retry_mmap;
+ close(file);
+ if (MAP_FAILED == map_info->buffer)
+ return MP3D_E_IOERROR;
+ return 0;
+}
+
+#if MINIMP3_ENABLE_RING && defined(__linux__) && defined(_GNU_SOURCE)
+#define MINIMP3_HAVE_RING
+static void mp3dec_close_ring(mp3dec_map_info_t *map_info)
+{
+#if defined(__linux__) && defined(_GNU_SOURCE)
+ if (map_info->buffer && MAP_FAILED != map_info->buffer)
+ munmap((void *)map_info->buffer, map_info->size*2);
+#else
+ if (map_info->buffer)
+ {
+ shmdt(map_info->buffer);
+ shmdt(map_info->buffer + map_info->size);
+ }
+#endif
+ map_info->buffer = 0;
+ map_info->size = 0;
+}
+
+static int mp3dec_open_ring(mp3dec_map_info_t *map_info, size_t size)
+{
+ int memfd, page_size;
+#if defined(__linux__) && defined(_GNU_SOURCE)
+ void *buffer;
+ int res;
+#endif
+ memset(map_info, 0, sizeof(*map_info));
+
+#ifdef _SC_PAGESIZE
+ page_size = sysconf(_SC_PAGESIZE);
+#else
+ page_size = getpagesize();
+#endif
+ map_info->size = (size + page_size - 1)/page_size*page_size;
+
+#if defined(__linux__) && defined(_GNU_SOURCE)
+ memfd = memfd_create("mp3_ring", 0);
+ if (memfd < 0)
+ return MP3D_E_MEMORY;
+
+retry_ftruncate:
+ res = ftruncate(memfd, map_info->size);
+ if (res && (errno == EAGAIN || errno == EINTR))
+ goto retry_ftruncate;
+ if (res)
+ goto error;
+
+retry_mmap:
+ map_info->buffer = (const uint8_t *)mmap(NULL, map_info->size*2, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+ if (MAP_FAILED == map_info->buffer && (errno == EAGAIN || errno == EINTR))
+ goto retry_mmap;
+ if (MAP_FAILED == map_info->buffer || !map_info->buffer)
+ goto error;
+retry_mmap2:
+ buffer = mmap((void *)map_info->buffer, map_info->size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, memfd, 0);
+ if (MAP_FAILED == map_info->buffer && (errno == EAGAIN || errno == EINTR))
+ goto retry_mmap2;
+ if (MAP_FAILED == map_info->buffer || buffer != (void *)map_info->buffer)
+ goto error;
+retry_mmap3:
+ buffer = mmap((void *)map_info->buffer + map_info->size, map_info->size, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, memfd, 0);
+ if (MAP_FAILED == map_info->buffer && (errno == EAGAIN || errno == EINTR))
+ goto retry_mmap3;
+ if (MAP_FAILED == map_info->buffer || buffer != (void *)(map_info->buffer + map_info->size))
+ goto error;
+
+ close(memfd);
+ return 0;
+error:
+ close(memfd);
+ mp3dec_close_ring(map_info);
+ return MP3D_E_MEMORY;
+#else
+ memfd = shmget(IPC_PRIVATE, map_info->size, IPC_CREAT | 0700);
+ if (memfd < 0)
+ return MP3D_E_MEMORY;
+retry_mmap:
+ map_info->buffer = (const uint8_t *)mmap(NULL, map_info->size*2, PROT_NONE, MAP_PRIVATE, -1, 0);
+ if (MAP_FAILED == map_info->buffer && (errno == EAGAIN || errno == EINTR))
+ goto retry_mmap;
+ if (MAP_FAILED == map_info->buffer)
+ goto error;
+ if (map_info->buffer != shmat(memfd, map_info->buffer, 0))
+ goto error;
+ if ((map_info->buffer + map_info->size) != shmat(memfd, map_info->buffer + map_info->size, 0))
+ goto error;
+ if (shmctl(memfd, IPC_RMID, NULL) < 0)
+ return MP3D_E_MEMORY;
+ return 0;
+error:
+ shmctl(memfd, IPC_RMID, NULL);
+ mp3dec_close_ring(map_info);
+ return MP3D_E_MEMORY;
+#endif
+}
+#endif /*MINIMP3_ENABLE_RING*/
+#elif defined(_WIN32)
+#include <windows.h>
+
+static void mp3dec_close_file(mp3dec_map_info_t *map_info)
+{
+ if (map_info->buffer)
+ UnmapViewOfFile(map_info->buffer);
+ map_info->buffer = 0;
+ map_info->size = 0;
+}
+
+static int mp3dec_open_file_h(HANDLE file, mp3dec_map_info_t *map_info)
+{
+ memset(map_info, 0, sizeof(*map_info));
+
+ HANDLE mapping = NULL;
+ LARGE_INTEGER s;
+ s.LowPart = GetFileSize(file, (DWORD*)&s.HighPart);
+ if (s.LowPart == INVALID_FILE_SIZE && GetLastError() != NO_ERROR)
+ goto error;
+ map_info->size = s.QuadPart;
+
+ mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
+ if (!mapping)
+ goto error;
+ map_info->buffer = (const uint8_t*)MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, s.QuadPart);
+ CloseHandle(mapping);
+ if (!map_info->buffer)
+ goto error;
+
+ CloseHandle(file);
+ return 0;
+error:
+ mp3dec_close_file(map_info);
+ CloseHandle(file);
+ return MP3D_E_IOERROR;
+}
+
+static int mp3dec_open_file(const char *file_name, mp3dec_map_info_t *map_info)
+{
+ if (!file_name)
+ return MP3D_E_PARAM;
+ HANDLE file = CreateFileA(file_name, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
+ if (INVALID_HANDLE_VALUE == file)
+ return MP3D_E_IOERROR;
+ return mp3dec_open_file_h(file, map_info);
+}
+
+static int mp3dec_open_file_w(const wchar_t *file_name, mp3dec_map_info_t *map_info)
+{
+ if (!file_name)
+ return MP3D_E_PARAM;
+ HANDLE file = CreateFileW(file_name, GENERIC_READ, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
+ if (INVALID_HANDLE_VALUE == file)
+ return MP3D_E_IOERROR;
+ return mp3dec_open_file_h(file, map_info);
+}
+#else
+#include <stdio.h>
+
+static void mp3dec_close_file(mp3dec_map_info_t *map_info)
+{
+ if (map_info->buffer)
+ free((void *)map_info->buffer);
+ map_info->buffer = 0;
+ map_info->size = 0;
+}
+
+static int mp3dec_open_file(const char *file_name, mp3dec_map_info_t *map_info)
+{
+ if (!file_name)
+ return MP3D_E_PARAM;
+ memset(map_info, 0, sizeof(*map_info));
+ FILE *file = fopen(file_name, "rb");
+ if (!file)
+ return MP3D_E_IOERROR;
+ int res = MP3D_E_IOERROR;
+ long size = -1;
+ if (fseek(file, 0, SEEK_END))
+ goto error;
+ size = ftell(file);
+ if (size < 0)
+ goto error;
+ map_info->size = (size_t)size;
+ if (fseek(file, 0, SEEK_SET))
+ goto error;
+ map_info->buffer = (uint8_t *)malloc(map_info->size);
+ if (!map_info->buffer)
+ {
+ res = MP3D_E_MEMORY;
+ goto error;
+ }
+ if (fread((void *)map_info->buffer, 1, map_info->size, file) != map_info->size)
+ goto error;
+ fclose(file);
+ return 0;
+error:
+ mp3dec_close_file(map_info);
+ fclose(file);
+ return res;
+}
+#endif
+
+static int mp3dec_detect_mapinfo(mp3dec_map_info_t *map_info)
+{
+ int ret = mp3dec_detect_buf(map_info->buffer, map_info->size);
+ mp3dec_close_file(map_info);
+ return ret;
+}
+
+static int mp3dec_load_mapinfo(mp3dec_t *dec, mp3dec_map_info_t *map_info, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{
+ int ret = mp3dec_load_buf(dec, map_info->buffer, map_info->size, info, progress_cb, user_data);
+ mp3dec_close_file(map_info);
+ return ret;
+}
+
+static int mp3dec_iterate_mapinfo(mp3dec_map_info_t *map_info, MP3D_ITERATE_CB callback, void *user_data)
+{
+ int ret = mp3dec_iterate_buf(map_info->buffer, map_info->size, callback, user_data);
+ mp3dec_close_file(map_info);
+ return ret;
+}
+
+static int mp3dec_ex_open_mapinfo(mp3dec_ex_t *dec, int flags)
+{
+ int ret = mp3dec_ex_open_buf(dec, dec->file.buffer, dec->file.size, flags);
+ dec->is_file = 1;
+ if (ret)
+ mp3dec_ex_close(dec);
+ return ret;
+}
+
+int mp3dec_detect(const char *file_name)
+{
+ int ret;
+ mp3dec_map_info_t map_info;
+ if ((ret = mp3dec_open_file(file_name, &map_info)))
+ return ret;
+ return mp3dec_detect_mapinfo(&map_info);
+}
+
+int mp3dec_load(mp3dec_t *dec, const char *file_name, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{
+ int ret;
+ mp3dec_map_info_t map_info;
+ if ((ret = mp3dec_open_file(file_name, &map_info)))
+ return ret;
+ return mp3dec_load_mapinfo(dec, &map_info, info, progress_cb, user_data);
+}
+
+int mp3dec_iterate(const char *file_name, MP3D_ITERATE_CB callback, void *user_data)
+{
+ int ret;
+ mp3dec_map_info_t map_info;
+ if ((ret = mp3dec_open_file(file_name, &map_info)))
+ return ret;
+ return mp3dec_iterate_mapinfo(&map_info, callback, user_data);
+}
+
+int mp3dec_ex_open(mp3dec_ex_t *dec, const char *file_name, int flags)
+{
+ int ret;
+ if (!dec)
+ return MP3D_E_PARAM;
+ if ((ret = mp3dec_open_file(file_name, &dec->file)))
+ return ret;
+ return mp3dec_ex_open_mapinfo(dec, flags);
+}
+
+void mp3dec_ex_close(mp3dec_ex_t *dec)
+{
+#ifdef MINIMP3_HAVE_RING
+ if (dec->io)
+ mp3dec_close_ring(&dec->file);
+#else
+ if (dec->io && dec->file.buffer)
+ free((void*)dec->file.buffer);
+#endif
+ if (dec->is_file)
+ mp3dec_close_file(&dec->file);
+ if (dec->index.frames)
+ free(dec->index.frames);
+ memset(dec, 0, sizeof(*dec));
+}
+
+#ifdef _WIN32
+int mp3dec_detect_w(const wchar_t *file_name)
+{
+ int ret;
+ mp3dec_map_info_t map_info;
+ if ((ret = mp3dec_open_file_w(file_name, &map_info)))
+ return ret;
+ return mp3dec_detect_mapinfo(&map_info);
+}
+
+int mp3dec_load_w(mp3dec_t *dec, const wchar_t *file_name, mp3dec_file_info_t *info, MP3D_PROGRESS_CB progress_cb, void *user_data)
+{
+ int ret;
+ mp3dec_map_info_t map_info;
+ if ((ret = mp3dec_open_file_w(file_name, &map_info)))
+ return ret;
+ return mp3dec_load_mapinfo(dec, &map_info, info, progress_cb, user_data);
+}
+
+int mp3dec_iterate_w(const wchar_t *file_name, MP3D_ITERATE_CB callback, void *user_data)
+{
+ int ret;
+ mp3dec_map_info_t map_info;
+ if ((ret = mp3dec_open_file_w(file_name, &map_info)))
+ return ret;
+ return mp3dec_iterate_mapinfo(&map_info, callback, user_data);
+}
+
+int mp3dec_ex_open_w(mp3dec_ex_t *dec, const wchar_t *file_name, int flags)
+{
+ int ret;
+ if ((ret = mp3dec_open_file_w(file_name, &dec->file)))
+ return ret;
+ return mp3dec_ex_open_mapinfo(dec, flags);
+}
+#endif
+#else /* MINIMP3_NO_STDIO */
+void mp3dec_ex_close(mp3dec_ex_t *dec)
+{
+#ifdef MINIMP3_HAVE_RING
+ if (dec->io)
+ mp3dec_close_ring(&dec->file);
+#else
+ if (dec->io && dec->file.buffer)
+ free((void*)dec->file.buffer);
+#endif
+ if (dec->index.frames)
+ free(dec->index.frames);
+ memset(dec, 0, sizeof(*dec));
+}
+#endif
+
+#endif /*MINIMP3_IMPLEMENTATION*/
diff --git a/thirdparty/misc/easing_equations.cpp b/thirdparty/misc/easing_equations.cpp
index bc84564b19..af48aaf079 100644
--- a/thirdparty/misc/easing_equations.cpp
+++ b/thirdparty/misc/easing_equations.cpp
@@ -188,7 +188,8 @@ static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
///////////////////////////////////////////////////////////////////////////
namespace cubic {
static real_t in(real_t t, real_t b, real_t c, real_t d) {
- return c * (t /= d) * t * t + b;
+ t /= d;
+ return c * t * t * t + b;
}
static real_t out(real_t t, real_t b, real_t c, real_t d) {
@@ -197,8 +198,10 @@ static real_t out(real_t t, real_t b, real_t c, real_t d) {
}
static real_t in_out(real_t t, real_t b, real_t c, real_t d) {
- if ((t /= d / 2) < 1) return c / 2 * t * t * t + b;
- return c / 2 * ((t -= 2) * t * t + 2) + b;
+ t /= d / 2;
+ if (t < 1) return c / 2 * t * t * t + b;
+ t -= 2;
+ return c / 2 * (t * t * t + 2) + b;
}
static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
@@ -210,16 +213,22 @@ static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
///////////////////////////////////////////////////////////////////////////
namespace circ {
static real_t in(real_t t, real_t b, real_t c, real_t d) {
- return -c * (sqrt(1 - (t /= d) * t) - 1) + b; // TODO: ehrich: operation with t is undefined
+ t /= d;
+ return -c * (sqrt(1 - t * t) - 1) + b;
}
static real_t out(real_t t, real_t b, real_t c, real_t d) {
- return c * sqrt(1 - (t = t / d - 1) * t) + b; // TODO: ehrich: operation with t is undefined
+ t = t / d - 1;
+ return c * sqrt(1 - t * t) + b;
}
static real_t in_out(real_t t, real_t b, real_t c, real_t d) {
- if ((t /= d / 2) < 1) return -c / 2 * (sqrt(1 - t * t) - 1) + b;
- return c / 2 * (sqrt(1 - t * (t -= 2)) + 1) + b; // TODO: ehrich: operation with t is undefined
+ t /= d / 2;
+ if (t < 1) {
+ return -c / 2 * (sqrt(1 - t * t) - 1) + b;
+ }
+ t -= 2;
+ return c / 2 * (sqrt(1 - t * t) + 1) + b;
}
static real_t out_in(real_t t, real_t b, real_t c, real_t d) {
@@ -271,14 +280,16 @@ static real_t in(real_t t, real_t b, real_t c, real_t d) {
static real_t out(real_t t, real_t b, real_t c, real_t d) {
float s = 1.70158f;
- return c * ((t = t / d - 1) * t * ((s + 1) * t + s) + 1) + b; // TODO: ehrich: operation with t is undefined
+ t = t / d - 1;
+ return c * (t * t * ((s + 1) * t + s) + 1) + b;
}
static real_t in_out(real_t t, real_t b, real_t c, real_t d) {
- float s = 1.70158f;
- if ((t /= d / 2) < 1) return c / 2 * (t * t * (((s *= (1.525f)) + 1) * t - s)) + b; // TODO: ehrich: operation with s is undefined
- float postFix = t -= 2;
- return c / 2 * ((postFix)*t * (((s *= (1.525f)) + 1) * t + s) + 2) + b; // TODO: ehrich: operation with s is undefined
+ float s = 1.70158f * 1.525f;
+ t /= d / 2;
+ if (t < 1) return c / 2 * (t * t * ((s + 1) * t - s)) + b;
+ t -= 2;
+ return c / 2 * (t * t * ((s + 1) * t + s) + 2) + b;
}
static real_t out_in(real_t t, real_t b, real_t c, real_t d) {