diff options
Diffstat (limited to 'thirdparty/meshoptimizer')
18 files changed, 7908 insertions, 0 deletions
diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md new file mode 100644 index 0000000000..4fcd766d22 --- /dev/null +++ b/thirdparty/meshoptimizer/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2016-2020 Arseny Kapoulkine + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/thirdparty/meshoptimizer/allocator.cpp b/thirdparty/meshoptimizer/allocator.cpp new file mode 100644 index 0000000000..da7cc540b2 --- /dev/null +++ b/thirdparty/meshoptimizer/allocator.cpp @@ -0,0 +1,8 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*)) +{ + meshopt_Allocator::Storage::allocate = allocate; + meshopt_Allocator::Storage::deallocate = deallocate; +} diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp new file mode 100644 index 0000000000..f7d88c5136 --- /dev/null +++ b/thirdparty/meshoptimizer/clusterizer.cpp @@ -0,0 +1,351 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <math.h> +#include <string.h> + +// This work is based on: +// Graham Wihlidal. Optimizing the Graphics Pipeline with Compute. 2016 +// Matthaeus Chajdas. GeometryFX 1.2 - Cluster Culling. 2016 +// Jack Ritter. An Efficient Bounding Sphere. 1990 +namespace meshopt +{ + +static void computeBoundingSphere(float result[4], const float points[][3], size_t count) +{ + assert(count > 0); + + // find extremum points along all 3 axes; for each axis we get a pair of points with min/max coordinates + size_t pmin[3] = {0, 0, 0}; + size_t pmax[3] = {0, 0, 0}; + + for (size_t i = 0; i < count; ++i) + { + const float* p = points[i]; + + for (int axis = 0; axis < 3; ++axis) + { + pmin[axis] = (p[axis] < points[pmin[axis]][axis]) ? i : pmin[axis]; + pmax[axis] = (p[axis] > points[pmax[axis]][axis]) ? 
i : pmax[axis]; + } + } + + // find the pair of points with largest distance + float paxisd2 = 0; + int paxis = 0; + + for (int axis = 0; axis < 3; ++axis) + { + const float* p1 = points[pmin[axis]]; + const float* p2 = points[pmax[axis]]; + + float d2 = (p2[0] - p1[0]) * (p2[0] - p1[0]) + (p2[1] - p1[1]) * (p2[1] - p1[1]) + (p2[2] - p1[2]) * (p2[2] - p1[2]); + + if (d2 > paxisd2) + { + paxisd2 = d2; + paxis = axis; + } + } + + // use the longest segment as the initial sphere diameter + const float* p1 = points[pmin[paxis]]; + const float* p2 = points[pmax[paxis]]; + + float center[3] = {(p1[0] + p2[0]) / 2, (p1[1] + p2[1]) / 2, (p1[2] + p2[2]) / 2}; + float radius = sqrtf(paxisd2) / 2; + + // iteratively adjust the sphere up until all points fit + for (size_t i = 0; i < count; ++i) + { + const float* p = points[i]; + float d2 = (p[0] - center[0]) * (p[0] - center[0]) + (p[1] - center[1]) * (p[1] - center[1]) + (p[2] - center[2]) * (p[2] - center[2]); + + if (d2 > radius * radius) + { + float d = sqrtf(d2); + assert(d > 0); + + float k = 0.5f + (radius / d) / 2; + + center[0] = center[0] * k + p[0] * (1 - k); + center[1] = center[1] * k + p[1] * (1 - k); + center[2] = center[2] * k + p[2] * (1 - k); + radius = (radius + d) / 2; + } + } + + result[0] = center[0]; + result[1] = center[1]; + result[2] = center[2]; + result[3] = radius; +} + +} // namespace meshopt + +size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles) +{ + assert(index_count % 3 == 0); + assert(max_vertices >= 3); + assert(max_triangles >= 1); + + // meshlet construction is limited by max vertices and max triangles per meshlet + // the worst case is that the input is an unindexed stream since this equally stresses both limits + // note that we assume that in the worst case, we leave 2 vertices unpacked in each meshlet - if we have space for 3 we can pack any triangle + size_t max_vertices_conservative = max_vertices - 2; + size_t meshlet_limit_vertices = 
(index_count + max_vertices_conservative - 1) / max_vertices_conservative; + size_t meshlet_limit_triangles = (index_count / 3 + max_triangles - 1) / max_triangles; + + return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles; +} + +size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) +{ + assert(index_count % 3 == 0); + assert(max_vertices >= 3); + assert(max_triangles >= 1); + + meshopt_Allocator allocator; + + meshopt_Meshlet meshlet; + memset(&meshlet, 0, sizeof(meshlet)); + + assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0])); + assert(max_triangles <= sizeof(meshlet.indices) / 3); + + // index of the vertex in the meshlet, 0xff if the vertex isn't used + unsigned char* used = allocator.allocate<unsigned char>(vertex_count); + memset(used, -1, vertex_count); + + size_t offset = 0; + + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + unsigned char& av = used[a]; + unsigned char& bv = used[b]; + unsigned char& cv = used[c]; + + unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); + + if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles) + { + destination[offset++] = meshlet; + + for (size_t j = 0; j < meshlet.vertex_count; ++j) + used[meshlet.vertices[j]] = 0xff; + + memset(&meshlet, 0, sizeof(meshlet)); + } + + if (av == 0xff) + { + av = meshlet.vertex_count; + meshlet.vertices[meshlet.vertex_count++] = a; + } + + if (bv == 0xff) + { + bv = meshlet.vertex_count; + meshlet.vertices[meshlet.vertex_count++] = b; + } + + if (cv == 0xff) + { + cv = meshlet.vertex_count; + meshlet.vertices[meshlet.vertex_count++] = c; + } + + meshlet.indices[meshlet.triangle_count][0] = 
av; + meshlet.indices[meshlet.triangle_count][1] = bv; + meshlet.indices[meshlet.triangle_count][2] = cv; + meshlet.triangle_count++; + } + + if (meshlet.triangle_count) + destination[offset++] = meshlet; + + assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles)); + + return offset; +} + +meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + assert(index_count / 3 <= 256); + + (void)vertex_count; + + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + // compute triangle normals and gather triangle corners + float normals[256][3]; + float corners[256][3][3]; + size_t triangles = 0; + + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + const float* p0 = vertex_positions + vertex_stride_float * a; + const float* p1 = vertex_positions + vertex_stride_float * b; + const float* p2 = vertex_positions + vertex_stride_float * c; + + float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]}; + float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]}; + + float normalx = p10[1] * p20[2] - p10[2] * p20[1]; + float normaly = p10[2] * p20[0] - p10[0] * p20[2]; + float normalz = p10[0] * p20[1] - p10[1] * p20[0]; + + float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); + + // no need to include degenerate triangles - they will be invisible anyway + if (area == 0.f) + continue; + + // record triangle normals & corners for future use; normal and corner 0 define a plane equation + normals[triangles][0] = normalx / area; + 
normals[triangles][1] = normaly / area; + normals[triangles][2] = normalz / area; + memcpy(corners[triangles][0], p0, 3 * sizeof(float)); + memcpy(corners[triangles][1], p1, 3 * sizeof(float)); + memcpy(corners[triangles][2], p2, 3 * sizeof(float)); + triangles++; + } + + meshopt_Bounds bounds = {}; + + // degenerate cluster, no valid triangles => trivial reject (cone data is 0) + if (triangles == 0) + return bounds; + + // compute cluster bounding sphere; we'll use the center to determine normal cone apex as well + float psphere[4] = {}; + computeBoundingSphere(psphere, corners[0], triangles * 3); + + float center[3] = {psphere[0], psphere[1], psphere[2]}; + + // treating triangle normals as points, find the bounding sphere - the sphere center determines the optimal cone axis + float nsphere[4] = {}; + computeBoundingSphere(nsphere, normals, triangles); + + float axis[3] = {nsphere[0], nsphere[1], nsphere[2]}; + float axislength = sqrtf(axis[0] * axis[0] + axis[1] * axis[1] + axis[2] * axis[2]); + float invaxislength = axislength == 0.f ? 0.f : 1.f / axislength; + + axis[0] *= invaxislength; + axis[1] *= invaxislength; + axis[2] *= invaxislength; + + // compute a tight cone around all normals, mindp = cos(angle/2) + float mindp = 1.f; + + for (size_t i = 0; i < triangles; ++i) + { + float dp = normals[i][0] * axis[0] + normals[i][1] * axis[1] + normals[i][2] * axis[2]; + + mindp = (dp < mindp) ? 
dp : mindp; + } + + // fill bounding sphere info; note that below we can return bounds without cone information for degenerate cones + bounds.center[0] = center[0]; + bounds.center[1] = center[1]; + bounds.center[2] = center[2]; + bounds.radius = psphere[3]; + + // degenerate cluster, normal cone is larger than a hemisphere => trivial accept + // note that if mindp is positive but close to 0, the triangle intersection code below gets less stable + // we arbitrarily decide that if a normal cone is ~168 degrees wide or more, the cone isn't useful + if (mindp <= 0.1f) + { + bounds.cone_cutoff = 1; + bounds.cone_cutoff_s8 = 127; + return bounds; + } + + float maxt = 0; + + // we need to find the point on center-t*axis ray that lies in negative half-space of all triangles + for (size_t i = 0; i < triangles; ++i) + { + // dot(center-t*axis-corner, trinormal) = 0 + // dot(center-corner, trinormal) - t * dot(axis, trinormal) = 0 + float cx = center[0] - corners[i][0][0]; + float cy = center[1] - corners[i][0][1]; + float cz = center[2] - corners[i][0][2]; + + float dc = cx * normals[i][0] + cy * normals[i][1] + cz * normals[i][2]; + float dn = axis[0] * normals[i][0] + axis[1] * normals[i][1] + axis[2] * normals[i][2]; + + // dn should be larger than mindp cutoff above + assert(dn > 0.f); + float t = dc / dn; + + maxt = (t > maxt) ? 
t : maxt; + } + + // cone apex should be in the negative half-space of all cluster triangles by construction + bounds.cone_apex[0] = center[0] - axis[0] * maxt; + bounds.cone_apex[1] = center[1] - axis[1] * maxt; + bounds.cone_apex[2] = center[2] - axis[2] * maxt; + + // note: this axis is the axis of the normal cone, but our test for perspective camera effectively negates the axis + bounds.cone_axis[0] = axis[0]; + bounds.cone_axis[1] = axis[1]; + bounds.cone_axis[2] = axis[2]; + + // cos(a) for normal cone is mindp; we need to add 90 degrees on both sides and invert the cone + // which gives us -cos(a+90) = -(-sin(a)) = sin(a) = sqrt(1 - cos^2(a)) + bounds.cone_cutoff = sqrtf(1 - mindp * mindp); + + // quantize axis & cutoff to 8-bit SNORM format + bounds.cone_axis_s8[0] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[0], 8)); + bounds.cone_axis_s8[1] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[1], 8)); + bounds.cone_axis_s8[2] = (signed char)(meshopt_quantizeSnorm(bounds.cone_axis[2], 8)); + + // for the 8-bit test to be conservative, we need to adjust the cutoff by measuring the max. error + float cone_axis_s8_e0 = fabsf(bounds.cone_axis_s8[0] / 127.f - bounds.cone_axis[0]); + float cone_axis_s8_e1 = fabsf(bounds.cone_axis_s8[1] / 127.f - bounds.cone_axis[1]); + float cone_axis_s8_e2 = fabsf(bounds.cone_axis_s8[2] / 127.f - bounds.cone_axis[2]); + + // note that we need to round this up instead of rounding to nearest, hence +1 + int cone_cutoff_s8 = int(127 * (bounds.cone_cutoff + cone_axis_s8_e0 + cone_axis_s8_e1 + cone_axis_s8_e2) + 1); + + bounds.cone_cutoff_s8 = (cone_cutoff_s8 > 127) ? 
127 : (signed char)(cone_cutoff_s8); + + return bounds; +} + +meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])]; + + for (size_t i = 0; i < meshlet->triangle_count; ++i) + { + unsigned int a = meshlet->vertices[meshlet->indices[i][0]]; + unsigned int b = meshlet->vertices[meshlet->indices[i][1]]; + unsigned int c = meshlet->vertices[meshlet->indices[i][2]]; + + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + indices[i * 3 + 0] = a; + indices[i * 3 + 1] = b; + indices[i * 3 + 2] = c; + } + + return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride); +} diff --git a/thirdparty/meshoptimizer/indexcodec.cpp b/thirdparty/meshoptimizer/indexcodec.cpp new file mode 100644 index 0000000000..eeb541e5be --- /dev/null +++ b/thirdparty/meshoptimizer/indexcodec.cpp @@ -0,0 +1,752 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <string.h> + +#ifndef TRACE +#define TRACE 0 +#endif + +#if TRACE +#include <stdio.h> +#endif + +// This work is based on: +// Fabian Giesen. Simple lossless index buffer compression & follow-up. 2013 +// Conor Stokes. Vertex Cache Optimised Index Buffer Compression. 
2014 +namespace meshopt +{ + +const unsigned char kIndexHeader = 0xe0; +const unsigned char kSequenceHeader = 0xd0; + +static int gEncodeIndexVersion = 0; + +typedef unsigned int VertexFifo[16]; +typedef unsigned int EdgeFifo[16][2]; + +static const unsigned int kTriangleIndexOrder[3][3] = { + {0, 1, 2}, + {1, 2, 0}, + {2, 0, 1}, +}; + +static const unsigned char kCodeAuxEncodingTable[16] = { + 0x00, 0x76, 0x87, 0x56, 0x67, 0x78, 0xa9, 0x86, 0x65, 0x89, 0x68, 0x98, 0x01, 0x69, + 0, 0, // last two entries aren't used for encoding +}; + +static int rotateTriangle(unsigned int a, unsigned int b, unsigned int c, unsigned int next) +{ + (void)a; + + return (b == next) ? 1 : (c == next) ? 2 : 0; +} + +static int getEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, unsigned int c, size_t offset) +{ + for (int i = 0; i < 16; ++i) + { + size_t index = (offset - 1 - i) & 15; + + unsigned int e0 = fifo[index][0]; + unsigned int e1 = fifo[index][1]; + + if (e0 == a && e1 == b) + return (i << 2) | 0; + if (e0 == b && e1 == c) + return (i << 2) | 1; + if (e0 == c && e1 == a) + return (i << 2) | 2; + } + + return -1; +} + +static void pushEdgeFifo(EdgeFifo fifo, unsigned int a, unsigned int b, size_t& offset) +{ + fifo[offset][0] = a; + fifo[offset][1] = b; + offset = (offset + 1) & 15; +} + +static int getVertexFifo(VertexFifo fifo, unsigned int v, size_t offset) +{ + for (int i = 0; i < 16; ++i) + { + size_t index = (offset - 1 - i) & 15; + + if (fifo[index] == v) + return i; + } + + return -1; +} + +static void pushVertexFifo(VertexFifo fifo, unsigned int v, size_t& offset, int cond = 1) +{ + fifo[offset] = v; + offset = (offset + cond) & 15; +} + +static void encodeVByte(unsigned char*& data, unsigned int v) +{ + // encode 32-bit value in up to 5 7-bit groups + do + { + *data++ = (v & 127) | (v > 127 ? 
128 : 0); + v >>= 7; + } while (v); +} + +static unsigned int decodeVByte(const unsigned char*& data) +{ + unsigned char lead = *data++; + + // fast path: single byte + if (lead < 128) + return lead; + + // slow path: up to 4 extra bytes + // note that this loop always terminates, which is important for malformed data + unsigned int result = lead & 127; + unsigned int shift = 7; + + for (int i = 0; i < 4; ++i) + { + unsigned char group = *data++; + result |= (group & 127) << shift; + shift += 7; + + if (group < 128) + break; + } + + return result; +} + +static void encodeIndex(unsigned char*& data, unsigned int index, unsigned int last) +{ + unsigned int d = index - last; + unsigned int v = (d << 1) ^ (int(d) >> 31); + + encodeVByte(data, v); +} + +static unsigned int decodeIndex(const unsigned char*& data, unsigned int last) +{ + unsigned int v = decodeVByte(data); + unsigned int d = (v >> 1) ^ -int(v & 1); + + return last + d; +} + +static int getCodeAuxIndex(unsigned char v, const unsigned char* table) +{ + for (int i = 0; i < 16; ++i) + if (table[i] == v) + return i; + + return -1; +} + +static void writeTriangle(void* destination, size_t offset, size_t index_size, unsigned int a, unsigned int b, unsigned int c) +{ + if (index_size == 2) + { + static_cast<unsigned short*>(destination)[offset + 0] = (unsigned short)(a); + static_cast<unsigned short*>(destination)[offset + 1] = (unsigned short)(b); + static_cast<unsigned short*>(destination)[offset + 2] = (unsigned short)(c); + } + else + { + static_cast<unsigned int*>(destination)[offset + 0] = a; + static_cast<unsigned int*>(destination)[offset + 1] = b; + static_cast<unsigned int*>(destination)[offset + 2] = c; + } +} + +#if TRACE +static size_t sortTop16(unsigned char dest[16], size_t stats[256]) +{ + size_t destsize = 0; + + for (size_t i = 0; i < 256; ++i) + { + size_t j = 0; + for (; j < destsize; ++j) + { + if (stats[i] >= stats[dest[j]]) + { + if (destsize < 16) + destsize++; + + memmove(&dest[j + 1], 
&dest[j], destsize - 1 - j); + dest[j] = (unsigned char)i; + break; + } + } + + if (j == destsize && destsize < 16) + { + dest[destsize] = (unsigned char)i; + destsize++; + } + } + + return destsize; +} +#endif + +} // namespace meshopt + +size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + +#if TRACE + size_t codestats[256] = {}; + size_t codeauxstats[256] = {}; +#endif + + // the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table + if (buffer_size < 1 + index_count / 3 + 16) + return 0; + + int version = gEncodeIndexVersion; + + buffer[0] = (unsigned char)(kIndexHeader | version); + + EdgeFifo edgefifo; + memset(edgefifo, -1, sizeof(edgefifo)); + + VertexFifo vertexfifo; + memset(vertexfifo, -1, sizeof(vertexfifo)); + + size_t edgefifooffset = 0; + size_t vertexfifooffset = 0; + + unsigned int next = 0; + unsigned int last = 0; + + unsigned char* code = buffer + 1; + unsigned char* data = code + index_count / 3; + unsigned char* data_safe_end = buffer + buffer_size - 16; + + int fecmax = version >= 1 ? 
13 : 15; + + // use static encoding table; it's possible to pack the result and then build an optimal table and repack + // for now we keep it simple and use the table that has been generated based on symbol frequency on a training mesh set + const unsigned char* codeaux_table = kCodeAuxEncodingTable; + + for (size_t i = 0; i < index_count; i += 3) + { + // make sure we have enough space to write a triangle + // each triangle writes at most 16 bytes: 1b for codeaux and 5b for each free index + // after this we can be sure we can write without extra bounds checks + if (data > data_safe_end) + return 0; + + int fer = getEdgeFifo(edgefifo, indices[i + 0], indices[i + 1], indices[i + 2], edgefifooffset); + + if (fer >= 0 && (fer >> 2) < 15) + { + const unsigned int* order = kTriangleIndexOrder[fer & 3]; + + unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]]; + + // encode edge index and vertex fifo index, next or free index + int fe = fer >> 2; + int fc = getVertexFifo(vertexfifo, c, vertexfifooffset); + + int fec = (fc >= 1 && fc < fecmax) ? fc : (c == next) ? 
(next++, 0) : 15; + + if (fec == 15 && version >= 1) + { + // encode last-1 and last+1 to optimize strip-like sequences + if (c + 1 == last) + fec = 13, last = c; + if (c == last + 1) + fec = 14, last = c; + } + + *code++ = (unsigned char)((fe << 4) | fec); + +#if TRACE + codestats[code[-1]]++; +#endif + + // note that we need to update the last index since free indices are delta-encoded + if (fec == 15) + encodeIndex(data, c, last), last = c; + + // we only need to push third vertex since first two are likely already in the vertex fifo + if (fec == 0 || fec >= fecmax) + pushVertexFifo(vertexfifo, c, vertexfifooffset); + + // we only need to push two new edges to edge fifo since the third one is already there + pushEdgeFifo(edgefifo, c, b, edgefifooffset); + pushEdgeFifo(edgefifo, a, c, edgefifooffset); + } + else + { + int rotation = rotateTriangle(indices[i + 0], indices[i + 1], indices[i + 2], next); + const unsigned int* order = kTriangleIndexOrder[rotation]; + + unsigned int a = indices[i + order[0]], b = indices[i + order[1]], c = indices[i + order[2]]; + + // if a/b/c are 0/1/2, we emit a reset code + bool reset = false; + + if (a == 0 && b == 1 && c == 2 && next > 0 && version >= 1) + { + reset = true; + next = 0; + + // reset vertex fifo to make sure we don't accidentally reference vertices from that in the future + // this makes sure next continues to get incremented instead of being stuck + memset(vertexfifo, -1, sizeof(vertexfifo)); + } + + int fb = getVertexFifo(vertexfifo, b, vertexfifooffset); + int fc = getVertexFifo(vertexfifo, c, vertexfifooffset); + + // after rotation, a is almost always equal to next, so we don't waste bits on FIFO encoding for a + int fea = (a == next) ? (next++, 0) : 15; + int feb = (fb >= 0 && fb < 14) ? (fb + 1) : (b == next) ? (next++, 0) : 15; + int fec = (fc >= 0 && fc < 14) ? (fc + 1) : (c == next) ? 
(next++, 0) : 15; + + // we encode feb & fec in 4 bits using a table if possible, and as a full byte otherwise + unsigned char codeaux = (unsigned char)((feb << 4) | fec); + int codeauxindex = getCodeAuxIndex(codeaux, codeaux_table); + + // <14 encodes an index into codeaux table, 14 encodes fea=0, 15 encodes fea=15 + if (fea == 0 && codeauxindex >= 0 && codeauxindex < 14 && !reset) + { + *code++ = (unsigned char)((15 << 4) | codeauxindex); + } + else + { + *code++ = (unsigned char)((15 << 4) | 14 | fea); + *data++ = codeaux; + } + +#if TRACE + codestats[code[-1]]++; + codeauxstats[codeaux]++; +#endif + + // note that we need to update the last index since free indices are delta-encoded + if (fea == 15) + encodeIndex(data, a, last), last = a; + + if (feb == 15) + encodeIndex(data, b, last), last = b; + + if (fec == 15) + encodeIndex(data, c, last), last = c; + + // only push vertices that weren't already in fifo + if (fea == 0 || fea == 15) + pushVertexFifo(vertexfifo, a, vertexfifooffset); + + if (feb == 0 || feb == 15) + pushVertexFifo(vertexfifo, b, vertexfifooffset); + + if (fec == 0 || fec == 15) + pushVertexFifo(vertexfifo, c, vertexfifooffset); + + // all three edges aren't in the fifo; pushing all of them is important so that we can match them for later triangles + pushEdgeFifo(edgefifo, b, a, edgefifooffset); + pushEdgeFifo(edgefifo, c, b, edgefifooffset); + pushEdgeFifo(edgefifo, a, c, edgefifooffset); + } + } + + // make sure we have enough space to write codeaux table + if (data > data_safe_end) + return 0; + + // add codeaux encoding table to the end of the stream; this is used for decoding codeaux *and* as padding + // we need padding for decoding to be able to assume that each triangle is encoded as <= 16 bytes of extra data + // this is enough space for aux byte + 5 bytes per varint index which is the absolute worst case for any input + for (size_t i = 0; i < 16; ++i) + { + // decoder assumes that table entries never refer to separately encoded 
indices + assert((codeaux_table[i] & 0xf) != 0xf && (codeaux_table[i] >> 4) != 0xf); + + *data++ = codeaux_table[i]; + } + + // since we encode restarts as codeaux without a table reference, we need to make sure 00 is encoded as a table reference + assert(codeaux_table[0] == 0); + + assert(data >= buffer + index_count / 3 + 16); + assert(data <= buffer + buffer_size); + +#if TRACE + unsigned char codetop[16], codeauxtop[16]; + size_t codetopsize = sortTop16(codetop, codestats); + size_t codeauxtopsize = sortTop16(codeauxtop, codeauxstats); + + size_t sumcode = 0, sumcodeaux = 0; + for (size_t i = 0; i < 256; ++i) + sumcode += codestats[i], sumcodeaux += codeauxstats[i]; + + size_t acccode = 0, acccodeaux = 0; + + printf("code\t\t\t\t\tcodeaux\n"); + + for (size_t i = 0; i < codetopsize && i < codeauxtopsize; ++i) + { + acccode += codestats[codetop[i]]; + acccodeaux += codeauxstats[codeauxtop[i]]; + + printf("%2d: %02x = %d (%.1f%% ..%.1f%%)\t\t%2d: %02x = %d (%.1f%% ..%.1f%%)\n", + int(i), codetop[i], int(codestats[codetop[i]]), double(codestats[codetop[i]]) / double(sumcode) * 100, double(acccode) / double(sumcode) * 100, + int(i), codeauxtop[i], int(codeauxstats[codeauxtop[i]]), double(codeauxstats[codeauxtop[i]]) / double(sumcodeaux) * 100, double(acccodeaux) / double(sumcodeaux) * 100); + } +#endif + + return data - buffer; +} + +size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count) +{ + assert(index_count % 3 == 0); + + // compute number of bits required for each index + unsigned int vertex_bits = 1; + + while (vertex_bits < 32 && vertex_count > size_t(1) << vertex_bits) + vertex_bits++; + + // worst-case encoding is 2 header bytes + 3 varint-7 encoded index deltas + unsigned int vertex_groups = (vertex_bits + 1 + 6) / 7; + + return 1 + (index_count / 3) * (2 + 3 * vertex_groups) + 16; +} + +void meshopt_encodeIndexVersion(int version) +{ + assert(unsigned(version) <= 1); + + meshopt::gEncodeIndexVersion = version; +} + +int 
meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(index_size == 2 || index_size == 4); + + // the minimum valid encoding is header, 1 byte per triangle and a 16-byte codeaux table + if (buffer_size < 1 + index_count / 3 + 16) + return -2; + + if ((buffer[0] & 0xf0) != kIndexHeader) + return -1; + + int version = buffer[0] & 0x0f; + if (version > 1) + return -1; + + EdgeFifo edgefifo; + memset(edgefifo, -1, sizeof(edgefifo)); + + VertexFifo vertexfifo; + memset(vertexfifo, -1, sizeof(vertexfifo)); + + size_t edgefifooffset = 0; + size_t vertexfifooffset = 0; + + unsigned int next = 0; + unsigned int last = 0; + + int fecmax = version >= 1 ? 13 : 15; + + // since we store 16-byte codeaux table at the end, triangle data has to begin before data_safe_end + const unsigned char* code = buffer + 1; + const unsigned char* data = code + index_count / 3; + const unsigned char* data_safe_end = buffer + buffer_size - 16; + + const unsigned char* codeaux_table = data_safe_end; + + for (size_t i = 0; i < index_count; i += 3) + { + // make sure we have enough data to read for a triangle + // each triangle reads at most 16 bytes of data: 1b for codeaux and 5b for each free index + // after this we can be sure we can read without extra bounds checks + if (data > data_safe_end) + return -2; + + unsigned char codetri = *code++; + + if (codetri < 0xf0) + { + int fe = codetri >> 4; + + // fifo reads are wrapped around 16 entry buffer + unsigned int a = edgefifo[(edgefifooffset - 1 - fe) & 15][0]; + unsigned int b = edgefifo[(edgefifooffset - 1 - fe) & 15][1]; + + int fec = codetri & 15; + + // note: this is the most common path in the entire decoder + // inside this if we try to stay branchless (by using cmov/etc.) 
since these aren't predictable + if (fec < fecmax) + { + // fifo reads are wrapped around 16 entry buffer + unsigned int cf = vertexfifo[(vertexfifooffset - 1 - fec) & 15]; + unsigned int c = (fec == 0) ? next : cf; + + int fec0 = fec == 0; + next += fec0; + + // output triangle + writeTriangle(destination, i, index_size, a, b, c); + + // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly + pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0); + + pushEdgeFifo(edgefifo, c, b, edgefifooffset); + pushEdgeFifo(edgefifo, a, c, edgefifooffset); + } + else + { + unsigned int c = 0; + + // fec - (fec ^ 3) decodes 13, 14 into -1, 1 + // note that we need to update the last index since free indices are delta-encoded + last = c = (fec != 15) ? last + (fec - (fec ^ 3)) : decodeIndex(data, last); + + // output triangle + writeTriangle(destination, i, index_size, a, b, c); + + // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly + pushVertexFifo(vertexfifo, c, vertexfifooffset); + + pushEdgeFifo(edgefifo, c, b, edgefifooffset); + pushEdgeFifo(edgefifo, a, c, edgefifooffset); + } + } + else + { + // fast path: read codeaux from the table + if (codetri < 0xfe) + { + unsigned char codeaux = codeaux_table[codetri & 15]; + + // note: table can't contain feb/fec=15 + int feb = codeaux >> 4; + int fec = codeaux & 15; + + // fifo reads are wrapped around 16 entry buffer + // also note that we increment next for all three vertices before decoding indices - this matches encoder behavior + unsigned int a = next++; + + unsigned int bf = vertexfifo[(vertexfifooffset - feb) & 15]; + unsigned int b = (feb == 0) ? next : bf; + + int feb0 = feb == 0; + next += feb0; + + unsigned int cf = vertexfifo[(vertexfifooffset - fec) & 15]; + unsigned int c = (fec == 0) ? 
next : cf; + + int fec0 = fec == 0; + next += fec0; + + // output triangle + writeTriangle(destination, i, index_size, a, b, c); + + // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly + pushVertexFifo(vertexfifo, a, vertexfifooffset); + pushVertexFifo(vertexfifo, b, vertexfifooffset, feb0); + pushVertexFifo(vertexfifo, c, vertexfifooffset, fec0); + + pushEdgeFifo(edgefifo, b, a, edgefifooffset); + pushEdgeFifo(edgefifo, c, b, edgefifooffset); + pushEdgeFifo(edgefifo, a, c, edgefifooffset); + } + else + { + // slow path: read a full byte for codeaux instead of using a table lookup + unsigned char codeaux = *data++; + + int fea = codetri == 0xfe ? 0 : 15; + int feb = codeaux >> 4; + int fec = codeaux & 15; + + // reset: codeaux is 0 but encoded as not-a-table + if (codeaux == 0) + next = 0; + + // fifo reads are wrapped around 16 entry buffer + // also note that we increment next for all three vertices before decoding indices - this matches encoder behavior + unsigned int a = (fea == 0) ? next++ : 0; + unsigned int b = (feb == 0) ? next++ : vertexfifo[(vertexfifooffset - feb) & 15]; + unsigned int c = (fec == 0) ? 
next++ : vertexfifo[(vertexfifooffset - fec) & 15]; + + // note that we need to update the last index since free indices are delta-encoded + if (fea == 15) + last = a = decodeIndex(data, last); + + if (feb == 15) + last = b = decodeIndex(data, last); + + if (fec == 15) + last = c = decodeIndex(data, last); + + // output triangle + writeTriangle(destination, i, index_size, a, b, c); + + // push vertex/edge fifo must match the encoding step *exactly* otherwise the data will not be decoded correctly + pushVertexFifo(vertexfifo, a, vertexfifooffset); + pushVertexFifo(vertexfifo, b, vertexfifooffset, (feb == 0) | (feb == 15)); + pushVertexFifo(vertexfifo, c, vertexfifooffset, (fec == 0) | (fec == 15)); + + pushEdgeFifo(edgefifo, b, a, edgefifooffset); + pushEdgeFifo(edgefifo, c, b, edgefifooffset); + pushEdgeFifo(edgefifo, a, c, edgefifooffset); + } + } + } + + // we should've read all data bytes and stopped at the boundary between data and codeaux table + if (data != data_safe_end) + return -3; + + return 0; +} + +size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count) +{ + using namespace meshopt; + + // the minimum valid encoding is header, 1 byte per index and a 4-byte tail + if (buffer_size < 1 + index_count + 4) + return 0; + + int version = gEncodeIndexVersion; + + buffer[0] = (unsigned char)(kSequenceHeader | version); + + unsigned int last[2] = {}; + unsigned int current = 0; + + unsigned char* data = buffer + 1; + unsigned char* data_safe_end = buffer + buffer_size - 4; + + for (size_t i = 0; i < index_count; ++i) + { + // make sure we have enough data to write + // each index writes at most 5 bytes of data; there's a 4 byte tail after data_safe_end + // after this we can be sure we can write without extra bounds checks + if (data >= data_safe_end) + return 0; + + unsigned int index = indices[i]; + + // this is a heuristic that switches between baselines when the delta grows too large + // 
// Returns the worst-case number of bytes meshopt_encodeIndexSequence can produce for
// index_count indices that reference at most vertex_count vertices.
// The result is 1 byte + (index_count * worst-case bytes per index) + 4 bytes.
size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count)
{
	// smallest bit width in [1, 32] for which vertex_count does not exceed 2^bits
	unsigned int bits = 1;

	for (; bits < 32; ++bits)
	{
		if (vertex_count <= (size_t(1) << bits))
			break;
	}

	// two extra bits ride along with every index value, and the result is emitted
	// 7 bits per byte, so round the total bit count up to whole 7-bit groups
	const unsigned int payload_bits = bits + 1 + 1;
	const unsigned int bytes_per_index = (payload_bits + 6) / 7;

	const size_t leading_bytes = 1;
	const size_t trailing_bytes = 4;

	return leading_bytes + index_count * bytes_per_index + trailing_bytes;
}
// Mixes the bytes at `key` into the running hash `h` with the MurmurHash2 word step
// and returns the updated hash.
//
// h:   hash accumulated so far (0 to start a new hash)
// key: first byte of the data to hash; no alignment is required
// len: number of bytes available at key
//
// Data is consumed one 4-byte word at a time in native byte order. A trailing
// remainder of len % 4 bytes is not mixed in, so inputs that differ only in those
// bytes hash identically and must be told apart by a full comparison.
static unsigned int hashUpdate4(unsigned int h, const unsigned char* key, size_t len)
{
	// MurmurHash2
	const unsigned int m = 0x5bd1e995;
	const int r = 24;

	while (len >= 4)
	{
		// key is a byte pointer with no alignment guarantee; dereferencing it as
		// unsigned int would be a misaligned, aliasing-violating read, so copy the word out
		unsigned int k;
		memcpy(&k, key, sizeof(k));

		k *= m;
		k ^= k >> r;
		k *= m;

		h *= m;
		h ^= k;

		key += 4;
		len -= 4;
	}

	return h;
}
// Open-addressing lookup in a power-of-two sized table.
//
// table/buckets: slot array and its length (must be a non-zero power of two)
// hash:          provides hash(key) for the starting slot and equal(a, b) for matching
// key:           value being searched for
// empty:         sentinel stored in unoccupied slots
//
// Returns the slot that already holds a value equal to key, or the first empty slot
// met along the probe sequence (which the caller may then fill). If every probe hits
// an occupied, non-matching slot the table is full: asserts, then returns 0.
template <typename T, typename Hash>
static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
{
	assert(buckets > 0);
	assert((buckets & (buckets - 1)) == 0);

	const size_t mask = buckets - 1;

	size_t slot = hash.hash(key) & mask;
	size_t step = 0;

	while (step <= mask)
	{
		T* candidate = table + slot;

		// an empty slot ends the probe chain; a matching slot is the hit
		if (*candidate == empty || hash.equal(*candidate, key))
			return candidate;

		// slot is taken by a different key: advance by a growing step (1, 2, 3, ...)
		++step;
		slot = (slot + step) & mask;
	}

	assert(false && "Hash table is full"); // unreachable
	return 0;
}
unsigned char*>(vertices), vertex_size, vertex_size}; + + size_t table_size = hashBuckets(vertex_count); + unsigned int* table = allocator.allocate<unsigned int>(table_size); + memset(table, -1, table_size * sizeof(unsigned int)); + + unsigned int next_vertex = 0; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices ? indices[i] : unsigned(i); + assert(index < vertex_count); + + if (destination[index] == ~0u) + { + unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); + + if (*entry == ~0u) + { + *entry = index; + + destination[index] = next_vertex++; + } + else + { + assert(destination[*entry] != ~0u); + + destination[index] = destination[*entry]; + } + } + } + + assert(next_vertex <= vertex_count); + + return next_vertex; +} + +size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count) +{ + using namespace meshopt; + + assert(indices || index_count == vertex_count); + assert(index_count % 3 == 0); + assert(stream_count > 0 && stream_count <= 16); + + for (size_t i = 0; i < stream_count; ++i) + { + assert(streams[i].size > 0 && streams[i].size <= 256); + assert(streams[i].size <= streams[i].stride); + } + + meshopt_Allocator allocator; + + memset(destination, -1, vertex_count * sizeof(unsigned int)); + + VertexStreamHasher hasher = {streams, stream_count}; + + size_t table_size = hashBuckets(vertex_count); + unsigned int* table = allocator.allocate<unsigned int>(table_size); + memset(table, -1, table_size * sizeof(unsigned int)); + + unsigned int next_vertex = 0; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices ? 
// Writes index_count remapped indices to destination: element i becomes
// remap[indices[i]], or remap[i] when indices is null.
//
// destination: output array with room for index_count elements
// indices:     source indices, or null to use 0, 1, 2, ... as the source
// index_count: number of indices; asserted to be a multiple of 3
// remap:       old-vertex -> new-vertex table; entries that are referenced are
//              asserted not to be ~0u
void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap)
{
	assert(index_count % 3 == 0);

	unsigned int* out = destination;

	for (size_t i = 0; i != index_count; ++i, ++out)
	{
		unsigned int source_vertex;

		if (indices)
			source_vertex = indices[i];
		else
			source_vertex = unsigned(i);

		assert(remap[source_vertex] != ~0u);

		*out = remap[source_vertex];
	}
}
hashBuckets(vertex_count); + unsigned int* table = allocator.allocate<unsigned int>(table_size); + memset(table, -1, table_size * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + if (remap[index] == ~0u) + { + unsigned int* entry = hashLookup(table, table_size, hasher, index, ~0u); + + if (*entry == ~0u) + *entry = index; + + remap[index] = *entry; + } + + destination[i] = remap[index]; + } +} diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h new file mode 100644 index 0000000000..fde00f9c82 --- /dev/null +++ b/thirdparty/meshoptimizer/meshoptimizer.h @@ -0,0 +1,951 @@ +/** + * meshoptimizer - version 0.15 + * + * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com) + * Report bugs and download new versions at https://github.com/zeux/meshoptimizer + * + * This library is distributed under the MIT License. See notice at the end of this file. + */ +#pragma once + +#include <assert.h> +#include <stddef.h> + +/* Version macro; major * 1000 + minor * 10 + patch */ +#define MESHOPTIMIZER_VERSION 150 /* 0.15 */ + +/* If no API is defined, assume default */ +#ifndef MESHOPTIMIZER_API +#define MESHOPTIMIZER_API +#endif + +/* Experimental APIs have unstable interface and might have implementation that's not fully tested or optimized */ +#define MESHOPTIMIZER_EXPERIMENTAL MESHOPTIMIZER_API + +/* C interface */ +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Vertex attribute stream, similar to glVertexPointer + * Each element takes size bytes, with stride controlling the spacing between successive elements. 
+ */ +struct meshopt_Stream +{ + const void* data; + size_t size; + size_t stride; +}; + +/** + * Generates a vertex remap table from the vertex buffer and an optional index buffer and returns number of unique vertices + * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence. + * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer. + * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized. + * + * destination must contain enough space for the resulting remap table (vertex_count elements) + * indices can be NULL if the input is unindexed + */ +MESHOPTIMIZER_API size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size); + +/** + * Generates a vertex remap table from multiple vertex streams and an optional index buffer and returns number of unique vertices + * As a result, all vertices that are binary equivalent map to the same (new) location, with no gaps in the resulting sequence. + * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer/meshopt_remapIndexBuffer. + * To remap vertex buffers, you will need to call meshopt_remapVertexBuffer for each vertex stream. + * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized. 
+ * + * destination must contain enough space for the resulting remap table (vertex_count elements) + * indices can be NULL if the input is unindexed + */ +MESHOPTIMIZER_API size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); + +/** + * Generates vertex buffer from the source vertex buffer and remap table generated by meshopt_generateVertexRemap + * + * destination must contain enough space for the resulting vertex buffer (unique_vertex_count elements, returned by meshopt_generateVertexRemap) + * vertex_count should be the initial vertex count and not the value returned by meshopt_generateVertexRemap + */ +MESHOPTIMIZER_API void meshopt_remapVertexBuffer(void* destination, const void* vertices, size_t vertex_count, size_t vertex_size, const unsigned int* remap); + +/** + * Generate index buffer from the source index buffer and remap table generated by meshopt_generateVertexRemap + * + * destination must contain enough space for the resulting index buffer (index_count elements) + * indices can be NULL if the input is unindexed + */ +MESHOPTIMIZER_API void meshopt_remapIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const unsigned int* remap); + +/** + * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary + * All vertices that are binary equivalent (wrt first vertex_size bytes) map to the first vertex in the original vertex buffer. + * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering. + * Note that binary equivalence considers all vertex_size bytes, including padding which should be zero-initialized. 
+ * + * destination must contain enough space for the resulting index buffer (index_count elements) + */ +MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride); + +/** + * Generate index buffer that can be used for more efficient rendering when only a subset of the vertex attributes is necessary + * All vertices that are binary equivalent (wrt specified streams) map to the first vertex in the original vertex buffer. + * This makes it possible to use the index buffer for Z pre-pass or shadowmap rendering, while using the original index buffer for regular rendering. + * Note that binary equivalence considers all size bytes in each stream, including padding which should be zero-initialized. + * + * destination must contain enough space for the resulting index buffer (index_count elements) + */ +MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count); + +/** + * Vertex transform cache optimizer + * Reorders indices to reduce the number of GPU vertex shader invocations + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. 
+ * + * destination must contain enough space for the resulting index buffer (index_count elements) + */ +MESHOPTIMIZER_API void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count); + +/** + * Vertex transform cache optimizer for strip-like caches + * Produces inferior results to meshopt_optimizeVertexCache from the GPU vertex cache perspective + * However, the resulting index order is more optimal if the goal is to reduce the triangle strip length or improve compression efficiency + * + * destination must contain enough space for the resulting index buffer (index_count elements) + */ +MESHOPTIMIZER_API void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count); + +/** + * Vertex transform cache optimizer for FIFO caches + * Reorders indices to reduce the number of GPU vertex shader invocations + * Generally takes ~3x less time to optimize meshes but produces inferior results compared to meshopt_optimizeVertexCache + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. + * + * destination must contain enough space for the resulting index buffer (index_count elements) + * cache_size should be less than the actual GPU cache size to avoid cache thrashing + */ +MESHOPTIMIZER_API void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size); + +/** + * Overdraw optimizer + * Reorders indices to reduce the number of GPU vertex shader invocations and the pixel overdraw + * If index buffer contains multiple ranges for multiple draw calls, this function needs to be called on each range individually. 
+ * + * destination must contain enough space for the resulting index buffer (index_count elements) + * indices must contain index data that is the result of meshopt_optimizeVertexCache (*not* the original mesh indices!) + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * threshold indicates how much the overdraw optimizer can degrade vertex cache efficiency (1.05 = up to 5%) to reduce overdraw more efficiently + */ +MESHOPTIMIZER_API void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold); + +/** + * Vertex fetch cache optimizer + * Reorders vertices and changes indices to reduce the amount of GPU memory fetches during vertex processing + * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused + * This function works for a single vertex stream; for multiple vertex streams, use meshopt_optimizeVertexFetchRemap + meshopt_remapVertexBuffer for each stream. 
+ * + * destination must contain enough space for the resulting vertex buffer (vertex_count elements) + * indices is used both as an input and as an output index buffer + */ +MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size); + +/** + * Vertex fetch cache optimizer + * Generates vertex remap to reduce the amount of GPU memory fetches during vertex processing + * Returns the number of unique vertices, which is the same as input vertex count unless some vertices are unused + * The resulting remap table should be used to reorder vertex/index buffers using meshopt_remapVertexBuffer/meshopt_remapIndexBuffer + * + * destination must contain enough space for the resulting remap table (vertex_count elements) + */ +MESHOPTIMIZER_API size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count); + +/** + * Index buffer encoder + * Encodes index data into an array of bytes that is generally much smaller (<1.5 bytes/triangle) and compresses better (<1 bytes/triangle) compared to original. + * Input index buffer must represent a triangle list. + * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space + * For maximum efficiency the index buffer being encoded has to be optimized for vertex cache and vertex fetch first. 
+ * + * buffer must contain enough space for the encoded index buffer (use meshopt_encodeIndexBufferBound to compute worst case size) + */ +MESHOPTIMIZER_API size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count); +MESHOPTIMIZER_API size_t meshopt_encodeIndexBufferBound(size_t index_count, size_t vertex_count); + +/** + * Experimental: Set index encoder format version + * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) and 1 (decodable by 0.14+) + */ +MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeIndexVersion(int version); + +/** + * Index buffer decoder + * Decodes index data from an array of bytes generated by meshopt_encodeIndexBuffer + * Returns 0 if decoding was successful, and an error code otherwise + * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices). + * + * destination must contain enough space for the resulting index buffer (index_count elements) + */ +MESHOPTIMIZER_API int meshopt_decodeIndexBuffer(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size); + +/** + * Experimental: Index sequence encoder + * Encodes index sequence into an array of bytes that is generally smaller and compresses better compared to original. + * Input index sequence can represent arbitrary topology; for triangle lists meshopt_encodeIndexBuffer is likely to be better. 
+ * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space + * + * buffer must contain enough space for the encoded index sequence (use meshopt_encodeIndexSequenceBound to compute worst case size) + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const unsigned int* indices, size_t index_count); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_encodeIndexSequenceBound(size_t index_count, size_t vertex_count); + +/** + * Index sequence decoder + * Decodes index data from an array of bytes generated by meshopt_encodeIndexSequence + * Returns 0 if decoding was successful, and an error code otherwise + * The decoder is safe to use for untrusted input, but it may produce garbage data (e.g. out of range indices). + * + * destination must contain enough space for the resulting index sequence (index_count elements) + */ +MESHOPTIMIZER_EXPERIMENTAL int meshopt_decodeIndexSequence(void* destination, size_t index_count, size_t index_size, const unsigned char* buffer, size_t buffer_size); + +/** + * Vertex buffer encoder + * Encodes vertex data into an array of bytes that is generally smaller and compresses better compared to original. + * Returns encoded data size on success, 0 on error; the only error condition is if buffer doesn't have enough space + * This function works for a single vertex stream; for multiple vertex streams, call meshopt_encodeVertexBuffer for each stream. + * Note that all vertex_size bytes of each vertex are encoded verbatim, including padding which should be zero-initialized. 
+ * + * buffer must contain enough space for the encoded vertex buffer (use meshopt_encodeVertexBufferBound to compute worst case size) + */ +MESHOPTIMIZER_API size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size); +MESHOPTIMIZER_API size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size); + +/** + * Experimental: Set vertex encoder format version + * version must specify the data format version to encode; valid values are 0 (decodable by all library versions) + */ +MESHOPTIMIZER_EXPERIMENTAL void meshopt_encodeVertexVersion(int version); + +/** + * Vertex buffer decoder + * Decodes vertex data from an array of bytes generated by meshopt_encodeVertexBuffer + * Returns 0 if decoding was successful, and an error code otherwise + * The decoder is safe to use for untrusted input, but it may produce garbage data. + * + * destination must contain enough space for the resulting vertex buffer (vertex_count * vertex_size bytes) + */ +MESHOPTIMIZER_API int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size); + +/** + * Vertex buffer filters + * These functions can be used to filter output of meshopt_decodeVertexBuffer in-place. + * count must be aligned by 4 and stride is fixed for each function to facilitate SIMD implementation. + * + * meshopt_decodeFilterOct decodes octahedral encoding of a unit vector with K-bit (K <= 16) signed X/Y as an input; Z must store 1.0f. + * Each component is stored as an 8-bit or 16-bit normalized integer; stride must be equal to 4 or 8. W is preserved as is. + * + * meshopt_decodeFilterQuat decodes 3-component quaternion encoding with K-bit (4 <= K <= 16) component encoding and a 2-bit component index indicating which component to reconstruct. + * Each component is stored as a 16-bit integer; stride must be equal to 8. 
+ * + * meshopt_decodeFilterExp decodes exponential encoding of floating-point data with 8-bit exponent and 24-bit integer mantissa as 2^E*M. + * Each 32-bit component is decoded in isolation; stride must be divisible by 4. + */ +MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size); +MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size); + +/** + * Experimental: Mesh simplifier + * Reduces the number of triangles in the mesh, attempting to preserve mesh appearance as much as possible + * The algorithm tries to preserve mesh topology and can stop short of the target goal based on topology constraints or target error. + * If not all attributes from the input mesh are required, it's recommended to reindex the mesh using meshopt_generateShadowIndexBuffer prior to simplification. + * Returns the number of indices after simplification, with destination containing new index data + * The resulting index buffer references vertices from the original vertex buffer. + * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * + * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!) 
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + */ +// -- GODOT start -- +//MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float *r_resulting_error); +// -- GODOT end -- + +/** + * Experimental: Mesh simplifier (sloppy) + * Reduces the number of triangles in the mesh, sacrificing mesh appearance for simplification performance + * The algorithm doesn't preserve mesh topology but is always able to reach target triangle count. + * Returns the number of indices after simplification, with destination containing new index data + * The resulting index buffer references vertices from the original vertex buffer. + * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. 
+ * + * destination must contain enough space for the target index buffer + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count); + +/** + * Experimental: Point cloud simplifier + * Reduces the number of points in the cloud to reach the given target + * Returns the number of points after simplification, with destination containing new index data + * The resulting index buffer references vertices from the original vertex buffer. + * If the original vertex data isn't required, creating a compact vertex buffer using meshopt_optimizeVertexFetch is recommended. + * + * destination must contain enough space for the target index buffer + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplifyPoints(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_vertex_count); + +/** + * Mesh stripifier + * Converts a previously vertex cache optimized triangle list to triangle strip, stitching strips using restart index or degenerate triangles + * Returns the number of indices in the resulting strip, with destination containing new index data + * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first. + * Using restart indices can result in ~10% smaller index buffers, but on some GPUs restart indices may result in decreased performance. 
+ * + * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_stripifyBound + * restart_index should be 0xffff or 0xffffffff depending on index size, or 0 to use degenerate triangles + */ +MESHOPTIMIZER_API size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index); +MESHOPTIMIZER_API size_t meshopt_stripifyBound(size_t index_count); + +/** + * Mesh unstripifier + * Converts a triangle strip to a triangle list + * Returns the number of indices in the resulting list, with destination containing new index data + * + * destination must contain enough space for the target index buffer, worst case can be computed with meshopt_unstripifyBound + */ +MESHOPTIMIZER_API size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index); +MESHOPTIMIZER_API size_t meshopt_unstripifyBound(size_t index_count); + +struct meshopt_VertexCacheStatistics +{ + unsigned int vertices_transformed; + unsigned int warps_executed; + float acmr; /* transformed vertices / triangle count; best case 0.5, worst case 3.0, optimum depends on topology */ + float atvr; /* transformed vertices / vertex count; best case 1.0, worst case 6.0, optimum is 1.0 (each vertex is transformed once) */ +}; + +/** + * Vertex transform cache analyzer + * Returns cache hit statistics using a simplified FIFO model + * Results may not match actual GPU performance + */ +MESHOPTIMIZER_API struct meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size); + +struct meshopt_OverdrawStatistics +{ + unsigned int pixels_covered; + unsigned int pixels_shaded; + float overdraw; /* shaded pixels / covered pixels; best case 1.0 */ +}; + +/** + * Overdraw analyzer + * Returns 
overdraw statistics using a software rasterizer + * Results may not match actual GPU performance + * + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + */ +MESHOPTIMIZER_API struct meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +struct meshopt_VertexFetchStatistics +{ + unsigned int bytes_fetched; + float overfetch; /* fetched bytes / vertex buffer size; best case 1.0 (each byte is fetched once) */ +}; + +/** + * Vertex fetch cache analyzer + * Returns cache hit statistics using a simplified direct mapped model + * Results may not match actual GPU performance + */ +MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size); + +struct meshopt_Meshlet +{ + unsigned int vertices[64]; + unsigned char indices[126][3]; + unsigned char triangle_count; + unsigned char vertex_count; +}; + +/** + * Experimental: Meshlet builder + * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer + * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers. + * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first. 
+ * + * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound + * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126) + */ +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); +MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles); + +struct meshopt_Bounds +{ + /* bounding sphere, useful for frustum and occlusion culling */ + float center[3]; + float radius; + + /* normal cone, useful for backface culling */ + float cone_apex[3]; + float cone_axis[3]; + float cone_cutoff; /* = cos(angle/2) */ + + /* normal cone axis and cutoff, stored in 8-bit SNORM format; decode using x/127.0 */ + signed char cone_axis_s8[3]; + signed char cone_cutoff_s8; +}; + +/** + * Experimental: Cluster bounds generator + * Creates bounding volumes that can be used for frustum, backface and occlusion culling. 
+ * + * For backface culling with orthographic projection, use the following formula to reject backfacing clusters: + * dot(view, cone_axis) >= cone_cutoff + * + * For perspective projection, you can use the formula that needs cone apex in addition to axis & cutoff: + * dot(normalize(cone_apex - camera_position), cone_axis) >= cone_cutoff + * + * Alternatively, you can use the formula that doesn't need cone apex and uses bounding sphere instead: + * dot(normalize(center - camera_position), cone_axis) >= cone_cutoff + radius / length(center - camera_position) + * or an equivalent formula that doesn't have a singularity at center = camera_position: + * dot(center - camera_position, cone_axis) >= cone_cutoff * length(center - camera_position) + radius + * + * The formula that uses the apex is slightly more accurate but needs the apex; if you are already using bounding sphere + * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable. + * + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size) + */ +MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +/** + * Experimental: Spatial sorter + * Generates a remap table that can be used to reorder points for spatial locality. + * Resulting remap table maps old vertices to new vertices and can be used in meshopt_remapVertexBuffer. 
+ * + * destination must contain enough space for the resulting remap table (vertex_count elements) + */ +MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +/** + * Experimental: Spatial sorter + * Reorders triangles for spatial locality, and generates a new index buffer. The resulting index buffer can be used with other functions like optimizeVertexCache. + * + * destination must contain enough space for the resulting index buffer (index_count elements) + * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + */ +MESHOPTIMIZER_EXPERIMENTAL void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); + +/** + * Set allocation callbacks + * These callbacks will be used instead of the default operator new/operator delete for all temporary allocations in the library. + * Note that all algorithms only allocate memory for temporary use. + * allocate/deallocate are always called in a stack-like order - last pointer to be allocated is deallocated first. 
+ */ +MESHOPTIMIZER_API void meshopt_setAllocator(void* (*allocate)(size_t), void (*deallocate)(void*)); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +/* Quantization into commonly supported data formats */ +#ifdef __cplusplus +/** + * Quantize a float in [0..1] range into an N-bit fixed point unorm value + * Assumes reconstruction function (q / (2^N-1)), which is the case for fixed-function normalized fixed point conversion + * Maximum reconstruction error: 1/2^(N+1) + */ +inline int meshopt_quantizeUnorm(float v, int N); + +/** + * Quantize a float in [-1..1] range into an N-bit fixed point snorm value + * Assumes reconstruction function (q / (2^(N-1)-1)), which is the case for fixed-function normalized fixed point conversion (except early OpenGL versions) + * Maximum reconstruction error: 1/2^N + */ +inline int meshopt_quantizeSnorm(float v, int N); + +/** + * Quantize a float into half-precision floating point value + * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest + * Representable magnitude range: [6e-5; 65504] + * Maximum relative reconstruction error: 5e-4 + */ +inline unsigned short meshopt_quantizeHalf(float v); + +/** + * Quantize a float into a floating point value with a limited number of significant mantissa bits + * Generates +-inf for overflow, preserves NaN, flushes denormals to zero, rounds to nearest + * Assumes N is in a valid mantissa precision range, which is 1..23 + */ +inline float meshopt_quantizeFloat(float v, int N); +#endif + +/** + * C++ template interface + * + * These functions mirror the C interface the library provides, providing template-based overloads so that + * the caller can use an arbitrary type for the index data, both for input and output. + * When the supplied type is the same size as that of unsigned int, the wrappers are zero-cost; when it's not, + * the wrappers end up allocating memory and copying index data to convert from one type to another. 
+ */ +#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS) +template <typename T> +inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size); +template <typename T> +inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count); +template <typename T> +inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap); +template <typename T> +inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride); +template <typename T> +inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count); +template <typename T> +inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count); +template <typename T> +inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count); +template <typename T> +inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size); +template <typename T> +inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold); +template <typename T> +inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count); +template <typename T> +inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, 
size_t vertex_count, size_t vertex_size); +template <typename T> +inline size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count); +template <typename T> +inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size); +template <typename T> +inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count); +template <typename T> +inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size); +template <typename T> +inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error); +template <typename T> +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count); +template <typename T> +inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index); +template <typename T> +inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index); +template <typename T> +inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size); +template <typename T> +inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +template <typename T> +inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size); +template <typename T> 
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles); +template <typename T> +inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +template <typename T> +inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride); +#endif + +/* Inline implementation */ +#ifdef __cplusplus +inline int meshopt_quantizeUnorm(float v, int N) +{ + const float scale = float((1 << N) - 1); + + v = (v >= 0) ? v : 0; + v = (v <= 1) ? v : 1; + + return int(v * scale + 0.5f); +} + +inline int meshopt_quantizeSnorm(float v, int N) +{ + const float scale = float((1 << (N - 1)) - 1); + + float round = (v >= 0 ? 0.5f : -0.5f); + + v = (v >= -1) ? v : -1; + v = (v <= +1) ? v : +1; + + return int(v * scale + round); +} + +inline unsigned short meshopt_quantizeHalf(float v) +{ + union { float f; unsigned int ui; } u = {v}; + unsigned int ui = u.ui; + + int s = (ui >> 16) & 0x8000; + int em = ui & 0x7fffffff; + + /* bias exponent and round to nearest; 112 is relative exponent bias (127-15) */ + int h = (em - (112 << 23) + (1 << 12)) >> 13; + + /* underflow: flush to zero; 113 encodes exponent -14 */ + h = (em < (113 << 23)) ? 0 : h; + + /* overflow: infinity; 143 encodes exponent 16 */ + h = (em >= (143 << 23)) ? 0x7c00 : h; + + /* NaN; note that we convert all types of NaN to qNaN */ + h = (em > (255 << 23)) ? 
0x7e00 : h; + + return (unsigned short)(s | h); +} + +inline float meshopt_quantizeFloat(float v, int N) +{ + union { float f; unsigned int ui; } u = {v}; + unsigned int ui = u.ui; + + const int mask = (1 << (23 - N)) - 1; + const int round = (1 << (23 - N)) >> 1; + + int e = ui & 0x7f800000; + unsigned int rui = (ui + round) & ~mask; + + /* round all numbers except inf/nan; this is important to make sure nan doesn't overflow into -0 */ + ui = e == 0x7f800000 ? ui : rui; + + /* flush denormals to zero */ + ui = e == 0 ? 0 : ui; + + u.ui = ui; + return u.f; +} +#endif + +/* Internal implementation helpers */ +#ifdef __cplusplus +class meshopt_Allocator +{ +public: + template <typename T> + struct StorageT + { + static void* (*allocate)(size_t); + static void (*deallocate)(void*); + }; + + typedef StorageT<void> Storage; + + meshopt_Allocator() + : blocks() + , count(0) + { + } + + ~meshopt_Allocator() + { + for (size_t i = count; i > 0; --i) + Storage::deallocate(blocks[i - 1]); + } + + template <typename T> T* allocate(size_t size) + { + assert(count < sizeof(blocks) / sizeof(blocks[0])); + T* result = static_cast<T*>(Storage::allocate(size > size_t(-1) / sizeof(T) ? 
size_t(-1) : size * sizeof(T))); + blocks[count++] = result; + return result; + } + +private: + void* blocks[24]; + size_t count; +}; + +// This makes sure that allocate/deallocate are lazily generated in translation units that need them and are deduplicated by the linker +template <typename T> void* (*meshopt_Allocator::StorageT<T>::allocate)(size_t) = operator new; +template <typename T> void (*meshopt_Allocator::StorageT<T>::deallocate)(void*) = operator delete; +#endif + +/* Inline implementation for C++ templated wrappers */ +#if defined(__cplusplus) && !defined(MESHOPTIMIZER_NO_WRAPPERS) +template <typename T, bool ZeroCopy = sizeof(T) == sizeof(unsigned int)> +struct meshopt_IndexAdapter; + +template <typename T> +struct meshopt_IndexAdapter<T, false> +{ + T* result; + unsigned int* data; + size_t count; + + meshopt_IndexAdapter(T* result_, const T* input, size_t count_) + : result(result_) + , data(0) + , count(count_) + { + size_t size = count > size_t(-1) / sizeof(unsigned int) ? size_t(-1) : count * sizeof(unsigned int); + + data = static_cast<unsigned int*>(meshopt_Allocator::Storage::allocate(size)); + + if (input) + { + for (size_t i = 0; i < count; ++i) + data[i] = input[i]; + } + } + + ~meshopt_IndexAdapter() + { + if (result) + { + for (size_t i = 0; i < count; ++i) + result[i] = T(data[i]); + } + + meshopt_Allocator::Storage::deallocate(data); + } +}; + +template <typename T> +struct meshopt_IndexAdapter<T, true> +{ + unsigned int* data; + + meshopt_IndexAdapter(T* result, const T* input, size_t) + : data(reinterpret_cast<unsigned int*>(result ? result : const_cast<T*>(input))) + { + } +}; + +template <typename T> +inline size_t meshopt_generateVertexRemap(unsigned int* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size) +{ + meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0); + + return meshopt_generateVertexRemap(destination, indices ? 
in.data : 0, index_count, vertices, vertex_count, vertex_size); +} + +template <typename T> +inline size_t meshopt_generateVertexRemapMulti(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count) +{ + meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0); + + return meshopt_generateVertexRemapMulti(destination, indices ? in.data : 0, index_count, vertex_count, streams, stream_count); +} + +template <typename T> +inline void meshopt_remapIndexBuffer(T* destination, const T* indices, size_t index_count, const unsigned int* remap) +{ + meshopt_IndexAdapter<T> in(0, indices, indices ? index_count : 0); + meshopt_IndexAdapter<T> out(destination, 0, index_count); + + meshopt_remapIndexBuffer(out.data, indices ? in.data : 0, index_count, remap); +} + +template <typename T> +inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size, size_t vertex_stride) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count); + + meshopt_generateShadowIndexBuffer(out.data, in.data, index_count, vertices, vertex_count, vertex_size, vertex_stride); +} + +template <typename T> +inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count); + + meshopt_generateShadowIndexBufferMulti(out.data, in.data, index_count, vertex_count, streams, stream_count); +} + +template <typename T> +inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count); + 
+ meshopt_optimizeVertexCache(out.data, in.data, index_count, vertex_count); +} + +template <typename T> +inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count); + + meshopt_optimizeVertexCacheStrip(out.data, in.data, index_count, vertex_count); +} + +template <typename T> +inline void meshopt_optimizeVertexCacheFifo(T* destination, const T* indices, size_t index_count, size_t vertex_count, unsigned int cache_size) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count); + + meshopt_optimizeVertexCacheFifo(out.data, in.data, index_count, vertex_count, cache_size); +} + +template <typename T> +inline void meshopt_optimizeOverdraw(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count); + + meshopt_optimizeOverdraw(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, threshold); +} + +template <typename T> +inline size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const T* indices, size_t index_count, size_t vertex_count) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_optimizeVertexFetchRemap(destination, in.data, index_count, vertex_count); +} + +template <typename T> +inline size_t meshopt_optimizeVertexFetch(void* destination, T* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size) +{ + meshopt_IndexAdapter<T> inout(indices, indices, index_count); + + return meshopt_optimizeVertexFetch(destination, inout.data, index_count, vertices, vertex_count, vertex_size); +} + +template <typename T> +inline 
size_t meshopt_encodeIndexBuffer(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_encodeIndexBuffer(buffer, buffer_size, in.data, index_count); +} + +template <typename T> +inline int meshopt_decodeIndexBuffer(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size) +{ + char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 1 : -1]; + (void)index_size_valid; + + return meshopt_decodeIndexBuffer(destination, index_count, sizeof(T), buffer, buffer_size); +} + +template <typename T> +inline size_t meshopt_encodeIndexSequence(unsigned char* buffer, size_t buffer_size, const T* indices, size_t index_count) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_encodeIndexSequence(buffer, buffer_size, in.data, index_count); +} + +template <typename T> +inline int meshopt_decodeIndexSequence(T* destination, size_t index_count, const unsigned char* buffer, size_t buffer_size) +{ + char index_size_valid[sizeof(T) == 2 || sizeof(T) == 4 ? 
1 : -1]; + (void)index_size_valid; + + return meshopt_decodeIndexSequence(destination, index_count, sizeof(T), buffer, buffer_size); +} + +template <typename T> +inline size_t meshopt_simplify(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, index_count); + + return meshopt_simplify(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count, target_error); +} + +template <typename T> +inline size_t meshopt_simplifySloppy(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, target_index_count); + + return meshopt_simplifySloppy(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, target_index_count); +} + +template <typename T> +inline size_t meshopt_stripify(T* destination, const T* indices, size_t index_count, size_t vertex_count, T restart_index) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, (index_count / 3) * 5); + + return meshopt_stripify(out.data, in.data, index_count, vertex_count, unsigned(restart_index)); +} + +template <typename T> +inline size_t meshopt_unstripify(T* destination, const T* indices, size_t index_count, T restart_index) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + meshopt_IndexAdapter<T> out(destination, 0, (index_count - 2) * 3); + + return meshopt_unstripify(out.data, in.data, index_count, unsigned(restart_index)); +} + +template <typename T> +inline meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const T* indices, size_t 
index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int buffer_size) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_analyzeVertexCache(in.data, index_count, vertex_count, cache_size, warp_size, buffer_size); +} + +template <typename T> +inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_analyzeOverdraw(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); +} + +template <typename T> +inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_analyzeVertexFetch(in.data, index_count, vertex_count, vertex_size); +} + +template <typename T> +inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles); +} + +template <typename T> +inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + + return meshopt_computeClusterBounds(in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); +} + +template <typename T> +inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + meshopt_IndexAdapter<T> in(0, indices, index_count); + 
meshopt_IndexAdapter<T> out(destination, 0, index_count); + + meshopt_spatialSortTriangles(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); +} +#endif + +/** + * Copyright (c) 2016-2020 Arseny Kapoulkine + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ diff --git a/thirdparty/meshoptimizer/overdrawanalyzer.cpp b/thirdparty/meshoptimizer/overdrawanalyzer.cpp new file mode 100644 index 0000000000..8d5859ba39 --- /dev/null +++ b/thirdparty/meshoptimizer/overdrawanalyzer.cpp @@ -0,0 +1,230 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <float.h> +#include <string.h> + +// This work is based on: +// Nicolas Capens. Advanced Rasterization. 
// Computes the depth gradients (dz/dx, dz/dy) of the plane passing through three vertices.
//
// dzdx, dzdy: outputs; set to the plane's depth slope along x and y, or to 0 when the
//             triangle is degenerate (zero determinant).
// x*, y*, z*: coordinates of the three vertices.
// Returns the determinant of the 2x2 edge matrix; callers use its sign to detect winding.
static float computeDepthGradients(float& dzdx, float& dzdy, float x1, float y1, float z1, float x2, float y2, float z2, float x3, float y3, float z3)
{
	// z2 = z1 + dzdx * (x2 - x1) + dzdy * (y2 - y1)
	// z3 = z1 + dzdx * (x3 - x1) + dzdy * (y3 - y1)
	// (x2-x1 y2-y1)(dzdx) = (z2-z1)
	// (x3-x1 y3-y1)(dzdy)   (z3-z1)
	// we'll solve it with Cramer's rule
	float det = (x2 - x1) * (y3 - y1) - (y2 - y1) * (x3 - x1);
	float invdet = (det == 0) ? 0 : 1 / det;

	// Cramer's rule divides the *entire* numerator determinant by det; without the outer
	// parentheses only the second product was scaled, which produced wrong gradients for any
	// det != 1 and non-zero gradients for degenerate triangles.
	dzdx = ((z2 - z1) * (y3 - y1) - (y2 - y1) * (z3 - z1)) * invdet;
	dzdy = ((x2 - x1) * (z3 - z1) - (z2 - z1) * (x3 - x1)) * invdet;

	return det;
}
clipped against viewport + // since we rasterize pixels with covered centers, min >0.5 should round up + // as for max, due to top-left filling convention we will never rasterize right/bottom edges + // so max >= 0.5 should round down + int minx = max((min(X1, min(X2, X3)) + 7) >> 4, 0); + int maxx = min((max(X1, max(X2, X3)) + 7) >> 4, kViewport); + int miny = max((min(Y1, min(Y2, Y3)) + 7) >> 4, 0); + int maxy = min((max(Y1, max(Y2, Y3)) + 7) >> 4, kViewport); + + // deltas, 28.4 fixed point + int DX12 = X1 - X2; + int DX23 = X2 - X3; + int DX31 = X3 - X1; + + int DY12 = Y1 - Y2; + int DY23 = Y2 - Y3; + int DY31 = Y3 - Y1; + + // fill convention correction + int TL1 = DY12 < 0 || (DY12 == 0 && DX12 > 0); + int TL2 = DY23 < 0 || (DY23 == 0 && DX23 > 0); + int TL3 = DY31 < 0 || (DY31 == 0 && DX31 > 0); + + // half edge equations, 24.8 fixed point + // note that we offset minx/miny by half pixel since we want to rasterize pixels with covered centers + int FX = (minx << 4) + 8; + int FY = (miny << 4) + 8; + int CY1 = DX12 * (FY - Y1) - DY12 * (FX - X1) + TL1 - 1; + int CY2 = DX23 * (FY - Y2) - DY23 * (FX - X2) + TL2 - 1; + int CY3 = DX31 * (FY - Y3) - DY31 * (FX - X3) + TL3 - 1; + float ZY = v1z + (DZx * float(FX - X1) + DZy * float(FY - Y1)) * (1 / 16.f); + + for (int y = miny; y < maxy; y++) + { + int CX1 = CY1; + int CX2 = CY2; + int CX3 = CY3; + float ZX = ZY; + + for (int x = minx; x < maxx; x++) + { + // check if all CXn are non-negative + if ((CX1 | CX2 | CX3) >= 0) + { + if (ZX >= buffer->z[y][x][sign]) + { + buffer->z[y][x][sign] = ZX; + buffer->overdraw[y][x][sign]++; + } + } + + // signed left shift is UB for negative numbers so use unsigned-signed casts + CX1 -= int(unsigned(DY12) << 4); + CX2 -= int(unsigned(DY23) << 4); + CX3 -= int(unsigned(DY31) << 4); + ZX += DZx; + } + + // signed left shift is UB for negative numbers so use unsigned-signed casts + CY1 += int(unsigned(DX12) << 4); + CY2 += int(unsigned(DX23) << 4); + CY3 += int(unsigned(DX31) << 4); 
+ ZY += DZy; + } +} + +} // namespace meshopt + +meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + meshopt_OverdrawStatistics result = {}; + + float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX}; + float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX}; + + for (size_t i = 0; i < vertex_count; ++i) + { + const float* v = vertex_positions + i * vertex_stride_float; + + for (int j = 0; j < 3; ++j) + { + minv[j] = min(minv[j], v[j]); + maxv[j] = max(maxv[j], v[j]); + } + } + + float extent = max(maxv[0] - minv[0], max(maxv[1] - minv[1], maxv[2] - minv[2])); + float scale = kViewport / extent; + + float* triangles = allocator.allocate<float>(index_count * 3); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + const float* v = vertex_positions + index * vertex_stride_float; + + triangles[i * 3 + 0] = (v[0] - minv[0]) * scale; + triangles[i * 3 + 1] = (v[1] - minv[1]) * scale; + triangles[i * 3 + 2] = (v[2] - minv[2]) * scale; + } + + OverdrawBuffer* buffer = allocator.allocate<OverdrawBuffer>(1); + + for (int axis = 0; axis < 3; ++axis) + { + memset(buffer, 0, sizeof(OverdrawBuffer)); + + for (size_t i = 0; i < index_count; i += 3) + { + const float* vn0 = &triangles[3 * (i + 0)]; + const float* vn1 = &triangles[3 * (i + 1)]; + const float* vn2 = &triangles[3 * (i + 2)]; + + switch (axis) + { + case 0: + rasterize(buffer, vn0[2], vn0[1], vn0[0], vn1[2], vn1[1], vn1[0], vn2[2], vn2[1], vn2[0]); + break; + case 1: + rasterize(buffer, vn0[0], vn0[2], vn0[1], vn1[0], vn1[2], vn1[1], vn2[0], 
vn2[2], vn2[1]); + break; + case 2: + rasterize(buffer, vn0[1], vn0[0], vn0[2], vn1[1], vn1[0], vn1[2], vn2[1], vn2[0], vn2[2]); + break; + } + } + + for (int y = 0; y < kViewport; ++y) + for (int x = 0; x < kViewport; ++x) + for (int s = 0; s < 2; ++s) + { + unsigned int overdraw = buffer->overdraw[y][x][s]; + + result.pixels_covered += overdraw > 0; + result.pixels_shaded += overdraw; + } + } + + result.overdraw = result.pixels_covered ? float(result.pixels_shaded) / float(result.pixels_covered) : 0.f; + + return result; +} diff --git a/thirdparty/meshoptimizer/overdrawoptimizer.cpp b/thirdparty/meshoptimizer/overdrawoptimizer.cpp new file mode 100644 index 0000000000..143656ed76 --- /dev/null +++ b/thirdparty/meshoptimizer/overdrawoptimizer.cpp @@ -0,0 +1,333 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <math.h> +#include <string.h> + +// This work is based on: +// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 2007 +namespace meshopt +{ + +static void calculateSortData(float* sort_data, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_positions_stride, const unsigned int* clusters, size_t cluster_count) +{ + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + float mesh_centroid[3] = {}; + + for (size_t i = 0; i < index_count; ++i) + { + const float* p = vertex_positions + vertex_stride_float * indices[i]; + + mesh_centroid[0] += p[0]; + mesh_centroid[1] += p[1]; + mesh_centroid[2] += p[2]; + } + + mesh_centroid[0] /= index_count; + mesh_centroid[1] /= index_count; + mesh_centroid[2] /= index_count; + + for (size_t cluster = 0; cluster < cluster_count; ++cluster) + { + size_t cluster_begin = clusters[cluster] * 3; + size_t cluster_end = (cluster + 1 < cluster_count) ? 
clusters[cluster + 1] * 3 : index_count; + assert(cluster_begin < cluster_end); + + float cluster_area = 0; + float cluster_centroid[3] = {}; + float cluster_normal[3] = {}; + + for (size_t i = cluster_begin; i < cluster_end; i += 3) + { + const float* p0 = vertex_positions + vertex_stride_float * indices[i + 0]; + const float* p1 = vertex_positions + vertex_stride_float * indices[i + 1]; + const float* p2 = vertex_positions + vertex_stride_float * indices[i + 2]; + + float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]}; + float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]}; + + float normalx = p10[1] * p20[2] - p10[2] * p20[1]; + float normaly = p10[2] * p20[0] - p10[0] * p20[2]; + float normalz = p10[0] * p20[1] - p10[1] * p20[0]; + + float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); + + cluster_centroid[0] += (p0[0] + p1[0] + p2[0]) * (area / 3); + cluster_centroid[1] += (p0[1] + p1[1] + p2[1]) * (area / 3); + cluster_centroid[2] += (p0[2] + p1[2] + p2[2]) * (area / 3); + cluster_normal[0] += normalx; + cluster_normal[1] += normaly; + cluster_normal[2] += normalz; + cluster_area += area; + } + + float inv_cluster_area = cluster_area == 0 ? 0 : 1 / cluster_area; + + cluster_centroid[0] *= inv_cluster_area; + cluster_centroid[1] *= inv_cluster_area; + cluster_centroid[2] *= inv_cluster_area; + + float cluster_normal_length = sqrtf(cluster_normal[0] * cluster_normal[0] + cluster_normal[1] * cluster_normal[1] + cluster_normal[2] * cluster_normal[2]); + float inv_cluster_normal_length = cluster_normal_length == 0 ? 
// Simulates a FIFO vertex cache for one triangle (a, b, c) using per-vertex timestamps.
// A vertex whose timestamp is more than cache_size behind the running timestamp is treated as a miss:
// it is stamped with the current timestamp, which is then advanced.
// Corners are processed in order, so a vertex repeated within the triangle can miss at most once.
// Returns the number of misses (0..3).
static unsigned int updateCache(unsigned int a, unsigned int b, unsigned int c, unsigned int cache_size, unsigned int* cache_timestamps, unsigned int& timestamp)
{
	const unsigned int corners[3] = {a, b, c};

	unsigned int misses = 0;

	for (int k = 0; k < 3; ++k)
	{
		unsigned int& stamp = cache_timestamps[corners[k]];

		// still resident in the simulated cache
		if (timestamp - stamp <= cache_size)
			continue;

		stamp = timestamp++;
		misses += 1;
	}

	return misses;
}
result = 0; + + for (size_t it = 0; it < cluster_count; ++it) + { + size_t start = clusters[it]; + size_t end = (it + 1 < cluster_count) ? clusters[it + 1] : index_count / 3; + assert(start < end); + + // reset cache + timestamp += cache_size + 1; + + // measure cluster ACMR + unsigned int cluster_misses = 0; + + for (size_t i = start; i < end; ++i) + { + unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp); + + cluster_misses += m; + } + + float cluster_threshold = threshold * (float(cluster_misses) / float(end - start)); + + // first cluster always starts from the hard cluster boundary + destination[result++] = unsigned(start); + + // reset cache + timestamp += cache_size + 1; + + unsigned int running_misses = 0; + unsigned int running_faces = 0; + + for (size_t i = start; i < end; ++i) + { + unsigned int m = updateCache(indices[i * 3 + 0], indices[i * 3 + 1], indices[i * 3 + 2], cache_size, &cache_timestamps[0], timestamp); + + running_misses += m; + running_faces += 1; + + if (float(running_misses) / float(running_faces) <= cluster_threshold) + { + // we have reached the target ACMR with the current triangle so we need to start a new cluster on the next one + // note that this may mean that we add 'end` to destination for the last triangle, which will imply that the last + // cluster is empty; however, the 'pop_back' after the loop will clean it up + destination[result++] = unsigned(i + 1); + + // reset cache + timestamp += cache_size + 1; + + running_misses = 0; + running_faces = 0; + } + } + + // each time we reach the target ACMR we flush the cluster + // this means that the last cluster is by definition not very good - there are frequent cases where we are left with a few triangles + // in the last cluster, producing a very bad ACMR and significantly penalizing the overall results + // thus we remove the last cluster boundary, merging the last complete cluster with the last 
incomplete one + // there are sometimes cases when the last cluster is actually good enough - in which case the code above would have added 'end' + // to the cluster boundary array which we need to remove anyway - this code will do that automatically + if (destination[result - 1] != start) + { + result--; + } + } + + assert(result >= cluster_count); + assert(result <= index_count / 3); + + return result; +} + +} // namespace meshopt + +void meshopt_optimizeOverdraw(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, float threshold) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + // guard for empty meshes + if (index_count == 0 || vertex_count == 0) + return; + + // support in-place optimization + if (destination == indices) + { + unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count); + memcpy(indices_copy, indices, index_count * sizeof(unsigned int)); + indices = indices_copy; + } + + unsigned int cache_size = 16; + + unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count); + + // generate hard boundaries from full-triangle cache misses + unsigned int* hard_clusters = allocator.allocate<unsigned int>(index_count / 3); + size_t hard_cluster_count = generateHardBoundaries(hard_clusters, indices, index_count, vertex_count, cache_size, cache_timestamps); + + // generate soft boundaries + unsigned int* soft_clusters = allocator.allocate<unsigned int>(index_count / 3 + 1); + size_t soft_cluster_count = generateSoftBoundaries(soft_clusters, indices, index_count, vertex_count, hard_clusters, hard_cluster_count, cache_size, threshold, cache_timestamps); + + const unsigned int* clusters = soft_clusters; + size_t cluster_count = 
soft_cluster_count; + + // fill sort data + float* sort_data = allocator.allocate<float>(cluster_count); + calculateSortData(sort_data, indices, index_count, vertex_positions, vertex_positions_stride, clusters, cluster_count); + + // sort clusters using sort data + unsigned short* sort_keys = allocator.allocate<unsigned short>(cluster_count); + unsigned int* sort_order = allocator.allocate<unsigned int>(cluster_count); + calculateSortOrderRadix(sort_order, sort_data, sort_keys, cluster_count); + + // fill output buffer + size_t offset = 0; + + for (size_t it = 0; it < cluster_count; ++it) + { + unsigned int cluster = sort_order[it]; + assert(cluster < cluster_count); + + size_t cluster_begin = clusters[cluster] * 3; + size_t cluster_end = (cluster + 1 < cluster_count) ? clusters[cluster + 1] * 3 : index_count; + assert(cluster_begin < cluster_end); + + memcpy(destination + offset, indices + cluster_begin, (cluster_end - cluster_begin) * sizeof(unsigned int)); + offset += cluster_end - cluster_begin; + } + + assert(offset == index_count); +} diff --git a/thirdparty/meshoptimizer/patches/simplifier_get_resulting_error.patch b/thirdparty/meshoptimizer/patches/simplifier_get_resulting_error.patch new file mode 100644 index 0000000000..1be38e45d2 --- /dev/null +++ b/thirdparty/meshoptimizer/patches/simplifier_get_resulting_error.patch @@ -0,0 +1,96 @@ +diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h +index a442d103c8..fde00f9c82 100644 +--- a/thirdparty/meshoptimizer/meshoptimizer.h ++++ b/thirdparty/meshoptimizer/meshoptimizer.h +@@ -266,7 +266,10 @@ MESHOPTIMIZER_EXPERIMENTAL void meshopt_decodeFilterExp(void* buffer, size_t ver + * destination must contain enough space for the *source* index buffer (since optimization is iterative, this means index_count elements - *not* target_index_count!) 
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer + */ +-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error); ++// -- GODOT start -- ++//MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error); ++MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_simplify(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float *r_resulting_error); ++// -- GODOT end -- + + /** + * Experimental: Mesh simplifier (sloppy) +diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp +index bd523275ce..51cf634186 100644 +--- a/thirdparty/meshoptimizer/simplifier.cpp ++++ b/thirdparty/meshoptimizer/simplifier.cpp +@@ -1143,7 +1143,10 @@ unsigned int* meshopt_simplifyDebugLoop = 0; + unsigned int* meshopt_simplifyDebugLoopBack = 0; + #endif + +-size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error) ++// -- GODOT start -- ++//size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error) ++size_t meshopt_simplify(unsigned int *destination, const unsigned int *indices, size_t index_count, const float 
*vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float *r_resulting_error) ++// -- GODOT end -- + { + using namespace meshopt; + +@@ -1198,10 +1201,13 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, + if (result != indices) + memcpy(result, indices, index_count * sizeof(unsigned int)); + ++// -- GODOT start -- + #if TRACE + size_t pass_count = 0; +- float worst_error = 0; ++ //float worst_error = 0; + #endif ++ float worst_error = 0; ++// -- GODOT end -- + + Collapse* edge_collapses = allocator.allocate<Collapse>(index_count); + unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count); +@@ -1213,6 +1219,12 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, + // target_error input is linear; we need to adjust it to match quadricError units + float error_limit = target_error * target_error; + ++// -- GODOT start -- ++ if (r_resulting_error) { ++ *r_resulting_error = 1.0; ++ } ++// -- GODOT end -- ++ + while (result_count > target_index_count) + { + size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop); +@@ -1257,7 +1269,8 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, + size_t new_count = remapIndexBuffer(result, result_count, collapse_remap); + assert(new_count < result_count); + +-#if TRACE ++// -- GODOT start -- ++//#if TRACE + float pass_error = 0.f; + for (size_t i = 0; i < edge_collapse_count; ++i) + { +@@ -1267,15 +1280,24 @@ size_t meshopt_simplify(unsigned int* destination, const unsigned int* indices, + pass_error = c.error; + } + +- pass_count++; ++ //pass_count++; + worst_error = (worst_error < pass_error) ? 
pass_error : worst_error; + ++#if TRACE ++ pass_count++; + printf("pass %d: triangles: %d -> %d, collapses: %d/%d (goal: %d), error: %e (limit %e goal %e)\n", int(pass_count), int(result_count / 3), int(new_count / 3), int(collapses), int(edge_collapse_count), int(edge_collapse_goal), pass_error, error_limit, error_goal); + #endif ++// -- GODOT end -- + + result_count = new_count; + } + ++// -- GODOT start -- ++ if (r_resulting_error) { ++ *r_resulting_error = sqrt(worst_error); ++ } ++// -- GODOT end -- ++ + #if TRACE + printf("passes: %d, worst error: %e\n", int(pass_count), worst_error); + #endif diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp new file mode 100644 index 0000000000..b195a8cb5d --- /dev/null +++ b/thirdparty/meshoptimizer/simplifier.cpp @@ -0,0 +1,1562 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <float.h> +#include <math.h> +#include <string.h> + + +#ifndef TRACE +#define TRACE 0 +#endif + +#if TRACE +#include <stdio.h> +#endif + +// This work is based on: +// Michael Garland and Paul S. Heckbert. Surface simplification using quadric error metrics. 1997 +// Michael Garland. Quadric-based polygonal surface simplification. 1999 +// Peter Lindstrom. Out-of-Core Simplification of Large Polygonal Models. 2000 +// Matthias Teschner, Bruno Heidelberger, Matthias Mueller, Danat Pomeranets, Markus Gross. Optimized Spatial Hashing for Collision Detection of Deformable Objects. 2003 +// Peter Van Sandt, Yannis Chronis, Jignesh M. Patel. Efficiently Searching In-Memory Sorted Arrays: Revenge of the Interpolation Search? 
2019 +namespace meshopt +{ + +struct EdgeAdjacency +{ + unsigned int* counts; + unsigned int* offsets; + unsigned int* data; +}; + +static void buildEdgeAdjacency(EdgeAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) +{ + size_t face_count = index_count / 3; + + // allocate arrays + adjacency.counts = allocator.allocate<unsigned int>(vertex_count); + adjacency.offsets = allocator.allocate<unsigned int>(vertex_count); + adjacency.data = allocator.allocate<unsigned int>(index_count); + + // fill edge counts + memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; ++i) + { + assert(indices[i] < vertex_count); + + adjacency.counts[indices[i]]++; + } + + // fill offset table + unsigned int offset = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + adjacency.offsets[i] = offset; + offset += adjacency.counts[i]; + } + + assert(offset == index_count); + + // fill edge data + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + + adjacency.data[adjacency.offsets[a]++] = b; + adjacency.data[adjacency.offsets[b]++] = c; + adjacency.data[adjacency.offsets[c]++] = a; + } + + // fix offsets that have been disturbed by the previous pass + for (size_t i = 0; i < vertex_count; ++i) + { + assert(adjacency.offsets[i] >= adjacency.counts[i]); + + adjacency.offsets[i] -= adjacency.counts[i]; + } +} + +struct PositionHasher +{ + const float* vertex_positions; + size_t vertex_stride_float; + + size_t hash(unsigned int index) const + { + const unsigned int* key = reinterpret_cast<const unsigned int*>(vertex_positions + index * vertex_stride_float); + + // Optimized Spatial Hashing for Collision Detection of Deformable Objects + return (key[0] * 73856093) ^ (key[1] * 19349663) ^ (key[2] * 83492791); + } + + bool equal(unsigned int lhs, unsigned int rhs) const + { + return 
// Open-addressing lookup in a power-of-two sized table.
// Returns a pointer to the slot that either already holds an item equal to key (per hash.equal)
// or is the first empty slot on key's probe sequence; the caller decides whether to fill it.
// buckets must be a non-zero power of two; empty is the sentinel value marking unused slots.
template <typename T, typename Hash>
static T* hashLookup2(T* table, size_t buckets, const Hash& hash, const T& key, const T& empty)
{
	assert(buckets > 0);
	assert((buckets & (buckets - 1)) == 0);

	const size_t mask = buckets - 1;

	size_t slot = hash.hash(key) & mask;
	size_t step = 0;

	while (step <= mask)
	{
		T* candidate = table + slot;

		// an empty slot ends the probe sequence; hash.equal is only consulted for occupied slots
		if (*candidate == empty || hash.equal(*candidate, key))
			return candidate;

		// collision: advance by a growing step (1, 2, 3, ...), wrapping around the table
		step += 1;
		slot = (slot + step) & mask;
	}

	assert(false && "Hash table is full"); // unreachable
	return 0;
}
// manifold vertices can collapse onto anything
// border/seam vertices can only be collapsed onto border/seam respectively
// complex vertices can collapse onto complex/locked
// a rule of thumb is that collapsing kind A into kind B preserves the kind B in the target vertex
// for example, while we could collapse Complex into Manifold, this would mean the target vertex isn't Manifold anymore
//
// the table is indexed as kCanCollapse[kind of the vertex being collapsed][kind of the target vertex];
// rows and columns follow the VertexKind order: Manifold, Border, Seam, Complex, Locked
const unsigned char kCanCollapse[Kind_Count][Kind_Count] = {
    {1, 1, 1, 1, 1}, // Manifold -> any kind
    {0, 1, 0, 0, 0}, // Border -> Border only
    {0, 0, 1, 0, 0}, // Seam -> Seam only
    {0, 0, 0, 1, 1}, // Complex -> Complex or Locked
    {0, 0, 0, 0, 0}, // Locked vertices never collapse
};
// Assigns a VertexKind to every vertex and records open-edge information in loop/loopback.
//
// result: output, one kind per vertex (Kind_Manifold, Kind_Border, Kind_Seam or Kind_Locked; this function never assigns Kind_Complex)
// loop / loopback: outputs, per-vertex outgoing / incoming open edge target; ~0u if the vertex has no such edge,
//                  the vertex's own index if it has more than one
// adjacency: half-edge adjacency used to detect open edges (edges without an opposite half-edge)
// remap: position-based canonical vertex for each vertex (remap[i] == i for canonical vertices)
// wedge: per-vertex link to the next vertex sharing the same position (wedge[i] == i if there is none)
static void classifyVertices(unsigned char* result, unsigned int* loop, unsigned int* loopback, size_t vertex_count, const EdgeAdjacency& adjacency, const unsigned int* remap, const unsigned int* wedge)
{
	// memset with -1 sets every byte to 0xff, i.e. every entry to ~0u ("no open edge")
	memset(loop, -1, vertex_count * sizeof(unsigned int));
	memset(loopback, -1, vertex_count * sizeof(unsigned int));

	// incoming & outgoing open edges: ~0u if no open edges, i if there are more than 1
	// note that this is the same data as required in loop[] arrays; loop[] data is only valid for border/seam
	// but here it's okay to fill the data out for other types of vertices as well
	unsigned int* openinc = loopback;
	unsigned int* openout = loop;

	for (size_t i = 0; i < vertex_count; ++i)
	{
		unsigned int vertex = unsigned(i);

		unsigned int count = adjacency.counts[vertex];
		const unsigned int* data = adjacency.data + adjacency.offsets[vertex];

		for (size_t j = 0; j < count; ++j)
		{
			unsigned int target = data[j];

			// edge vertex->target is open if the opposite half-edge target->vertex does not exist
			if (!hasEdge(adjacency, target, vertex))
			{
				// first open edge stores the other endpoint; any further one overwrites it with the
				// vertex's own index, which serves as the "more than one" marker
				openinc[target] = (openinc[target] == ~0u) ? vertex : target;
				openout[vertex] = (openout[vertex] == ~0u) ? target : vertex;
			}
		}
	}

	// TRACELOCKED(i) counts the reason a vertex got locked in TRACE builds and is a no-op otherwise
#if TRACE
	size_t lockedstats[4] = {};
#define TRACELOCKED(i) lockedstats[i]++;
#else
#define TRACELOCKED(i) (void)0
#endif

	for (size_t i = 0; i < vertex_count; ++i)
	{
		if (remap[i] == i)
		{
			// canonical vertex: classify it from its wedge loop and open edges
			if (wedge[i] == i)
			{
				// no attribute seam, need to check if it's manifold
				unsigned int openi = openinc[i], openo = openout[i];

				// note: we classify any vertices with no open edges as manifold
				// this is technically incorrect - if 4 triangles share an edge, we'll classify vertices as manifold
				// it's unclear if this is a problem in practice
				if (openi == ~0u && openo == ~0u)
				{
					result[i] = Kind_Manifold;
				}
				else if (openi != i && openo != i)
				{
					// neither side carries the "more than one open edge" marker
					result[i] = Kind_Border;
				}
				else
				{
					result[i] = Kind_Locked;
					TRACELOCKED(0);
				}
			}
			else if (wedge[wedge[i]] == i)
			{
				// the wedge loop has exactly two entries: i and w
				// attribute seam; need to distinguish between Seam and Locked
				unsigned int w = wedge[i];
				unsigned int openiv = openinc[i], openov = openout[i];
				unsigned int openiw = openinc[w], openow = openout[w];

				// seam should have one open half-edge for each vertex, and the edges need to "connect" - point to the same vertex post-remap
				if (openiv != ~0u && openiv != i && openov != ~0u && openov != i &&
				    openiw != ~0u && openiw != w && openow != ~0u && openow != w)
				{
					if (remap[openiv] == remap[openow] && remap[openov] == remap[openiw])
					{
						result[i] = Kind_Seam;
					}
					else
					{
						result[i] = Kind_Locked;
						TRACELOCKED(1);
					}
				}
				else
				{
					result[i] = Kind_Locked;
					TRACELOCKED(2);
				}
			}
			else
			{
				// more than one vertex maps to this one; we don't have classification available
				result[i] = Kind_Locked;
				TRACELOCKED(3);
			}
		}
		else
		{
			// non-canonical vertex: inherit the kind of its canonical vertex, which has a smaller
			// index and has therefore already been classified by this loop
			assert(remap[i] < i);

			result[i] = result[remap[i]];
		}
	}

#if TRACE
	printf("locked: many open edges %d, disconnected seam %d, many seam edges %d, many wedges %d\n",
	    int(lockedstats[0]), int(lockedstats[1]), int(lockedstats[2]), int(lockedstats[3]));
#endif
}
Vector3 +{ + float x, y, z; +}; +// -- GODOT start -- +//static void rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) +static float rescalePositions(Vector3* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) +// -- GODOT end -- + +{ + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX}; + float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX}; + + for (size_t i = 0; i < vertex_count; ++i) + { + const float* v = vertex_positions_data + i * vertex_stride_float; + + result[i].x = v[0]; + result[i].y = v[1]; + result[i].z = v[2]; + + for (int j = 0; j < 3; ++j) + { + float vj = v[j]; + + minv[j] = minv[j] > vj ? vj : minv[j]; + maxv[j] = maxv[j] < vj ? vj : maxv[j]; + } + } + + float extent = 0.f; + + extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]); + extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]); + extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]); + + float scale = extent == 0 ? 
0.f : 1.f / extent; + + for (size_t i = 0; i < vertex_count; ++i) + { + result[i].x = (result[i].x - minv[0]) * scale; + result[i].y = (result[i].y - minv[1]) * scale; + result[i].z = (result[i].z - minv[2]) * scale; + } +// -- GODOT start -- + return extent; +// -- GODOT end -- + +} + +struct Quadric +{ + float a00, a11, a22; + float a10, a20, a21; + float b0, b1, b2, c; + float w; +}; + +struct Collapse +{ + unsigned int v0; + unsigned int v1; + + union + { + unsigned int bidi; + float error; + unsigned int errorui; + }; +}; + +static float normalize(Vector3& v) +{ + float length = sqrtf(v.x * v.x + v.y * v.y + v.z * v.z); + + if (length > 0) + { + v.x /= length; + v.y /= length; + v.z /= length; + } + + return length; +} + +static void quadricAdd(Quadric& Q, const Quadric& R) +{ + Q.a00 += R.a00; + Q.a11 += R.a11; + Q.a22 += R.a22; + Q.a10 += R.a10; + Q.a20 += R.a20; + Q.a21 += R.a21; + Q.b0 += R.b0; + Q.b1 += R.b1; + Q.b2 += R.b2; + Q.c += R.c; + Q.w += R.w; +} + +static float quadricError(const Quadric& Q, const Vector3& v) +{ + float rx = Q.b0; + float ry = Q.b1; + float rz = Q.b2; + + rx += Q.a10 * v.y; + ry += Q.a21 * v.z; + rz += Q.a20 * v.x; + + rx *= 2; + ry *= 2; + rz *= 2; + + rx += Q.a00 * v.x; + ry += Q.a11 * v.y; + rz += Q.a22 * v.z; + + float r = Q.c; + r += rx * v.x; + r += ry * v.y; + r += rz * v.z; + + float s = Q.w == 0.f ? 
0.f : 1.f / Q.w; + + return fabsf(r) * s; +} + +static void quadricFromPlane(Quadric& Q, float a, float b, float c, float d, float w) +{ + float aw = a * w; + float bw = b * w; + float cw = c * w; + float dw = d * w; + + Q.a00 = a * aw; + Q.a11 = b * bw; + Q.a22 = c * cw; + Q.a10 = a * bw; + Q.a20 = a * cw; + Q.a21 = b * cw; + Q.b0 = a * dw; + Q.b1 = b * dw; + Q.b2 = c * dw; + Q.c = d * dw; + Q.w = w; +} + +static void quadricFromPoint(Quadric& Q, float x, float y, float z, float w) +{ + // we need to encode (x - X) ^ 2 + (y - Y)^2 + (z - Z)^2 into the quadric + Q.a00 = w; + Q.a11 = w; + Q.a22 = w; + Q.a10 = 0.f; + Q.a20 = 0.f; + Q.a21 = 0.f; + Q.b0 = -2.f * x * w; + Q.b1 = -2.f * y * w; + Q.b2 = -2.f * z * w; + Q.c = (x * x + y * y + z * z) * w; + Q.w = w; +} + +static void quadricFromTriangle(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight) +{ + Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; + Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; + + // normal = cross(p1 - p0, p2 - p0) + Vector3 normal = {p10.y * p20.z - p10.z * p20.y, p10.z * p20.x - p10.x * p20.z, p10.x * p20.y - p10.y * p20.x}; + float area = normalize(normal); + + float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z; + + // we use sqrtf(area) so that the error is scaled linearly; this tends to improve silhouettes + quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, sqrtf(area) * weight); +} + +static void quadricFromTriangleEdge(Quadric& Q, const Vector3& p0, const Vector3& p1, const Vector3& p2, float weight) +{ + Vector3 p10 = {p1.x - p0.x, p1.y - p0.y, p1.z - p0.z}; + float length = normalize(p10); + + // p20p = length of projection of p2-p0 onto normalize(p1 - p0) + Vector3 p20 = {p2.x - p0.x, p2.y - p0.y, p2.z - p0.z}; + float p20p = p20.x * p10.x + p20.y * p10.y + p20.z * p10.z; + + // normal = altitude of triangle from point p2 onto edge p1-p0 + Vector3 normal = {p20.x - p10.x * p20p, p20.y - p10.y * p20p, p20.z - 
p10.z * p20p}; + normalize(normal); + + float distance = normal.x * p0.x + normal.y * p0.y + normal.z * p0.z; + + // note: the weight is scaled linearly with edge length; this has to match the triangle weight + quadricFromPlane(Q, normal.x, normal.y, normal.z, -distance, length * weight); +} + +static void fillFaceQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap) +{ + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int i0 = indices[i + 0]; + unsigned int i1 = indices[i + 1]; + unsigned int i2 = indices[i + 2]; + + Quadric Q; + quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], 1.f); + + quadricAdd(vertex_quadrics[remap[i0]], Q); + quadricAdd(vertex_quadrics[remap[i1]], Q); + quadricAdd(vertex_quadrics[remap[i2]], Q); + } +} + +static void fillEdgeQuadrics(Quadric* vertex_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop, const unsigned int* loopback) +{ + for (size_t i = 0; i < index_count; i += 3) + { + static const int next[3] = {1, 2, 0}; + + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + + unsigned char k0 = vertex_kind[i0]; + unsigned char k1 = vertex_kind[i1]; + + // check that either i0 or i1 are border/seam and are on the same edge loop + // note that we need to add the error even for edged that connect e.g. 
border & locked + // if we don't do that, the adjacent border->border edge won't have correct errors for corners + if (k0 != Kind_Border && k0 != Kind_Seam && k1 != Kind_Border && k1 != Kind_Seam) + continue; + + if ((k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1) + continue; + + if ((k1 == Kind_Border || k1 == Kind_Seam) && loopback[i1] != i0) + continue; + + // seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges + if (kHasOpposite[k0][k1] && remap[i1] > remap[i0]) + continue; + + unsigned int i2 = indices[i + next[next[e]]]; + + // we try hard to maintain border edge geometry; seam edges can move more freely + // due to topological restrictions on collapses, seam quadrics slightly improves collapse structure but aren't critical + const float kEdgeWeightSeam = 1.f; + const float kEdgeWeightBorder = 10.f; + + float edgeWeight = (k0 == Kind_Border || k1 == Kind_Border) ? kEdgeWeightBorder : kEdgeWeightSeam; + + Quadric Q; + quadricFromTriangleEdge(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], edgeWeight); + + quadricAdd(vertex_quadrics[remap[i0]], Q); + quadricAdd(vertex_quadrics[remap[i1]], Q); + } + } +} + +static size_t pickEdgeCollapses(Collapse* collapses, const unsigned int* indices, size_t index_count, const unsigned int* remap, const unsigned char* vertex_kind, const unsigned int* loop) +{ + size_t collapse_count = 0; + + for (size_t i = 0; i < index_count; i += 3) + { + static const int next[3] = {1, 2, 0}; + + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + + // this can happen either when input has a zero-length edge, or when we perform collapses for complex + // topology w/seams and collapse a manifold vertex that connects to both wedges onto one of them + // we leave edges like this alone since they may be important for preserving mesh integrity + if (remap[i0] == remap[i1]) + continue; + + unsigned char k0 = vertex_kind[i0]; + unsigned 
char k1 = vertex_kind[i1]; + + // the edge has to be collapsible in at least one direction + if (!(kCanCollapse[k0][k1] | kCanCollapse[k1][k0])) + continue; + + // manifold and seam edges should occur twice (i0->i1 and i1->i0) - skip redundant edges + if (kHasOpposite[k0][k1] && remap[i1] > remap[i0]) + continue; + + // two vertices are on a border or a seam, but there's no direct edge between them + // this indicates that they belong to two different edge loops and we should not collapse this edge + // loop[] tracks half edges so we only need to check i0->i1 + if (k0 == k1 && (k0 == Kind_Border || k0 == Kind_Seam) && loop[i0] != i1) + continue; + + // edge can be collapsed in either direction - we will pick the one with minimum error + // note: we evaluate error later during collapse ranking, here we just tag the edge as bidirectional + if (kCanCollapse[k0][k1] & kCanCollapse[k1][k0]) + { + Collapse c = {i0, i1, {/* bidi= */ 1}}; + collapses[collapse_count++] = c; + } + else + { + // edge can only be collapsed in one direction + unsigned int e0 = kCanCollapse[k0][k1] ? i0 : i1; + unsigned int e1 = kCanCollapse[k0][k1] ? i1 : i0; + + Collapse c = {e0, e1, {/* bidi= */ 0}}; + collapses[collapse_count++] = c; + } + } + } + + return collapse_count; +} + +static void rankEdgeCollapses(Collapse* collapses, size_t collapse_count, const Vector3* vertex_positions, const Quadric* vertex_quadrics, const unsigned int* remap) +{ + for (size_t i = 0; i < collapse_count; ++i) + { + Collapse& c = collapses[i]; + + unsigned int i0 = c.v0; + unsigned int i1 = c.v1; + + // most edges are bidirectional which means we need to evaluate errors for two collapses + // to keep this code branchless we just use the same edge for unidirectional edges + unsigned int j0 = c.bidi ? i1 : i0; + unsigned int j1 = c.bidi ? 
i0 : i1; + + const Quadric& qi = vertex_quadrics[remap[i0]]; + const Quadric& qj = vertex_quadrics[remap[j0]]; + + float ei = quadricError(qi, vertex_positions[i1]); + float ej = quadricError(qj, vertex_positions[j1]); + + // pick edge direction with minimal error + c.v0 = ei <= ej ? i0 : j0; + c.v1 = ei <= ej ? i1 : j1; + c.error = ei <= ej ? ei : ej; + } +} + +#if TRACE > 1 +static void dumpEdgeCollapses(const Collapse* collapses, size_t collapse_count, const unsigned char* vertex_kind) +{ + size_t ckinds[Kind_Count][Kind_Count] = {}; + float cerrors[Kind_Count][Kind_Count] = {}; + + for (int k0 = 0; k0 < Kind_Count; ++k0) + for (int k1 = 0; k1 < Kind_Count; ++k1) + cerrors[k0][k1] = FLT_MAX; + + for (size_t i = 0; i < collapse_count; ++i) + { + unsigned int i0 = collapses[i].v0; + unsigned int i1 = collapses[i].v1; + + unsigned char k0 = vertex_kind[i0]; + unsigned char k1 = vertex_kind[i1]; + + ckinds[k0][k1]++; + cerrors[k0][k1] = (collapses[i].error < cerrors[k0][k1]) ? collapses[i].error : cerrors[k0][k1]; + } + + for (int k0 = 0; k0 < Kind_Count; ++k0) + for (int k1 = 0; k1 < Kind_Count; ++k1) + if (ckinds[k0][k1]) + printf("collapses %d -> %d: %d, min error %e\n", k0, k1, int(ckinds[k0][k1]), cerrors[k0][k1]); +} + +static void dumpLockedCollapses(const unsigned int* indices, size_t index_count, const unsigned char* vertex_kind) +{ + size_t locked_collapses[Kind_Count][Kind_Count] = {}; + + for (size_t i = 0; i < index_count; i += 3) + { + static const int next[3] = {1, 2, 0}; + + for (int e = 0; e < 3; ++e) + { + unsigned int i0 = indices[i + e]; + unsigned int i1 = indices[i + next[e]]; + + unsigned char k0 = vertex_kind[i0]; + unsigned char k1 = vertex_kind[i1]; + + locked_collapses[k0][k1] += !kCanCollapse[k0][k1] && !kCanCollapse[k1][k0]; + } + } + + for (int k0 = 0; k0 < Kind_Count; ++k0) + for (int k1 = 0; k1 < Kind_Count; ++k1) + if (locked_collapses[k0][k1]) + printf("locked collapses %d -> %d: %d\n", k0, k1, int(locked_collapses[k0][k1])); +} 
+#endif + +static void sortEdgeCollapses(unsigned int* sort_order, const Collapse* collapses, size_t collapse_count) +{ + const int sort_bits = 11; + + // fill histogram for counting sort + unsigned int histogram[1 << sort_bits]; + memset(histogram, 0, sizeof(histogram)); + + for (size_t i = 0; i < collapse_count; ++i) + { + // skip sign bit since error is non-negative + unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits); + + histogram[key]++; + } + + // compute offsets based on histogram data + size_t histogram_sum = 0; + + for (size_t i = 0; i < 1 << sort_bits; ++i) + { + size_t count = histogram[i]; + histogram[i] = unsigned(histogram_sum); + histogram_sum += count; + } + + assert(histogram_sum == collapse_count); + + // compute sort order based on offsets + for (size_t i = 0; i < collapse_count; ++i) + { + // skip sign bit since error is non-negative + unsigned int key = (collapses[i].errorui << 1) >> (32 - sort_bits); + + sort_order[histogram[key]++] = unsigned(i); + } +} + +static size_t performEdgeCollapses(unsigned int* collapse_remap, unsigned char* collapse_locked, Quadric* vertex_quadrics, const Collapse* collapses, size_t collapse_count, const unsigned int* collapse_order, const unsigned int* remap, const unsigned int* wedge, const unsigned char* vertex_kind, size_t triangle_collapse_goal, float error_goal, float error_limit) +{ + size_t edge_collapses = 0; + size_t triangle_collapses = 0; + + for (size_t i = 0; i < collapse_count; ++i) + { + const Collapse& c = collapses[collapse_order[i]]; + + if (c.error > error_limit) + break; + + if (c.error > error_goal && triangle_collapses > triangle_collapse_goal / 10) + break; + + if (triangle_collapses >= triangle_collapse_goal) + break; + + unsigned int i0 = c.v0; + unsigned int i1 = c.v1; + + unsigned int r0 = remap[i0]; + unsigned int r1 = remap[i1]; + + // we don't collapse vertices that had source or target vertex involved in a collapse + // it's important to not move the vertices twice 
since it complicates the tracking/remapping logic + // it's important to not move other vertices towards a moved vertex to preserve error since we don't re-rank collapses mid-pass + if (collapse_locked[r0] | collapse_locked[r1]) + continue; + + assert(collapse_remap[r0] == r0); + assert(collapse_remap[r1] == r1); + + quadricAdd(vertex_quadrics[r1], vertex_quadrics[r0]); + + if (vertex_kind[i0] == Kind_Complex) + { + unsigned int v = i0; + + do + { + collapse_remap[v] = r1; + v = wedge[v]; + } while (v != i0); + } + else if (vertex_kind[i0] == Kind_Seam) + { + // remap v0 to v1 and seam pair of v0 to seam pair of v1 + unsigned int s0 = wedge[i0]; + unsigned int s1 = wedge[i1]; + + assert(s0 != i0 && s1 != i1); + assert(wedge[s0] == i0 && wedge[s1] == i1); + + collapse_remap[i0] = i1; + collapse_remap[s0] = s1; + } + else + { + assert(wedge[i0] == i0); + + collapse_remap[i0] = i1; + } + + collapse_locked[r0] = 1; + collapse_locked[r1] = 1; + + // border edges collapse 1 triangle, other edges collapse 2 or more + triangle_collapses += (vertex_kind[i0] == Kind_Border) ? 
// Rewrites indices[] in place: every index is replaced by collapse_remap[index], and triangles in which
// two corners end up equal are dropped. Surviving triangles are packed at the front of the buffer.
// Returns the number of indices kept (always a multiple of 3).
static size_t remapIndexBuffer(unsigned int* indices, size_t index_count, const unsigned int* collapse_remap)
{
	size_t kept = 0;

	for (size_t base = 0; base < index_count; base += 3)
	{
		unsigned int tri[3];

		for (int corner = 0; corner < 3; ++corner)
		{
			tri[corner] = collapse_remap[indices[base + corner]];

			// a vertex is never moved twice within one pass, so a remapped index must map to itself
			assert(collapse_remap[tri[corner]] == tri[corner]);
		}

		const bool degenerate = tri[0] == tri[1] || tri[0] == tri[2] || tri[1] == tri[2];

		if (degenerate)
			continue;

		// kept never exceeds base, so writing here cannot clobber indices that are still unread
		indices[kept + 0] = tri[0];
		indices[kept + 1] = tri[1];
		indices[kept + 2] = tri[2];
		kept += 3;
	}

	return kept;
}
// Hash/equality functor whose keys are triangle numbers: key i refers to the three consecutive
// entries indices[3*i .. 3*i+2]. Two keys are equal when those three entries match position by position.
struct TriangleHasher
{
	unsigned int* indices;

	size_t hash(unsigned int i) const
	{
		const unsigned int* corner = &indices[i * 3];

		// multipliers from "Optimized Spatial Hashing for Collision Detection of Deformable Objects"
		unsigned int mixed = corner[0] * 73856093u;
		mixed ^= corner[1] * 19349663u;
		mixed ^= corner[2] * 83492791u;

		return mixed;
	}

	bool equal(unsigned int lhs, unsigned int rhs) const
	{
		const unsigned int* a = &indices[lhs * 3];
		const unsigned int* b = &indices[rhs * 3];

		for (int corner = 0; corner < 3; ++corner)
		{
			if (a[corner] != b[corner])
				return false;
		}

		return true;
	}
};
result += (id0 != id1) & (id0 != id2) & (id1 != id2); + } + + return result; +} + +static size_t fillVertexCells(unsigned int* table, size_t table_size, unsigned int* vertex_cells, const unsigned int* vertex_ids, size_t vertex_count) +{ + CellHasher hasher = {vertex_ids}; + + memset(table, -1, table_size * sizeof(unsigned int)); + + size_t result = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int* entry = hashLookup2(table, table_size, hasher, unsigned(i), ~0u); + + if (*entry == ~0u) + { + *entry = unsigned(i); + vertex_cells[i] = unsigned(result++); + } + else + { + vertex_cells[i] = vertex_cells[*entry]; + } + } + + return result; +} + +static size_t countVertexCells(unsigned int* table, size_t table_size, const unsigned int* vertex_ids, size_t vertex_count) +{ + IdHasher hasher; + + memset(table, -1, table_size * sizeof(unsigned int)); + + size_t result = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int id = vertex_ids[i]; + unsigned int* entry = hashLookup2(table, table_size, hasher, id, ~0u); + + result += (*entry == ~0u); + *entry = id; + } + + return result; +} + +static void fillCellQuadrics(Quadric* cell_quadrics, const unsigned int* indices, size_t index_count, const Vector3* vertex_positions, const unsigned int* vertex_cells) +{ + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int i0 = indices[i + 0]; + unsigned int i1 = indices[i + 1]; + unsigned int i2 = indices[i + 2]; + + unsigned int c0 = vertex_cells[i0]; + unsigned int c1 = vertex_cells[i1]; + unsigned int c2 = vertex_cells[i2]; + + bool single_cell = (c0 == c1) & (c0 == c2); + + Quadric Q; + quadricFromTriangle(Q, vertex_positions[i0], vertex_positions[i1], vertex_positions[i2], single_cell ? 
3.f : 1.f); + + if (single_cell) + { + quadricAdd(cell_quadrics[c0], Q); + } + else + { + quadricAdd(cell_quadrics[c0], Q); + quadricAdd(cell_quadrics[c1], Q); + quadricAdd(cell_quadrics[c2], Q); + } + } +} + +static void fillCellQuadrics(Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count, const unsigned int* vertex_cells) +{ + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int c = vertex_cells[i]; + const Vector3& v = vertex_positions[i]; + + Quadric Q; + quadricFromPoint(Q, v.x, v.y, v.z, 1.f); + + quadricAdd(cell_quadrics[c], Q); + } +} + +static void fillCellRemap(unsigned int* cell_remap, float* cell_errors, size_t cell_count, const unsigned int* vertex_cells, const Quadric* cell_quadrics, const Vector3* vertex_positions, size_t vertex_count) +{ + memset(cell_remap, -1, cell_count * sizeof(unsigned int)); + + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned int cell = vertex_cells[i]; + float error = quadricError(cell_quadrics[cell], vertex_positions[i]); + + if (cell_remap[cell] == ~0u || cell_errors[cell] > error) + { + cell_remap[cell] = unsigned(i); + cell_errors[cell] = error; + } + } +} + +static size_t filterTriangles(unsigned int* destination, unsigned int* tritable, size_t tritable_size, const unsigned int* indices, size_t index_count, const unsigned int* vertex_cells, const unsigned int* cell_remap) +{ + TriangleHasher hasher = {destination}; + + memset(tritable, -1, tritable_size * sizeof(unsigned int)); + + size_t result = 0; + + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int c0 = vertex_cells[indices[i + 0]]; + unsigned int c1 = vertex_cells[indices[i + 1]]; + unsigned int c2 = vertex_cells[indices[i + 2]]; + + if (c0 != c1 && c0 != c2 && c1 != c2) + { + unsigned int a = cell_remap[c0]; + unsigned int b = cell_remap[c1]; + unsigned int c = cell_remap[c2]; + + if (b < a && b < c) + { + unsigned int t = a; + a = b, b = c, c = t; + } + else if (c < a && c < b) + { + unsigned int t = c; + c = 
// Three-point interpolation from the "revenge of interpolation search" paper: given the samples
// (x0, y0), (x1, y1), (x2, y2), estimates the x at which the sampled function reaches y.
// Returns x1 unchanged when the samples are degenerate (the denominator is exactly zero, e.g. when
// y0 == y1 == y2); the plain formula would produce inf/NaN there, and the callers convert the
// result to int, which is undefined behaviour for non-finite values.
static float interpolate(float y, float x0, float y0, float x1, float y1, float x2, float y2)
{
	float num = (y1 - y) * (x1 - x2) * (x1 - x0) * (y2 - y0);
	float den = (y2 - y) * (x1 - x2) * (y0 - y1) + (y0 - y) * (x1 - x0) * (y1 - y2);

	// no usable slope information: fall back to the middle sample so the caller's clamping still applies
	if (den == 0.f)
		return x1;

	return x1 + num / den;
}
int>(vertex_count); + unsigned int* wedge = allocator.allocate<unsigned int>(vertex_count); + buildPositionRemap(remap, wedge, vertex_positions_data, vertex_count, vertex_positions_stride, allocator); + + // classify vertices; vertex kind determines collapse rules, see kCanCollapse + unsigned char* vertex_kind = allocator.allocate<unsigned char>(vertex_count); + unsigned int* loop = allocator.allocate<unsigned int>(vertex_count); + unsigned int* loopback = allocator.allocate<unsigned int>(vertex_count); + classifyVertices(vertex_kind, loop, loopback, vertex_count, adjacency, remap, wedge); + +#if TRACE + size_t unique_positions = 0; + for (size_t i = 0; i < vertex_count; ++i) + unique_positions += remap[i] == i; + + printf("position remap: %d vertices => %d positions\n", int(vertex_count), int(unique_positions)); + + size_t kinds[Kind_Count] = {}; + for (size_t i = 0; i < vertex_count; ++i) + kinds[vertex_kind[i]] += remap[i] == i; + + printf("kinds: manifold %d, border %d, seam %d, complex %d, locked %d\n", + int(kinds[Kind_Manifold]), int(kinds[Kind_Border]), int(kinds[Kind_Seam]), int(kinds[Kind_Complex]), int(kinds[Kind_Locked])); +#endif + + Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count); +// -- GODOT start -- + //rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride); + float extent = rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride); +// -- GODOT end -- + + Quadric* vertex_quadrics = allocator.allocate<Quadric>(vertex_count); + memset(vertex_quadrics, 0, vertex_count * sizeof(Quadric)); + + fillFaceQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap); + fillEdgeQuadrics(vertex_quadrics, indices, index_count, vertex_positions, remap, vertex_kind, loop, loopback); + + if (result != indices) + memcpy(result, indices, index_count * sizeof(unsigned int)); + +// -- GODOT start -- +#if TRACE + size_t pass_count = 0; + //float 
worst_error = 0; +#endif + float worst_error = 0; +// -- GODOT end -- + + Collapse* edge_collapses = allocator.allocate<Collapse>(index_count); + unsigned int* collapse_order = allocator.allocate<unsigned int>(index_count); + unsigned int* collapse_remap = allocator.allocate<unsigned int>(vertex_count); + unsigned char* collapse_locked = allocator.allocate<unsigned char>(vertex_count); + + size_t result_count = index_count; + + // target_error input is linear; we need to adjust it to match quadricError units + float error_limit = target_error * target_error; + +// -- GODOT start -- + if (r_resulting_error) { + *r_resulting_error = 1.0; + } +// -- GODOT end -- + + while (result_count > target_index_count) + { + size_t edge_collapse_count = pickEdgeCollapses(edge_collapses, result, result_count, remap, vertex_kind, loop); + + // no edges can be collapsed any more due to topology restrictions + if (edge_collapse_count == 0) + break; + + rankEdgeCollapses(edge_collapses, edge_collapse_count, vertex_positions, vertex_quadrics, remap); + +#if TRACE > 1 + dumpEdgeCollapses(edge_collapses, edge_collapse_count, vertex_kind); +#endif + + sortEdgeCollapses(collapse_order, edge_collapses, edge_collapse_count); + + // most collapses remove 2 triangles; use this to establish a bound on the pass in terms of error limit + // note that edge_collapse_goal is an estimate; triangle_collapse_goal will be used to actually limit collapses + size_t triangle_collapse_goal = (result_count - target_index_count) / 3; + size_t edge_collapse_goal = triangle_collapse_goal / 2; + + // we limit the error in each pass based on the error of optimal last collapse; since many collapses will be locked + // as they will share vertices with other successfull collapses, we need to increase the acceptable error by this factor + const float kPassErrorBound = 1.5f; + + float error_goal = edge_collapse_goal < edge_collapse_count ? 
edge_collapses[collapse_order[edge_collapse_goal]].error * kPassErrorBound : FLT_MAX; + + for (size_t i = 0; i < vertex_count; ++i) + collapse_remap[i] = unsigned(i); + + memset(collapse_locked, 0, vertex_count); + + size_t collapses = performEdgeCollapses(collapse_remap, collapse_locked, vertex_quadrics, edge_collapses, edge_collapse_count, collapse_order, remap, wedge, vertex_kind, triangle_collapse_goal, error_goal, error_limit); + + // no edges can be collapsed any more due to hitting the error limit or triangle collapse limit + if (collapses == 0) + break; + + remapEdgeLoops(loop, vertex_count, collapse_remap); + remapEdgeLoops(loopback, vertex_count, collapse_remap); + + size_t new_count = remapIndexBuffer(result, result_count, collapse_remap); + assert(new_count < result_count); + +// -- GODOT start -- +//#if TRACE + float pass_error = 0.f; + for (size_t i = 0; i < edge_collapse_count; ++i) + { + Collapse& c = edge_collapses[collapse_order[i]]; + + if (collapse_remap[c.v0] == c.v1) + pass_error = c.error; + } + + //pass_count++; + worst_error = (worst_error < pass_error) ? 
// Grid-based mesh simplification: vertex positions are rescaled, quantized to a uniform grid, and all
// vertices that quantize to the same id are merged into one cell. One representative vertex is chosen
// per cell, and only triangles whose three corners lie in three different cells are emitted.
//
//   destination             - receives the output index buffer (written by filterTriangles)
//   indices / index_count   - input triangle list; index_count must be a multiple of 3
//   vertex_positions_data   - input positions, three floats per vertex at the start of each element
//   vertex_positions_stride - element size in bytes; must be a non-zero multiple of sizeof(float), at most 256
//   target_index_count      - desired upper bound on the number of output indices; must not exceed index_count
//
// Returns the number of indices written to destination. Returns 0 without writing anything when
// target_index_count / 6 is zero, or when no tested grid size yields a non-zero triangle count within the target.
size_t meshopt_simplifySloppy(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count)
{
	using namespace meshopt;

	assert(index_count % 3 == 0);
	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
	assert(vertex_positions_stride % sizeof(float) == 0);
	assert(target_index_count <= index_count);

	// we expect to get ~2 triangles/vertex in the output
	size_t target_cell_count = target_index_count / 6;

	if (target_cell_count == 0)
		return 0;

	meshopt_Allocator allocator;

	// the extent returned by rescalePositions is not needed here, so the return value is ignored
	Vector3* vertex_positions = allocator.allocate<Vector3>(vertex_count);
	rescalePositions(vertex_positions, vertex_positions_data, vertex_count, vertex_positions_stride);

	// find the optimal grid size using guided binary search
#if TRACE
	printf("source: %d vertices, %d triangles\n", int(vertex_count), int(index_count / 3));
	printf("target: %d cells, %d triangles\n", int(target_cell_count), int(target_index_count / 3));
#endif

	unsigned int* vertex_ids = allocator.allocate<unsigned int>(vertex_count);

	// number of passes that use the interpolated guess before falling back to plain bisection
	const int kInterpolationPasses = 5;

	// invariant: # of triangles in min_grid <= target_count
	// min_grid/max_grid are exclusive sentinels: grid sizes actually tested lie strictly between them (1..1024)
	int min_grid = 0;
	int max_grid = 1025;
	size_t min_triangles = 0;
	size_t max_triangles = index_count / 3;

	// instead of starting in the middle, let's guess as to what the answer might be! triangle count usually grows as a square of grid size...
	int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f);

	// hard cap on the number of passes: kInterpolationPasses guided passes plus 10 bisection passes
	for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass)
	{
		assert(min_triangles < target_index_count / 3);
		assert(max_grid - min_grid > 1);

		// we clamp the prediction of the grid size to make sure that the search converges
		int grid_size = next_grid_size;
		grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size;

		// measure how many non-degenerate triangles this grid size would produce
		computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size);
		size_t triangles = countTriangles(vertex_ids, indices, index_count);

#if TRACE
		printf("pass %d (%s): grid size %d, triangles %d, %s\n",
		    pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary",
		    grid_size, int(triangles),
		    (triangles <= target_index_count / 3) ? "under" : "over");
#endif

		// the guess for the next pass is computed from the bracket as it was BEFORE this pass updates it
		float tip = interpolate(float(target_index_count / 3), float(min_grid), float(min_triangles), float(grid_size), float(triangles), float(max_grid), float(max_triangles));

		// shrink the bracket: grids at or under the target become the new lower bound, others the upper bound
		if (triangles <= target_index_count / 3)
		{
			min_grid = grid_size;
			min_triangles = triangles;
		}
		else
		{
			max_grid = grid_size;
			max_triangles = triangles;
		}

		// stop on an exact hit or when no untested grid size remains between the bounds
		if (triangles == target_index_count / 3 || max_grid - min_grid <= 1)
			break;

		// we start by using interpolation search - it usually converges faster
		// however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN)
		next_grid_size = (pass < kInterpolationPasses) ? int(tip + 0.5f) : (min_grid + max_grid) / 2;
	}

	// min_triangles is still 0 when no tested grid produced any triangle within the target
	if (min_triangles == 0)
		return 0;

	// build vertex->cell association by mapping all vertices with the same quantized position to the same cell
	size_t table_size = hashBuckets2(vertex_count);
	unsigned int* table = allocator.allocate<unsigned int>(table_size);

	unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count);

	// vertex_ids currently holds the ids of the last grid tested, so recompute them for the chosen grid (min_grid)
	computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid);
	size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count);

	// build a quadric for each target cell
	Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count);
	memset(cell_quadrics, 0, cell_count * sizeof(Quadric));

	fillCellQuadrics(cell_quadrics, indices, index_count, vertex_positions, vertex_cells);

	// for each target cell, find the vertex with the minimal error
	unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count);
	float* cell_errors = allocator.allocate<float>(cell_count);

	fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count);

	// collapse triangles!
	// note that we need to filter out triangles that we've already output because we very frequently generate redundant triangles between cells :(
	// the dedup table is sized from min_triangles, the unfiltered triangle count measured for the chosen grid
	size_t tritable_size = hashBuckets2(min_triangles);
	unsigned int* tritable = allocator.allocate<unsigned int>(tritable_size);

	size_t write = filterTriangles(destination, tritable, tritable_size, indices, index_count, vertex_cells, cell_remap);
	assert(write <= target_index_count);

#if TRACE
	printf("result: %d cells, %d triangles (%d unfiltered)\n", int(cell_count), int(write / 3), int(min_triangles));
#endif

	return write;
}
triangle count usually grows as a square of grid size... + int next_grid_size = int(sqrtf(float(target_cell_count)) + 0.5f); + + for (int pass = 0; pass < 10 + kInterpolationPasses; ++pass) + { + assert(min_vertices < target_vertex_count); + assert(max_grid - min_grid > 1); + + // we clamp the prediction of the grid size to make sure that the search converges + int grid_size = next_grid_size; + grid_size = (grid_size <= min_grid) ? min_grid + 1 : (grid_size >= max_grid) ? max_grid - 1 : grid_size; + + computeVertexIds(vertex_ids, vertex_positions, vertex_count, grid_size); + size_t vertices = countVertexCells(table, table_size, vertex_ids, vertex_count); + +#if TRACE + printf("pass %d (%s): grid size %d, vertices %d, %s\n", + pass, (pass == 0) ? "guess" : (pass <= kInterpolationPasses) ? "lerp" : "binary", + grid_size, int(vertices), + (vertices <= target_vertex_count) ? "under" : "over"); +#endif + + float tip = interpolate(float(target_vertex_count), float(min_grid), float(min_vertices), float(grid_size), float(vertices), float(max_grid), float(max_vertices)); + + if (vertices <= target_vertex_count) + { + min_grid = grid_size; + min_vertices = vertices; + } + else + { + max_grid = grid_size; + max_vertices = vertices; + } + + if (vertices == target_vertex_count || max_grid - min_grid <= 1) + break; + + // we start by using interpolation search - it usually converges faster + // however, interpolation search has a worst case of O(N) so we switch to binary search after a few iterations which converges in O(logN) + next_grid_size = (pass < kInterpolationPasses) ? 
int(tip + 0.5f) : (min_grid + max_grid) / 2; + } + + if (min_vertices == 0) + return 0; + + // build vertex->cell association by mapping all vertices with the same quantized position to the same cell + unsigned int* vertex_cells = allocator.allocate<unsigned int>(vertex_count); + + computeVertexIds(vertex_ids, vertex_positions, vertex_count, min_grid); + size_t cell_count = fillVertexCells(table, table_size, vertex_cells, vertex_ids, vertex_count); + + // build a quadric for each target cell + Quadric* cell_quadrics = allocator.allocate<Quadric>(cell_count); + memset(cell_quadrics, 0, cell_count * sizeof(Quadric)); + + fillCellQuadrics(cell_quadrics, vertex_positions, vertex_count, vertex_cells); + + // for each target cell, find the vertex with the minimal error + unsigned int* cell_remap = allocator.allocate<unsigned int>(cell_count); + float* cell_errors = allocator.allocate<float>(cell_count); + + fillCellRemap(cell_remap, cell_errors, cell_count, vertex_cells, cell_quadrics, vertex_positions, vertex_count); + + // copy results to the output + assert(cell_count <= target_vertex_count); + memcpy(destination, cell_remap, sizeof(unsigned int) * cell_count); + +#if TRACE + printf("result: %d cells\n", int(cell_count)); +#endif + + return cell_count; +} diff --git a/thirdparty/meshoptimizer/spatialorder.cpp b/thirdparty/meshoptimizer/spatialorder.cpp new file mode 100644 index 0000000000..b09f80ac6f --- /dev/null +++ b/thirdparty/meshoptimizer/spatialorder.cpp @@ -0,0 +1,194 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <float.h> +#include <string.h> + +// This work is based on: +// Fabian Giesen. Decoding Morton codes. 
2009 +namespace meshopt +{ + +// "Insert" two 0 bits after each of the 10 low bits of x +inline unsigned int part1By2(unsigned int x) +{ + x &= 0x000003ff; // x = ---- ---- ---- ---- ---- --98 7654 3210 + x = (x ^ (x << 16)) & 0xff0000ff; // x = ---- --98 ---- ---- ---- ---- 7654 3210 + x = (x ^ (x << 8)) & 0x0300f00f; // x = ---- --98 ---- ---- 7654 ---- ---- 3210 + x = (x ^ (x << 4)) & 0x030c30c3; // x = ---- --98 ---- 76-- --54 ---- 32-- --10 + x = (x ^ (x << 2)) & 0x09249249; // x = ---- 9--8 --7- -6-- 5--4 --3- -2-- 1--0 + return x; +} + +static void computeOrder(unsigned int* result, const float* vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride) +{ + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + float minv[3] = {FLT_MAX, FLT_MAX, FLT_MAX}; + float maxv[3] = {-FLT_MAX, -FLT_MAX, -FLT_MAX}; + + for (size_t i = 0; i < vertex_count; ++i) + { + const float* v = vertex_positions_data + i * vertex_stride_float; + + for (int j = 0; j < 3; ++j) + { + float vj = v[j]; + + minv[j] = minv[j] > vj ? vj : minv[j]; + maxv[j] = maxv[j] < vj ? vj : maxv[j]; + } + } + + float extent = 0.f; + + extent = (maxv[0] - minv[0]) < extent ? extent : (maxv[0] - minv[0]); + extent = (maxv[1] - minv[1]) < extent ? extent : (maxv[1] - minv[1]); + extent = (maxv[2] - minv[2]) < extent ? extent : (maxv[2] - minv[2]); + + float scale = extent == 0 ? 
0.f : 1.f / extent; + + // generate Morton order based on the position inside a unit cube + for (size_t i = 0; i < vertex_count; ++i) + { + const float* v = vertex_positions_data + i * vertex_stride_float; + + int x = int((v[0] - minv[0]) * scale * 1023.f + 0.5f); + int y = int((v[1] - minv[1]) * scale * 1023.f + 0.5f); + int z = int((v[2] - minv[2]) * scale * 1023.f + 0.5f); + + result[i] = part1By2(x) | (part1By2(y) << 1) | (part1By2(z) << 2); + } +} + +static void computeHistogram(unsigned int (&hist)[1024][3], const unsigned int* data, size_t count) +{ + memset(hist, 0, sizeof(hist)); + + // compute 3 10-bit histograms in parallel + for (size_t i = 0; i < count; ++i) + { + unsigned int id = data[i]; + + hist[(id >> 0) & 1023][0]++; + hist[(id >> 10) & 1023][1]++; + hist[(id >> 20) & 1023][2]++; + } + + unsigned int sumx = 0, sumy = 0, sumz = 0; + + // replace histogram data with prefix histogram sums in-place + for (int i = 0; i < 1024; ++i) + { + unsigned int hx = hist[i][0], hy = hist[i][1], hz = hist[i][2]; + + hist[i][0] = sumx; + hist[i][1] = sumy; + hist[i][2] = sumz; + + sumx += hx; + sumy += hy; + sumz += hz; + } + + assert(sumx == count && sumy == count && sumz == count); +} + +static void radixPass(unsigned int* destination, const unsigned int* source, const unsigned int* keys, size_t count, unsigned int (&hist)[1024][3], int pass) +{ + int bitoff = pass * 10; + + for (size_t i = 0; i < count; ++i) + { + unsigned int id = (keys[source[i]] >> bitoff) & 1023; + + destination[hist[id][pass]++] = source[i]; + } +} + +} // namespace meshopt + +void meshopt_spatialSortRemap(unsigned int* destination, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + meshopt_Allocator allocator; + + unsigned int* keys = allocator.allocate<unsigned int>(vertex_count); + 
computeOrder(keys, vertex_positions, vertex_count, vertex_positions_stride); + + unsigned int hist[1024][3]; + computeHistogram(hist, keys, vertex_count); + + unsigned int* scratch = allocator.allocate<unsigned int>(vertex_count); + + for (size_t i = 0; i < vertex_count; ++i) + destination[i] = unsigned(i); + + // 3-pass radix sort computes the resulting order into scratch + radixPass(scratch, destination, keys, vertex_count, hist, 0); + radixPass(destination, scratch, keys, vertex_count, hist, 1); + radixPass(scratch, destination, keys, vertex_count, hist, 2); + + // since our remap table is mapping old=>new, we need to reverse it + for (size_t i = 0; i < vertex_count; ++i) + destination[scratch[i]] = unsigned(i); +} + +void meshopt_spatialSortTriangles(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256); + assert(vertex_positions_stride % sizeof(float) == 0); + + (void)vertex_count; + + size_t face_count = index_count / 3; + size_t vertex_stride_float = vertex_positions_stride / sizeof(float); + + meshopt_Allocator allocator; + + float* centroids = allocator.allocate<float>(face_count * 3); + + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + const float* va = vertex_positions + a * vertex_stride_float; + const float* vb = vertex_positions + b * vertex_stride_float; + const float* vc = vertex_positions + c * vertex_stride_float; + + centroids[i * 3 + 0] = (va[0] + vb[0] + vc[0]) / 3.f; + centroids[i * 3 + 1] = (va[1] + vb[1] + vc[1]) / 3.f; + centroids[i * 3 + 2] = (va[2] + vb[2] + vc[2]) / 3.f; + } + + unsigned int* remap = allocator.allocate<unsigned int>(face_count); + 
+ meshopt_spatialSortRemap(remap, centroids, face_count, sizeof(float) * 3); + + // support in-order remap + if (destination == indices) + { + unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count); + memcpy(indices_copy, indices, index_count * sizeof(unsigned int)); + indices = indices_copy; + } + + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2]; + unsigned int r = remap[i]; + + destination[r * 3 + 0] = a; + destination[r * 3 + 1] = b; + destination[r * 3 + 2] = c; + } +} diff --git a/thirdparty/meshoptimizer/stripifier.cpp b/thirdparty/meshoptimizer/stripifier.cpp new file mode 100644 index 0000000000..8ce17ef3dc --- /dev/null +++ b/thirdparty/meshoptimizer/stripifier.cpp @@ -0,0 +1,295 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <limits.h> +#include <string.h> + +// This work is based on: +// Francine Evans, Steven Skiena and Amitabh Varshney. Optimizing Triangle Strips for Fast Rendering. 1996 +namespace meshopt +{ + +static unsigned int findStripFirst(const unsigned int buffer[][3], unsigned int buffer_size, const unsigned int* valence) +{ + unsigned int index = 0; + unsigned int iv = ~0u; + + for (size_t i = 0; i < buffer_size; ++i) + { + unsigned int va = valence[buffer[i][0]], vb = valence[buffer[i][1]], vc = valence[buffer[i][2]]; + unsigned int v = (va < vb && va < vc) ? va : (vb < vc) ? 
vb : vc; + + if (v < iv) + { + index = unsigned(i); + iv = v; + } + } + + return index; +} + +static int findStripNext(const unsigned int buffer[][3], unsigned int buffer_size, unsigned int e0, unsigned int e1) +{ + for (size_t i = 0; i < buffer_size; ++i) + { + unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2]; + + if (e0 == a && e1 == b) + return (int(i) << 2) | 2; + else if (e0 == b && e1 == c) + return (int(i) << 2) | 0; + else if (e0 == c && e1 == a) + return (int(i) << 2) | 1; + } + + return -1; +} + +} // namespace meshopt + +size_t meshopt_stripify(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int restart_index) +{ + assert(destination != indices); + assert(index_count % 3 == 0); + + using namespace meshopt; + + meshopt_Allocator allocator; + + const size_t buffer_capacity = 8; + + unsigned int buffer[buffer_capacity][3] = {}; + unsigned int buffer_size = 0; + + size_t index_offset = 0; + + unsigned int strip[2] = {}; + unsigned int parity = 0; + + size_t strip_size = 0; + + // compute vertex valence; this is used to prioritize starting triangle for strips + unsigned int* valence = allocator.allocate<unsigned int>(vertex_count); + memset(valence, 0, vertex_count * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + valence[index]++; + } + + int next = -1; + + while (buffer_size > 0 || index_offset < index_count) + { + assert(next < 0 || (size_t(next >> 2) < buffer_size && (next & 3) < 3)); + + // fill triangle buffer + while (buffer_size < buffer_capacity && index_offset < index_count) + { + buffer[buffer_size][0] = indices[index_offset + 0]; + buffer[buffer_size][1] = indices[index_offset + 1]; + buffer[buffer_size][2] = indices[index_offset + 2]; + + buffer_size++; + index_offset += 3; + } + + assert(buffer_size > 0); + + if (next >= 0) + { + unsigned int i = next >> 2; + unsigned int a = 
buffer[i][0], b = buffer[i][1], c = buffer[i][2]; + unsigned int v = buffer[i][next & 3]; + + // ordered removal from the buffer + memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0])); + buffer_size--; + + // update vertex valences for strip start heuristic + valence[a]--; + valence[b]--; + valence[c]--; + + // find next triangle (note that edge order flips on every iteration) + // in some cases we need to perform a swap to pick a different outgoing triangle edge + // for [a b c], the default strip edge is [b c], but we might want to use [a c] + int cont = findStripNext(buffer, buffer_size, parity ? strip[1] : v, parity ? v : strip[1]); + int swap = cont < 0 ? findStripNext(buffer, buffer_size, parity ? v : strip[0], parity ? strip[0] : v) : -1; + + if (cont < 0 && swap >= 0) + { + // [a b c] => [a b a c] + destination[strip_size++] = strip[0]; + destination[strip_size++] = v; + + // next strip has same winding + // ? a b => b a v + strip[1] = v; + + next = swap; + } + else + { + // emit the next vertex in the strip + destination[strip_size++] = v; + + // next strip has flipped winding + strip[0] = strip[1]; + strip[1] = v; + parity ^= 1; + + next = cont; + } + } + else + { + // if we didn't find anything, we need to find the next new triangle + // we use a heuristic to maximize the strip length + unsigned int i = findStripFirst(buffer, buffer_size, &valence[0]); + unsigned int a = buffer[i][0], b = buffer[i][1], c = buffer[i][2]; + + // ordered removal from the buffer + memmove(buffer[i], buffer[i + 1], (buffer_size - i - 1) * sizeof(buffer[0])); + buffer_size--; + + // update vertex valences for strip start heuristic + valence[a]--; + valence[b]--; + valence[c]--; + + // we need to pre-rotate the triangle so that we will find a match in the existing buffer on the next iteration + int ea = findStripNext(buffer, buffer_size, c, b); + int eb = findStripNext(buffer, buffer_size, a, c); + int ec = findStripNext(buffer, buffer_size, b, a); + + 
// in some cases we can have several matching edges; since we can pick any edge, we pick the one with the smallest + // triangle index in the buffer. this reduces the effect of stripification on ACMR and additionally - for unclear + // reasons - slightly improves the stripification efficiency + int mine = INT_MAX; + mine = (ea >= 0 && mine > ea) ? ea : mine; + mine = (eb >= 0 && mine > eb) ? eb : mine; + mine = (ec >= 0 && mine > ec) ? ec : mine; + + if (ea == mine) + { + // keep abc + next = ea; + } + else if (eb == mine) + { + // abc -> bca + unsigned int t = a; + a = b, b = c, c = t; + + next = eb; + } + else if (ec == mine) + { + // abc -> cab + unsigned int t = c; + c = b, b = a, a = t; + + next = ec; + } + + if (restart_index) + { + if (strip_size) + destination[strip_size++] = restart_index; + + destination[strip_size++] = a; + destination[strip_size++] = b; + destination[strip_size++] = c; + + // new strip always starts with the same edge winding + strip[0] = b; + strip[1] = c; + parity = 1; + } + else + { + if (strip_size) + { + // connect last strip using degenerate triangles + destination[strip_size++] = strip[1]; + destination[strip_size++] = a; + } + + // note that we may need to flip the emitted triangle based on parity + // we always end up with outgoing edge "cb" in the end + unsigned int e0 = parity ? c : b; + unsigned int e1 = parity ? 
b : c; + + destination[strip_size++] = a; + destination[strip_size++] = e0; + destination[strip_size++] = e1; + + strip[0] = e0; + strip[1] = e1; + parity ^= 1; + } + } + } + + return strip_size; +} + +size_t meshopt_stripifyBound(size_t index_count) +{ + assert(index_count % 3 == 0); + + // worst case without restarts is 2 degenerate indices and 3 indices per triangle + // worst case with restarts is 1 restart index and 3 indices per triangle + return (index_count / 3) * 5; +} + +size_t meshopt_unstripify(unsigned int* destination, const unsigned int* indices, size_t index_count, unsigned int restart_index) +{ + assert(destination != indices); + + size_t offset = 0; + size_t start = 0; + + for (size_t i = 0; i < index_count; ++i) + { + if (restart_index && indices[i] == restart_index) + { + start = i + 1; + } + else if (i - start >= 2) + { + unsigned int a = indices[i - 2], b = indices[i - 1], c = indices[i]; + + // flip winding for odd triangles + if ((i - start) & 1) + { + unsigned int t = a; + a = b, b = t; + } + + // although we use restart indices, strip swaps still produce degenerate triangles, so skip them + if (a != b && a != c && b != c) + { + destination[offset + 0] = a; + destination[offset + 1] = b; + destination[offset + 2] = c; + offset += 3; + } + } + } + + return offset; +} + +size_t meshopt_unstripifyBound(size_t index_count) +{ + assert(index_count == 0 || index_count >= 3); + + return (index_count == 0) ? 
0 : (index_count - 2) * 3; +} diff --git a/thirdparty/meshoptimizer/vcacheanalyzer.cpp b/thirdparty/meshoptimizer/vcacheanalyzer.cpp new file mode 100644 index 0000000000..3682743820 --- /dev/null +++ b/thirdparty/meshoptimizer/vcacheanalyzer.cpp @@ -0,0 +1,73 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <string.h> + +meshopt_VertexCacheStatistics meshopt_analyzeVertexCache(const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size, unsigned int warp_size, unsigned int primgroup_size) +{ + assert(index_count % 3 == 0); + assert(cache_size >= 3); + assert(warp_size == 0 || warp_size >= 3); + + meshopt_Allocator allocator; + + meshopt_VertexCacheStatistics result = {}; + + unsigned int warp_offset = 0; + unsigned int primgroup_offset = 0; + + unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count); + memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int)); + + unsigned int timestamp = cache_size + 1; + + for (size_t i = 0; i < index_count; i += 3) + { + unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2]; + assert(a < vertex_count && b < vertex_count && c < vertex_count); + + bool ac = (timestamp - cache_timestamps[a]) > cache_size; + bool bc = (timestamp - cache_timestamps[b]) > cache_size; + bool cc = (timestamp - cache_timestamps[c]) > cache_size; + + // flush cache if triangle doesn't fit into warp or into the primitive buffer + if ((primgroup_size && primgroup_offset == primgroup_size) || (warp_size && warp_offset + ac + bc + cc > warp_size)) + { + result.warps_executed += warp_offset > 0; + + warp_offset = 0; + primgroup_offset = 0; + + // reset cache + timestamp += cache_size + 1; + } + + // update cache and add vertices to warp + for (int j = 0; j < 3; ++j) + { + unsigned int index = indices[i + j]; + + if (timestamp - cache_timestamps[index] > cache_size) + { 
+ cache_timestamps[index] = timestamp++; + result.vertices_transformed++; + warp_offset++; + } + } + + primgroup_offset++; + } + + size_t unique_vertex_count = 0; + + for (size_t i = 0; i < vertex_count; ++i) + unique_vertex_count += cache_timestamps[i] > 0; + + result.warps_executed += warp_offset > 0; + + result.acmr = index_count == 0 ? 0 : float(result.vertices_transformed) / float(index_count / 3); + result.atvr = unique_vertex_count == 0 ? 0 : float(result.vertices_transformed) / float(unique_vertex_count); + + return result; +} diff --git a/thirdparty/meshoptimizer/vcacheoptimizer.cpp b/thirdparty/meshoptimizer/vcacheoptimizer.cpp new file mode 100644 index 0000000000..fb8ade4b77 --- /dev/null +++ b/thirdparty/meshoptimizer/vcacheoptimizer.cpp @@ -0,0 +1,473 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <string.h> + +// This work is based on: +// Tom Forsyth. Linear-Speed Vertex Cache Optimisation. 2006 +// Pedro Sander, Diego Nehab and Joshua Barczak. Fast Triangle Reordering for Vertex Locality and Reduced Overdraw. 
2007 +namespace meshopt +{ + +const size_t kCacheSizeMax = 16; +const size_t kValenceMax = 8; + +struct VertexScoreTable +{ + float cache[1 + kCacheSizeMax]; + float live[1 + kValenceMax]; +}; + +// Tuned to minimize the ACMR of a GPU that has a cache profile similar to NVidia and AMD +static const VertexScoreTable kVertexScoreTable = { + {0.f, 0.779f, 0.791f, 0.789f, 0.981f, 0.843f, 0.726f, 0.847f, 0.882f, 0.867f, 0.799f, 0.642f, 0.613f, 0.600f, 0.568f, 0.372f, 0.234f}, + {0.f, 0.995f, 0.713f, 0.450f, 0.404f, 0.059f, 0.005f, 0.147f, 0.006f}, +}; + +// Tuned to minimize the encoded index buffer size +static const VertexScoreTable kVertexScoreTableStrip = { + {0.f, 1.000f, 1.000f, 1.000f, 0.453f, 0.561f, 0.490f, 0.459f, 0.179f, 0.526f, 0.000f, 0.227f, 0.184f, 0.490f, 0.112f, 0.050f, 0.131f}, + {0.f, 0.956f, 0.786f, 0.577f, 0.558f, 0.618f, 0.549f, 0.499f, 0.489f}, +}; + +struct TriangleAdjacency +{ + unsigned int* counts; + unsigned int* offsets; + unsigned int* data; +}; + +static void buildTriangleAdjacency(TriangleAdjacency& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) +{ + size_t face_count = index_count / 3; + + // allocate arrays + adjacency.counts = allocator.allocate<unsigned int>(vertex_count); + adjacency.offsets = allocator.allocate<unsigned int>(vertex_count); + adjacency.data = allocator.allocate<unsigned int>(index_count); + + // fill triangle counts + memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int)); + + for (size_t i = 0; i < index_count; ++i) + { + assert(indices[i] < vertex_count); + + adjacency.counts[indices[i]]++; + } + + // fill offset table + unsigned int offset = 0; + + for (size_t i = 0; i < vertex_count; ++i) + { + adjacency.offsets[i] = offset; + offset += adjacency.counts[i]; + } + + assert(offset == index_count); + + // fill triangle data + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = 
indices[i * 3 + 2]; + + adjacency.data[adjacency.offsets[a]++] = unsigned(i); + adjacency.data[adjacency.offsets[b]++] = unsigned(i); + adjacency.data[adjacency.offsets[c]++] = unsigned(i); + } + + // fix offsets that have been disturbed by the previous pass + for (size_t i = 0; i < vertex_count; ++i) + { + assert(adjacency.offsets[i] >= adjacency.counts[i]); + + adjacency.offsets[i] -= adjacency.counts[i]; + } +} + +static unsigned int getNextVertexDeadEnd(const unsigned int* dead_end, unsigned int& dead_end_top, unsigned int& input_cursor, const unsigned int* live_triangles, size_t vertex_count) +{ + // check dead-end stack + while (dead_end_top) + { + unsigned int vertex = dead_end[--dead_end_top]; + + if (live_triangles[vertex] > 0) + return vertex; + } + + // input order + while (input_cursor < vertex_count) + { + if (live_triangles[input_cursor] > 0) + return input_cursor; + + ++input_cursor; + } + + return ~0u; +} + +static unsigned int getNextVertexNeighbour(const unsigned int* next_candidates_begin, const unsigned int* next_candidates_end, const unsigned int* live_triangles, const unsigned int* cache_timestamps, unsigned int timestamp, unsigned int cache_size) +{ + unsigned int best_candidate = ~0u; + int best_priority = -1; + + for (const unsigned int* next_candidate = next_candidates_begin; next_candidate != next_candidates_end; ++next_candidate) + { + unsigned int vertex = *next_candidate; + + // otherwise we don't need to process it + if (live_triangles[vertex] > 0) + { + int priority = 0; + + // will it be in cache after fanning? 
+ if (2 * live_triangles[vertex] + timestamp - cache_timestamps[vertex] <= cache_size) + { + priority = timestamp - cache_timestamps[vertex]; // position in cache + } + + if (priority > best_priority) + { + best_candidate = vertex; + best_priority = priority; + } + } + } + + return best_candidate; +} + +static float vertexScore(const VertexScoreTable* table, int cache_position, unsigned int live_triangles) +{ + assert(cache_position >= -1 && cache_position < int(kCacheSizeMax)); + + unsigned int live_triangles_clamped = live_triangles < kValenceMax ? live_triangles : kValenceMax; + + return table->cache[1 + cache_position] + table->live[live_triangles_clamped]; +} + +static unsigned int getNextTriangleDeadEnd(unsigned int& input_cursor, const unsigned char* emitted_flags, size_t face_count) +{ + // input order + while (input_cursor < face_count) + { + if (!emitted_flags[input_cursor]) + return input_cursor; + + ++input_cursor; + } + + return ~0u; +} + +} // namespace meshopt + +void meshopt_optimizeVertexCacheTable(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const meshopt::VertexScoreTable* table) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + + meshopt_Allocator allocator; + + // guard for empty meshes + if (index_count == 0 || vertex_count == 0) + return; + + // support in-place optimization + if (destination == indices) + { + unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count); + memcpy(indices_copy, indices, index_count * sizeof(unsigned int)); + indices = indices_copy; + } + + unsigned int cache_size = 16; + assert(cache_size <= kCacheSizeMax); + + size_t face_count = index_count / 3; + + // build adjacency information + TriangleAdjacency adjacency = {}; + buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator); + + // live triangle counts + unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count); + 
memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int)); + + // emitted flags + unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count); + memset(emitted_flags, 0, face_count); + + // compute initial vertex scores + float* vertex_scores = allocator.allocate<float>(vertex_count); + + for (size_t i = 0; i < vertex_count; ++i) + vertex_scores[i] = vertexScore(table, -1, live_triangles[i]); + + // compute triangle scores + float* triangle_scores = allocator.allocate<float>(face_count); + + for (size_t i = 0; i < face_count; ++i) + { + unsigned int a = indices[i * 3 + 0]; + unsigned int b = indices[i * 3 + 1]; + unsigned int c = indices[i * 3 + 2]; + + triangle_scores[i] = vertex_scores[a] + vertex_scores[b] + vertex_scores[c]; + } + + unsigned int cache_holder[2 * (kCacheSizeMax + 3)]; + unsigned int* cache = cache_holder; + unsigned int* cache_new = cache_holder + kCacheSizeMax + 3; + size_t cache_count = 0; + + unsigned int current_triangle = 0; + unsigned int input_cursor = 1; + + unsigned int output_triangle = 0; + + while (current_triangle != ~0u) + { + assert(output_triangle < face_count); + + unsigned int a = indices[current_triangle * 3 + 0]; + unsigned int b = indices[current_triangle * 3 + 1]; + unsigned int c = indices[current_triangle * 3 + 2]; + + // output indices + destination[output_triangle * 3 + 0] = a; + destination[output_triangle * 3 + 1] = b; + destination[output_triangle * 3 + 2] = c; + output_triangle++; + + // update emitted flags + emitted_flags[current_triangle] = true; + triangle_scores[current_triangle] = 0; + + // new triangle + size_t cache_write = 0; + cache_new[cache_write++] = a; + cache_new[cache_write++] = b; + cache_new[cache_write++] = c; + + // old triangles + for (size_t i = 0; i < cache_count; ++i) + { + unsigned int index = cache[i]; + + if (index != a && index != b && index != c) + { + cache_new[cache_write++] = index; + } + } + + unsigned int* cache_temp = cache; + cache = cache_new, 
cache_new = cache_temp; + cache_count = cache_write > cache_size ? cache_size : cache_write; + + // update live triangle counts + live_triangles[a]--; + live_triangles[b]--; + live_triangles[c]--; + + // remove emitted triangle from adjacency data + // this makes sure that we spend less time traversing these lists on subsequent iterations + for (size_t k = 0; k < 3; ++k) + { + unsigned int index = indices[current_triangle * 3 + k]; + + unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index]; + size_t neighbours_size = adjacency.counts[index]; + + for (size_t i = 0; i < neighbours_size; ++i) + { + unsigned int tri = neighbours[i]; + + if (tri == current_triangle) + { + neighbours[i] = neighbours[neighbours_size - 1]; + adjacency.counts[index]--; + break; + } + } + } + + unsigned int best_triangle = ~0u; + float best_score = 0; + + // update cache positions, vertex scores and triangle scores, and find next best triangle + for (size_t i = 0; i < cache_write; ++i) + { + unsigned int index = cache[i]; + + int cache_position = i >= cache_size ? 
-1 : int(i); + + // update vertex score + float score = vertexScore(table, cache_position, live_triangles[index]); + float score_diff = score - vertex_scores[index]; + + vertex_scores[index] = score; + + // update scores of vertex triangles + const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[index]; + const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[index]; + + for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it) + { + unsigned int tri = *it; + assert(!emitted_flags[tri]); + + float tri_score = triangle_scores[tri] + score_diff; + assert(tri_score > 0); + + if (best_score < tri_score) + { + best_triangle = tri; + best_score = tri_score; + } + + triangle_scores[tri] = tri_score; + } + } + + // step through input triangles in order if we hit a dead-end + current_triangle = best_triangle; + + if (current_triangle == ~0u) + { + current_triangle = getNextTriangleDeadEnd(input_cursor, &emitted_flags[0], face_count); + } + } + + assert(input_cursor == face_count); + assert(output_triangle == face_count); +} + +void meshopt_optimizeVertexCache(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count) +{ + meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTable); +} + +void meshopt_optimizeVertexCacheStrip(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count) +{ + meshopt_optimizeVertexCacheTable(destination, indices, index_count, vertex_count, &meshopt::kVertexScoreTableStrip); +} + +void meshopt_optimizeVertexCacheFifo(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, unsigned int cache_size) +{ + using namespace meshopt; + + assert(index_count % 3 == 0); + assert(cache_size >= 3); + + meshopt_Allocator allocator; + + // guard for empty meshes + if (index_count == 0 || vertex_count == 0) + return; + + // support 
in-place optimization + if (destination == indices) + { + unsigned int* indices_copy = allocator.allocate<unsigned int>(index_count); + memcpy(indices_copy, indices, index_count * sizeof(unsigned int)); + indices = indices_copy; + } + + size_t face_count = index_count / 3; + + // build adjacency information + TriangleAdjacency adjacency = {}; + buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator); + + // live triangle counts + unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count); + memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int)); + + // cache time stamps + unsigned int* cache_timestamps = allocator.allocate<unsigned int>(vertex_count); + memset(cache_timestamps, 0, vertex_count * sizeof(unsigned int)); + + // dead-end stack + unsigned int* dead_end = allocator.allocate<unsigned int>(index_count); + unsigned int dead_end_top = 0; + + // emitted flags + unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count); + memset(emitted_flags, 0, face_count); + + unsigned int current_vertex = 0; + + unsigned int timestamp = cache_size + 1; + unsigned int input_cursor = 1; // vertex to restart from in case of dead-end + + unsigned int output_triangle = 0; + + while (current_vertex != ~0u) + { + const unsigned int* next_candidates_begin = &dead_end[0] + dead_end_top; + + // emit all vertex neighbours + const unsigned int* neighbours_begin = &adjacency.data[0] + adjacency.offsets[current_vertex]; + const unsigned int* neighbours_end = neighbours_begin + adjacency.counts[current_vertex]; + + for (const unsigned int* it = neighbours_begin; it != neighbours_end; ++it) + { + unsigned int triangle = *it; + + if (!emitted_flags[triangle]) + { + unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2]; + + // output indices + destination[output_triangle * 3 + 0] = a; + destination[output_triangle * 3 + 1] = b; + 
destination[output_triangle * 3 + 2] = c; + output_triangle++; + + // update dead-end stack + dead_end[dead_end_top + 0] = a; + dead_end[dead_end_top + 1] = b; + dead_end[dead_end_top + 2] = c; + dead_end_top += 3; + + // update live triangle counts + live_triangles[a]--; + live_triangles[b]--; + live_triangles[c]--; + + // update cache info + // if vertex is not in cache, put it in cache + if (timestamp - cache_timestamps[a] > cache_size) + cache_timestamps[a] = timestamp++; + + if (timestamp - cache_timestamps[b] > cache_size) + cache_timestamps[b] = timestamp++; + + if (timestamp - cache_timestamps[c] > cache_size) + cache_timestamps[c] = timestamp++; + + // update emitted flags + emitted_flags[triangle] = true; + } + } + + // next candidates are the ones we pushed to dead-end stack just now + const unsigned int* next_candidates_end = &dead_end[0] + dead_end_top; + + // get next vertex + current_vertex = getNextVertexNeighbour(next_candidates_begin, next_candidates_end, &live_triangles[0], &cache_timestamps[0], timestamp, cache_size); + + if (current_vertex == ~0u) + { + current_vertex = getNextVertexDeadEnd(&dead_end[0], dead_end_top, input_cursor, &live_triangles[0], vertex_count); + } + } + + assert(output_triangle == face_count); +} diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp new file mode 100644 index 0000000000..784c9a13db --- /dev/null +++ b/thirdparty/meshoptimizer/vertexcodec.cpp @@ -0,0 +1,1265 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <string.h> + +// The block below auto-detects SIMD ISA that can be used on the target platform +#ifndef MESHOPTIMIZER_NO_SIMD + +// The SIMD implementation requires SSSE3, which can be enabled unconditionally through compiler settings +#if defined(__AVX__) || defined(__SSSE3__) +#define SIMD_SSE +#endif + +// An experimental implementation using AVX512 
instructions; it's only enabled when AVX512 is enabled through compiler settings +#if defined(__AVX512VBMI2__) && defined(__AVX512VBMI__) && defined(__AVX512VL__) && defined(__POPCNT__) +#undef SIMD_SSE +#define SIMD_AVX +#endif + +// MSVC supports compiling SSSE3 code regardless of compile options; we use a cpuid-based scalar fallback +#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) +#define SIMD_SSE +#define SIMD_FALLBACK +#endif + +// GCC 4.9+ and clang 3.8+ support targeting SIMD ISA from individual functions; we use a cpuid-based scalar fallback +#if !defined(SIMD_SSE) && !defined(SIMD_AVX) && ((defined(__clang__) && __clang_major__ * 100 + __clang_minor__ >= 308) || (defined(__GNUC__) && __GNUC__ * 100 + __GNUC_MINOR__ >= 409)) && (defined(__i386__) || defined(__x86_64__)) +#define SIMD_SSE +#define SIMD_FALLBACK +#define SIMD_TARGET __attribute__((target("ssse3"))) +#endif + +// GCC/clang define these when NEON support is available +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define SIMD_NEON +#endif + +// On MSVC, we assume that ARM builds always target NEON-capable devices +#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) +#define SIMD_NEON +#endif + +// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD +#if defined(__wasm_simd128__) +#define SIMD_WASM +#endif + +#ifndef SIMD_TARGET +#define SIMD_TARGET +#endif + +#endif // !MESHOPTIMIZER_NO_SIMD + +#ifdef SIMD_SSE +#include <tmmintrin.h> +#endif + +#if defined(SIMD_SSE) && defined(SIMD_FALLBACK) +#ifdef _MSC_VER +#include <intrin.h> // __cpuid +#else +#include <cpuid.h> // __cpuid +#endif +#endif + +#ifdef SIMD_AVX +#include <immintrin.h> +#endif + +#ifdef SIMD_NEON +#if defined(_MSC_VER) && defined(_M_ARM64) +#include <arm64_neon.h> +#else +#include <arm_neon.h> +#endif +#endif + +#ifdef SIMD_WASM +#include <wasm_simd128.h> +#endif + 
+#ifndef TRACE +#define TRACE 0 +#endif + +#if TRACE +#include <stdio.h> +#endif + +#ifdef SIMD_WASM +#define wasmx_splat_v32x4(v, i) wasm_v32x4_shuffle(v, v, i, i, i, i) +#define wasmx_unpacklo_v8x16(a, b) wasm_v8x16_shuffle(a, b, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23) +#define wasmx_unpackhi_v8x16(a, b) wasm_v8x16_shuffle(a, b, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31) +#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11) +#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15) +#define wasmx_unpacklo_v64x2(a, b) wasm_v64x2_shuffle(a, b, 0, 2) +#define wasmx_unpackhi_v64x2(a, b) wasm_v64x2_shuffle(a, b, 1, 3) +#endif + +namespace meshopt +{ + +const unsigned char kVertexHeader = 0xa0; + +static int gEncodeVertexVersion = 0; + +const size_t kVertexBlockSizeBytes = 8192; +const size_t kVertexBlockMaxSize = 256; +const size_t kByteGroupSize = 16; +const size_t kByteGroupDecodeLimit = 24; +const size_t kTailMaxSize = 32; + +static size_t getVertexBlockSize(size_t vertex_size) +{ + // make sure the entire block fits into the scratch buffer + size_t result = kVertexBlockSizeBytes / vertex_size; + + // align to byte group size; we encode each byte as a byte group + // if vertex block is misaligned, it results in wasted bytes, so just truncate the block size + result &= ~(kByteGroupSize - 1); + + return (result < kVertexBlockMaxSize) ? 
result : kVertexBlockMaxSize; +} + +inline unsigned char zigzag8(unsigned char v) +{ + return ((signed char)(v) >> 7) ^ (v << 1); +} + +inline unsigned char unzigzag8(unsigned char v) +{ + return -(v & 1) ^ (v >> 1); +} + +#if TRACE +struct Stats +{ + size_t size; + size_t header; + size_t bitg[4]; + size_t bitb[4]; +}; + +Stats* bytestats; +Stats vertexstats[256]; +#endif + +static bool encodeBytesGroupZero(const unsigned char* buffer) +{ + for (size_t i = 0; i < kByteGroupSize; ++i) + if (buffer[i]) + return false; + + return true; +} + +static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits) +{ + assert(bits >= 1 && bits <= 8); + + if (bits == 1) + return encodeBytesGroupZero(buffer) ? 0 : size_t(-1); + + if (bits == 8) + return kByteGroupSize; + + size_t result = kByteGroupSize * bits / 8; + + unsigned char sentinel = (1 << bits) - 1; + + for (size_t i = 0; i < kByteGroupSize; ++i) + result += buffer[i] >= sentinel; + + return result; +} + +static unsigned char* encodeBytesGroup(unsigned char* data, const unsigned char* buffer, int bits) +{ + assert(bits >= 1 && bits <= 8); + + if (bits == 1) + return data; + + if (bits == 8) + { + memcpy(data, buffer, kByteGroupSize); + return data + kByteGroupSize; + } + + size_t byte_size = 8 / bits; + assert(kByteGroupSize % byte_size == 0); + + // fixed portion: bits bits for each value + // variable portion: full byte for each out-of-range value (using 1...1 as sentinel) + unsigned char sentinel = (1 << bits) - 1; + + for (size_t i = 0; i < kByteGroupSize; i += byte_size) + { + unsigned char byte = 0; + + for (size_t k = 0; k < byte_size; ++k) + { + unsigned char enc = (buffer[i + k] >= sentinel) ? 
sentinel : buffer[i + k]; + + byte <<= bits; + byte |= enc; + } + + *data++ = byte; + } + + for (size_t i = 0; i < kByteGroupSize; ++i) + { + if (buffer[i] >= sentinel) + { + *data++ = buffer[i]; + } + } + + return data; +} + +static unsigned char* encodeBytes(unsigned char* data, unsigned char* data_end, const unsigned char* buffer, size_t buffer_size) +{ + assert(buffer_size % kByteGroupSize == 0); + + unsigned char* header = data; + + // round number of groups to 4 to get number of header bytes + size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; + + if (size_t(data_end - data) < header_size) + return 0; + + data += header_size; + + memset(header, 0, header_size); + + for (size_t i = 0; i < buffer_size; i += kByteGroupSize) + { + if (size_t(data_end - data) < kByteGroupDecodeLimit) + return 0; + + int best_bits = 8; + size_t best_size = encodeBytesGroupMeasure(buffer + i, 8); + + for (int bits = 1; bits < 8; bits *= 2) + { + size_t size = encodeBytesGroupMeasure(buffer + i, bits); + + if (size < best_size) + { + best_bits = bits; + best_size = size; + } + } + + int bitslog2 = (best_bits == 1) ? 0 : (best_bits == 2) ? 1 : (best_bits == 4) ? 
2 : 3; + assert((1 << bitslog2) == best_bits); + + size_t header_offset = i / kByteGroupSize; + + header[header_offset / 4] |= bitslog2 << ((header_offset % 4) * 2); + + unsigned char* next = encodeBytesGroup(data, buffer + i, best_bits); + + assert(data + best_size == next); + data = next; + +#if TRACE > 1 + bytestats->bitg[bitslog2]++; + bytestats->bitb[bitslog2] += best_size; +#endif + } + +#if TRACE > 1 + bytestats->header += header_size; +#endif + + return data; +} + +static unsigned char* encodeVertexBlock(unsigned char* data, unsigned char* data_end, const unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +{ + assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + + unsigned char buffer[kVertexBlockMaxSize]; + assert(sizeof(buffer) % kByteGroupSize == 0); + + // we sometimes encode elements we didn't fill when rounding to kByteGroupSize + memset(buffer, 0, sizeof(buffer)); + + for (size_t k = 0; k < vertex_size; ++k) + { + size_t vertex_offset = k; + + unsigned char p = last_vertex[k]; + + for (size_t i = 0; i < vertex_count; ++i) + { + buffer[i] = zigzag8(vertex_data[vertex_offset] - p); + + p = vertex_data[vertex_offset]; + + vertex_offset += vertex_size; + } + +#if TRACE + const unsigned char* olddata = data; + bytestats = &vertexstats[k]; +#endif + + data = encodeBytes(data, data_end, buffer, (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1)); + if (!data) + return 0; + +#if TRACE + bytestats = 0; + vertexstats[k].size += data - olddata; +#endif + } + + memcpy(last_vertex, &vertex_data[vertex_size * (vertex_count - 1)], vertex_size); + + return data; +} + +#if defined(SIMD_FALLBACK) || (!defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_AVX)) +static const unsigned char* decodeBytesGroup(const unsigned char* data, unsigned char* buffer, int bitslog2) +{ +#define READ() byte = *data++ +#define NEXT(bits) enc = byte >> (8 - bits), byte <<= bits, encv = *data_var, *buffer++ = 
(enc == (1 << bits) - 1) ? encv : enc, data_var += (enc == (1 << bits) - 1) + + unsigned char byte, enc, encv; + const unsigned char* data_var; + + switch (bitslog2) + { + case 0: + memset(buffer, 0, kByteGroupSize); + return data; + case 1: + data_var = data + 4; + + // 4 groups with 4 2-bit values in each byte + READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); + READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); + READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); + READ(), NEXT(2), NEXT(2), NEXT(2), NEXT(2); + + return data_var; + case 2: + data_var = data + 8; + + // 8 groups with 2 4-bit values in each byte + READ(), NEXT(4), NEXT(4); + READ(), NEXT(4), NEXT(4); + READ(), NEXT(4), NEXT(4); + READ(), NEXT(4), NEXT(4); + READ(), NEXT(4), NEXT(4); + READ(), NEXT(4), NEXT(4); + READ(), NEXT(4), NEXT(4); + READ(), NEXT(4), NEXT(4); + + return data_var; + case 3: + memcpy(buffer, data, kByteGroupSize); + return data + kByteGroupSize; + default: + assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + return data; + } + +#undef READ +#undef NEXT +} + +static const unsigned char* decodeBytes(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +{ + assert(buffer_size % kByteGroupSize == 0); + + const unsigned char* header = data; + + // round number of groups to 4 to get number of header bytes + size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; + + if (size_t(data_end - data) < header_size) + return 0; + + data += header_size; + + for (size_t i = 0; i < buffer_size; i += kByteGroupSize) + { + if (size_t(data_end - data) < kByteGroupDecodeLimit) + return 0; + + size_t header_offset = i / kByteGroupSize; + + int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; + + data = decodeBytesGroup(data, buffer + i, bitslog2); + } + + return data; +} + +static const unsigned char* decodeVertexBlock(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t 
vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +{ + assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + + unsigned char buffer[kVertexBlockMaxSize]; + unsigned char transposed[kVertexBlockSizeBytes]; + + size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + + for (size_t k = 0; k < vertex_size; ++k) + { + data = decodeBytes(data, data_end, buffer, vertex_count_aligned); + if (!data) + return 0; + + size_t vertex_offset = k; + + unsigned char p = last_vertex[k]; + + for (size_t i = 0; i < vertex_count; ++i) + { + unsigned char v = unzigzag8(buffer[i]) + p; + + transposed[vertex_offset] = v; + p = v; + + vertex_offset += vertex_size; + } + } + + memcpy(vertex_data, transposed, vertex_count * vertex_size); + + memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size); + + return data; +} +#endif + +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) +static unsigned char kDecodeBytesGroupShuffle[256][8]; +static unsigned char kDecodeBytesGroupCount[256]; + +#ifdef __wasm__ +__attribute__((cold)) // this saves 500 bytes in the output binary - we don't need to vectorize this loop! +#endif +static bool +decodeBytesGroupBuildTables() +{ + for (int mask = 0; mask < 256; ++mask) + { + unsigned char shuffle[8]; + unsigned char count = 0; + + for (int i = 0; i < 8; ++i) + { + int maski = (mask >> i) & 1; + shuffle[i] = maski ? 
count : 0x80; + count += (unsigned char)(maski); + } + + memcpy(kDecodeBytesGroupShuffle[mask], shuffle, 8); + kDecodeBytesGroupCount[mask] = count; + } + + return true; +} + +static bool gDecodeBytesGroupInitialized = decodeBytesGroupBuildTables(); +#endif + +#ifdef SIMD_SSE +SIMD_TARGET +static __m128i decodeShuffleMask(unsigned char mask0, unsigned char mask1) +{ + __m128i sm0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask0])); + __m128i sm1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&kDecodeBytesGroupShuffle[mask1])); + __m128i sm1off = _mm_set1_epi8(kDecodeBytesGroupCount[mask0]); + + __m128i sm1r = _mm_add_epi8(sm1, sm1off); + + return _mm_unpacklo_epi64(sm0, sm1r); +} + +SIMD_TARGET +static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +{ + switch (bitslog2) + { + case 0: + { + __m128i result = _mm_setzero_si128(); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data; + } + + case 1: + { +#ifdef __GNUC__ + typedef int __attribute__((aligned(1))) unaligned_int; +#else + typedef int unaligned_int; +#endif + + __m128i sel2 = _mm_cvtsi32_si128(*reinterpret_cast<const unaligned_int*>(data)); + __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 4)); + + __m128i sel22 = _mm_unpacklo_epi8(_mm_srli_epi16(sel2, 4), sel2); + __m128i sel2222 = _mm_unpacklo_epi8(_mm_srli_epi16(sel22, 2), sel22); + __m128i sel = _mm_and_si128(sel2222, _mm_set1_epi8(3)); + + __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(3)); + int mask16 = _mm_movemask_epi8(mask); + unsigned char mask0 = (unsigned char)(mask16 & 255); + unsigned char mask1 = (unsigned char)(mask16 >> 8); + + __m128i shuf = decodeShuffleMask(mask0, mask1); + + __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 4 + kDecodeBytesGroupCount[mask0] + 
kDecodeBytesGroupCount[mask1]; + } + + case 2: + { + __m128i sel4 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data)); + __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 8)); + + __m128i sel44 = _mm_unpacklo_epi8(_mm_srli_epi16(sel4, 4), sel4); + __m128i sel = _mm_and_si128(sel44, _mm_set1_epi8(15)); + + __m128i mask = _mm_cmpeq_epi8(sel, _mm_set1_epi8(15)); + int mask16 = _mm_movemask_epi8(mask); + unsigned char mask0 = (unsigned char)(mask16 & 255); + unsigned char mask1 = (unsigned char)(mask16 >> 8); + + __m128i shuf = decodeShuffleMask(mask0, mask1); + + __m128i result = _mm_or_si128(_mm_shuffle_epi8(rest, shuf), _mm_andnot_si128(mask, sel)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + + case 3: + { + __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 16; + } + + default: + assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + return data; + } +} +#endif + +#ifdef SIMD_AVX +static const __m128i decodeBytesGroupConfig[] = { + _mm_set1_epi8(3), + _mm_set1_epi8(15), + _mm_setr_epi8(6, 4, 2, 0, 14, 12, 10, 8, 22, 20, 18, 16, 30, 28, 26, 24), + _mm_setr_epi8(4, 0, 12, 8, 20, 16, 28, 24, 36, 32, 44, 40, 52, 48, 60, 56), +}; + +static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +{ + switch (bitslog2) + { + case 0: + { + __m128i result = _mm_setzero_si128(); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data; + } + + case 1: + case 2: + { + const unsigned char* skip = data + (bitslog2 << 2); + + __m128i selb = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(data)); + __m128i rest = _mm_loadu_si128(reinterpret_cast<const __m128i*>(skip)); + + __m128i sent = decodeBytesGroupConfig[bitslog2 - 1]; + __m128i 
ctrl = decodeBytesGroupConfig[bitslog2 + 1]; + + __m128i selw = _mm_shuffle_epi32(selb, 0x44); + __m128i sel = _mm_and_si128(sent, _mm_multishift_epi64_epi8(ctrl, selw)); + __mmask16 mask16 = _mm_cmp_epi8_mask(sel, sent, _MM_CMPINT_EQ); + + __m128i result = _mm_mask_expand_epi8(sel, mask16, rest); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return skip + _mm_popcnt_u32(mask16); + } + + case 3: + { + __m128i result = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); + + return data + 16; + } + + default: + assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + return data; + } +} +#endif + +#ifdef SIMD_NEON +static uint8x16_t shuffleBytes(unsigned char mask0, unsigned char mask1, uint8x8_t rest0, uint8x8_t rest1) +{ + uint8x8_t sm0 = vld1_u8(kDecodeBytesGroupShuffle[mask0]); + uint8x8_t sm1 = vld1_u8(kDecodeBytesGroupShuffle[mask1]); + + uint8x8_t r0 = vtbl1_u8(rest0, sm0); + uint8x8_t r1 = vtbl1_u8(rest1, sm1); + + return vcombine_u8(r0, r1); +} + +static void neonMoveMask(uint8x16_t mask, unsigned char& mask0, unsigned char& mask1) +{ + static const unsigned char byte_mask_data[16] = {1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128}; + + uint8x16_t byte_mask = vld1q_u8(byte_mask_data); + uint8x16_t masked = vandq_u8(mask, byte_mask); + +#ifdef __aarch64__ + // aarch64 has horizontal sums; MSVC doesn't expose this via arm64_neon.h so this path is exclusive to clang/gcc + mask0 = vaddv_u8(vget_low_u8(masked)); + mask1 = vaddv_u8(vget_high_u8(masked)); +#else + // we need horizontal sums of each half of masked, which can be done in 3 steps (yielding sums of sizes 2, 4, 8) + uint8x8_t sum1 = vpadd_u8(vget_low_u8(masked), vget_high_u8(masked)); + uint8x8_t sum2 = vpadd_u8(sum1, sum1); + uint8x8_t sum3 = vpadd_u8(sum2, sum2); + + mask0 = vget_lane_u8(sum3, 0); + mask1 = vget_lane_u8(sum3, 1); +#endif +} + +static const unsigned char* 
decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +{ + switch (bitslog2) + { + case 0: + { + uint8x16_t result = vdupq_n_u8(0); + + vst1q_u8(buffer, result); + + return data; + } + + case 1: + { + uint8x8_t sel2 = vld1_u8(data); + uint8x8_t sel22 = vzip_u8(vshr_n_u8(sel2, 4), sel2).val[0]; + uint8x8x2_t sel2222 = vzip_u8(vshr_n_u8(sel22, 2), sel22); + uint8x16_t sel = vandq_u8(vcombine_u8(sel2222.val[0], sel2222.val[1]), vdupq_n_u8(3)); + + uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(3)); + unsigned char mask0, mask1; + neonMoveMask(mask, mask0, mask1); + + uint8x8_t rest0 = vld1_u8(data + 4); + uint8x8_t rest1 = vld1_u8(data + 4 + kDecodeBytesGroupCount[mask0]); + + uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel); + + vst1q_u8(buffer, result); + + return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + + case 2: + { + uint8x8_t sel4 = vld1_u8(data); + uint8x8x2_t sel44 = vzip_u8(vshr_n_u8(sel4, 4), vand_u8(sel4, vdup_n_u8(15))); + uint8x16_t sel = vcombine_u8(sel44.val[0], sel44.val[1]); + + uint8x16_t mask = vceqq_u8(sel, vdupq_n_u8(15)); + unsigned char mask0, mask1; + neonMoveMask(mask, mask0, mask1); + + uint8x8_t rest0 = vld1_u8(data + 8); + uint8x8_t rest1 = vld1_u8(data + 8 + kDecodeBytesGroupCount[mask0]); + + uint8x16_t result = vbslq_u8(mask, shuffleBytes(mask0, mask1, rest0, rest1), sel); + + vst1q_u8(buffer, result); + + return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + + case 3: + { + uint8x16_t result = vld1q_u8(data); + + vst1q_u8(buffer, result); + + return data + 16; + } + + default: + assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + return data; + } +} +#endif + +#ifdef SIMD_WASM +SIMD_TARGET +static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1) +{ + v128_t sm0 = wasm_v128_load(&kDecodeBytesGroupShuffle[mask0]); + v128_t sm1 = 
wasm_v128_load(&kDecodeBytesGroupShuffle[mask1]); + + v128_t sm1off = wasm_v128_load(&kDecodeBytesGroupCount[mask0]); + sm1off = wasm_v8x16_shuffle(sm1off, sm1off, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + v128_t sm1r = wasm_i8x16_add(sm1, sm1off); + + return wasmx_unpacklo_v64x2(sm0, sm1r); +} + +SIMD_TARGET +static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1) +{ + v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3); + + uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull; + uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull; + + // TODO: This can use v8x16_bitmask in the future + uint64_t mask_2 = mask_1a | mask_1b; + uint64_t mask_4 = mask_2 | (mask_2 >> 16); + uint64_t mask_8 = mask_4 | (mask_4 >> 8); + + mask0 = uint8_t(mask_8); + mask1 = uint8_t(mask_8 >> 32); +} + +SIMD_TARGET +static const unsigned char* decodeBytesGroupSimd(const unsigned char* data, unsigned char* buffer, int bitslog2) +{ + unsigned char byte, enc, encv; + const unsigned char* data_var; + + switch (bitslog2) + { + case 0: + { + v128_t result = wasm_i8x16_splat(0); + + wasm_v128_store(buffer, result); + + return data; + } + + case 1: + { + v128_t sel2 = wasm_v128_load(data); + v128_t rest = wasm_v128_load(data + 4); + + v128_t sel22 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel2, 4), sel2); + v128_t sel2222 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel22, 2), sel22); + v128_t sel = wasm_v128_and(sel2222, wasm_i8x16_splat(3)); + + v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(3)); + + unsigned char mask0, mask1; + wasmMoveMask(mask, mask0, mask1); + + v128_t shuf = decodeShuffleMask(mask0, mask1); + + v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask); + + wasm_v128_store(buffer, result); + + return data + 4 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + + case 2: + { + v128_t sel4 = wasm_v128_load(data); + v128_t rest = wasm_v128_load(data + 
8); + + v128_t sel44 = wasmx_unpacklo_v8x16(wasm_i16x8_shr(sel4, 4), sel4); + v128_t sel = wasm_v128_and(sel44, wasm_i8x16_splat(15)); + + v128_t mask = wasm_i8x16_eq(sel, wasm_i8x16_splat(15)); + + unsigned char mask0, mask1; + wasmMoveMask(mask, mask0, mask1); + + v128_t shuf = decodeShuffleMask(mask0, mask1); + + v128_t result = wasm_v128_bitselect(wasm_v8x16_swizzle(rest, shuf), sel, mask); + + wasm_v128_store(buffer, result); + + return data + 8 + kDecodeBytesGroupCount[mask0] + kDecodeBytesGroupCount[mask1]; + } + + case 3: + { + v128_t result = wasm_v128_load(data); + + wasm_v128_store(buffer, result); + + return data + 16; + } + + default: + assert(!"Unexpected bit length"); // unreachable since bitslog2 is a 2-bit value + return data; + } +} +#endif + +#if defined(SIMD_SSE) || defined(SIMD_AVX) +SIMD_TARGET +static void transpose8(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3) +{ + __m128i t0 = _mm_unpacklo_epi8(x0, x1); + __m128i t1 = _mm_unpackhi_epi8(x0, x1); + __m128i t2 = _mm_unpacklo_epi8(x2, x3); + __m128i t3 = _mm_unpackhi_epi8(x2, x3); + + x0 = _mm_unpacklo_epi16(t0, t2); + x1 = _mm_unpackhi_epi16(t0, t2); + x2 = _mm_unpacklo_epi16(t1, t3); + x3 = _mm_unpackhi_epi16(t1, t3); +} + +SIMD_TARGET +static __m128i unzigzag8(__m128i v) +{ + __m128i xl = _mm_sub_epi8(_mm_setzero_si128(), _mm_and_si128(v, _mm_set1_epi8(1))); + __m128i xr = _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(127)); + + return _mm_xor_si128(xl, xr); +} +#endif + +#ifdef SIMD_NEON +static void transpose8(uint8x16_t& x0, uint8x16_t& x1, uint8x16_t& x2, uint8x16_t& x3) +{ + uint8x16x2_t t01 = vzipq_u8(x0, x1); + uint8x16x2_t t23 = vzipq_u8(x2, x3); + + uint16x8x2_t x01 = vzipq_u16(vreinterpretq_u16_u8(t01.val[0]), vreinterpretq_u16_u8(t23.val[0])); + uint16x8x2_t x23 = vzipq_u16(vreinterpretq_u16_u8(t01.val[1]), vreinterpretq_u16_u8(t23.val[1])); + + x0 = vreinterpretq_u8_u16(x01.val[0]); + x1 = vreinterpretq_u8_u16(x01.val[1]); + x2 = vreinterpretq_u8_u16(x23.val[0]); + x3 
= vreinterpretq_u8_u16(x23.val[1]); +} + +static uint8x16_t unzigzag8(uint8x16_t v) +{ + uint8x16_t xl = vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(vandq_u8(v, vdupq_n_u8(1))))); + uint8x16_t xr = vshrq_n_u8(v, 1); + + return veorq_u8(xl, xr); +} +#endif + +#ifdef SIMD_WASM +SIMD_TARGET +static void transpose8(v128_t& x0, v128_t& x1, v128_t& x2, v128_t& x3) +{ + v128_t t0 = wasmx_unpacklo_v8x16(x0, x1); + v128_t t1 = wasmx_unpackhi_v8x16(x0, x1); + v128_t t2 = wasmx_unpacklo_v8x16(x2, x3); + v128_t t3 = wasmx_unpackhi_v8x16(x2, x3); + + x0 = wasmx_unpacklo_v16x8(t0, t2); + x1 = wasmx_unpackhi_v16x8(t0, t2); + x2 = wasmx_unpacklo_v16x8(t1, t3); + x3 = wasmx_unpackhi_v16x8(t1, t3); +} + +SIMD_TARGET +static v128_t unzigzag8(v128_t v) +{ + v128_t xl = wasm_i8x16_neg(wasm_v128_and(v, wasm_i8x16_splat(1))); + v128_t xr = wasm_u8x16_shr(v, 1); + + return wasm_v128_xor(xl, xr); +} +#endif + +#if defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) +SIMD_TARGET +static const unsigned char* decodeBytesSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* buffer, size_t buffer_size) +{ + assert(buffer_size % kByteGroupSize == 0); + assert(kByteGroupSize == 16); + + const unsigned char* header = data; + + // round number of groups to 4 to get number of header bytes + size_t header_size = (buffer_size / kByteGroupSize + 3) / 4; + + if (size_t(data_end - data) < header_size) + return 0; + + data += header_size; + + size_t i = 0; + + // fast-path: process 4 groups at a time, do a shared bounds check - each group reads <=24b + for (; i + kByteGroupSize * 4 <= buffer_size && size_t(data_end - data) >= kByteGroupDecodeLimit * 4; i += kByteGroupSize * 4) + { + size_t header_offset = i / kByteGroupSize; + unsigned char header_byte = header[header_offset / 4]; + + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 0, (header_byte >> 0) & 3); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 1, 
(header_byte >> 2) & 3); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 2, (header_byte >> 4) & 3); + data = decodeBytesGroupSimd(data, buffer + i + kByteGroupSize * 3, (header_byte >> 6) & 3); + } + + // slow-path: process remaining groups + for (; i < buffer_size; i += kByteGroupSize) + { + if (size_t(data_end - data) < kByteGroupDecodeLimit) + return 0; + + size_t header_offset = i / kByteGroupSize; + + int bitslog2 = (header[header_offset / 4] >> ((header_offset % 4) * 2)) & 3; + + data = decodeBytesGroupSimd(data, buffer + i, bitslog2); + } + + return data; +} + +SIMD_TARGET +static const unsigned char* decodeVertexBlockSimd(const unsigned char* data, const unsigned char* data_end, unsigned char* vertex_data, size_t vertex_count, size_t vertex_size, unsigned char last_vertex[256]) +{ + assert(vertex_count > 0 && vertex_count <= kVertexBlockMaxSize); + + unsigned char buffer[kVertexBlockMaxSize * 4]; + unsigned char transposed[kVertexBlockSizeBytes]; + + size_t vertex_count_aligned = (vertex_count + kByteGroupSize - 1) & ~(kByteGroupSize - 1); + + for (size_t k = 0; k < vertex_size; k += 4) + { + for (size_t j = 0; j < 4; ++j) + { + data = decodeBytesSimd(data, data_end, buffer + j * vertex_count_aligned, vertex_count_aligned); + if (!data) + return 0; + } + +#if defined(SIMD_SSE) || defined(SIMD_AVX) +#define TEMP __m128i +#define PREP() __m128i pi = _mm_cvtsi32_si128(*reinterpret_cast<const int*>(last_vertex + k)) +#define LOAD(i) __m128i r##i = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buffer + j + i * vertex_count_aligned)) +#define GRP4(i) t0 = _mm_shuffle_epi32(r##i, 0), t1 = _mm_shuffle_epi32(r##i, 1), t2 = _mm_shuffle_epi32(r##i, 2), t3 = _mm_shuffle_epi32(r##i, 3) +#define FIXD(i) t##i = pi = _mm_add_epi8(pi, t##i) +#define SAVE(i) *reinterpret_cast<int*>(savep) = _mm_cvtsi128_si32(t##i), savep += vertex_size +#endif + +#ifdef SIMD_NEON +#define TEMP uint8x8_t +#define PREP() uint8x8_t pi = 
vreinterpret_u8_u32(vld1_lane_u32(reinterpret_cast<uint32_t*>(last_vertex + k), vdup_n_u32(0), 0)) +#define LOAD(i) uint8x16_t r##i = vld1q_u8(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = vget_low_u8(r##i), t1 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t0), 1)), t2 = vget_high_u8(r##i), t3 = vreinterpret_u8_u32(vdup_lane_u32(vreinterpret_u32_u8(t2), 1)) +#define FIXD(i) t##i = pi = vadd_u8(pi, t##i) +#define SAVE(i) vst1_lane_u32(reinterpret_cast<uint32_t*>(savep), vreinterpret_u32_u8(t##i), 0), savep += vertex_size +#endif + +#ifdef SIMD_WASM +#define TEMP v128_t +#define PREP() v128_t pi = wasm_v128_load(last_vertex + k) +#define LOAD(i) v128_t r##i = wasm_v128_load(buffer + j + i * vertex_count_aligned) +#define GRP4(i) t0 = wasmx_splat_v32x4(r##i, 0), t1 = wasmx_splat_v32x4(r##i, 1), t2 = wasmx_splat_v32x4(r##i, 2), t3 = wasmx_splat_v32x4(r##i, 3) +#define FIXD(i) t##i = pi = wasm_i8x16_add(pi, t##i) +#define SAVE(i) *reinterpret_cast<int*>(savep) = wasm_i32x4_extract_lane(t##i, 0), savep += vertex_size +#endif + + PREP(); + + unsigned char* savep = transposed + k; + + for (size_t j = 0; j < vertex_count_aligned; j += 16) + { + LOAD(0); + LOAD(1); + LOAD(2); + LOAD(3); + + r0 = unzigzag8(r0); + r1 = unzigzag8(r1); + r2 = unzigzag8(r2); + r3 = unzigzag8(r3); + + transpose8(r0, r1, r2, r3); + + TEMP t0, t1, t2, t3; + + GRP4(0); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + GRP4(1); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + GRP4(2); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + + GRP4(3); + FIXD(0), FIXD(1), FIXD(2), FIXD(3); + SAVE(0), SAVE(1), SAVE(2), SAVE(3); + +#undef TEMP +#undef PREP +#undef LOAD +#undef GRP4 +#undef FIXD +#undef SAVE + } + } + + memcpy(vertex_data, transposed, vertex_count * vertex_size); + + memcpy(last_vertex, &transposed[vertex_size * (vertex_count - 1)], vertex_size); + + return data; +} +#endif + +#if 
defined(SIMD_SSE) && defined(SIMD_FALLBACK) +static unsigned int getCpuFeatures() +{ + int cpuinfo[4] = {}; +#ifdef _MSC_VER + __cpuid(cpuinfo, 1); +#else + __cpuid(1, cpuinfo[0], cpuinfo[1], cpuinfo[2], cpuinfo[3]); +#endif + return cpuinfo[2]; +} + +unsigned int cpuid = getCpuFeatures(); +#endif + +} // namespace meshopt + +size_t meshopt_encodeVertexBuffer(unsigned char* buffer, size_t buffer_size, const void* vertices, size_t vertex_count, size_t vertex_size) +{ + using namespace meshopt; + + assert(vertex_size > 0 && vertex_size <= 256); + assert(vertex_size % 4 == 0); + +#if TRACE + memset(vertexstats, 0, sizeof(vertexstats)); +#endif + + const unsigned char* vertex_data = static_cast<const unsigned char*>(vertices); + + unsigned char* data = buffer; + unsigned char* data_end = buffer + buffer_size; + + if (size_t(data_end - data) < 1 + vertex_size) + return 0; + + int version = gEncodeVertexVersion; + + *data++ = (unsigned char)(kVertexHeader | version); + + unsigned char first_vertex[256] = {}; + if (vertex_count > 0) + memcpy(first_vertex, vertex_data, vertex_size); + + unsigned char last_vertex[256] = {}; + memcpy(last_vertex, first_vertex, vertex_size); + + size_t vertex_block_size = getVertexBlockSize(vertex_size); + + size_t vertex_offset = 0; + + while (vertex_offset < vertex_count) + { + size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? vertex_block_size : vertex_count - vertex_offset; + + data = encodeVertexBlock(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex); + if (!data) + return 0; + + vertex_offset += block_size; + } + + size_t tail_size = vertex_size < kTailMaxSize ? 
kTailMaxSize : vertex_size; + + if (size_t(data_end - data) < tail_size) + return 0; + + // write first vertex to the end of the stream and pad it to 32 bytes; this is important to simplify bounds checks in decoder + if (vertex_size < kTailMaxSize) + { + memset(data, 0, kTailMaxSize - vertex_size); + data += kTailMaxSize - vertex_size; + } + + memcpy(data, first_vertex, vertex_size); + data += vertex_size; + + assert(data >= buffer + tail_size); + assert(data <= buffer + buffer_size); + +#if TRACE + size_t total_size = data - buffer; + + for (size_t k = 0; k < vertex_size; ++k) + { + const Stats& vsk = vertexstats[k]; + + printf("%2d: %d bytes\t%.1f%%\t%.1f bpv", int(k), int(vsk.size), double(vsk.size) / double(total_size) * 100, double(vsk.size) / double(vertex_count) * 8); + +#if TRACE > 1 + printf("\t\thdr %d bytes\tbit0 %d (%d bytes)\tbit1 %d (%d bytes)\tbit2 %d (%d bytes)\tbit3 %d (%d bytes)", + int(vsk.header), + int(vsk.bitg[0]), int(vsk.bitb[0]), + int(vsk.bitg[1]), int(vsk.bitb[1]), + int(vsk.bitg[2]), int(vsk.bitb[2]), + int(vsk.bitg[3]), int(vsk.bitb[3])); +#endif + + printf("\n"); + } +#endif + + return data - buffer; +} + +size_t meshopt_encodeVertexBufferBound(size_t vertex_count, size_t vertex_size) +{ + using namespace meshopt; + + assert(vertex_size > 0 && vertex_size <= 256); + assert(vertex_size % 4 == 0); + + size_t vertex_block_size = getVertexBlockSize(vertex_size); + size_t vertex_block_count = (vertex_count + vertex_block_size - 1) / vertex_block_size; + + size_t vertex_block_header_size = (vertex_block_size / kByteGroupSize + 3) / 4; + size_t vertex_block_data_size = vertex_block_size; + + size_t tail_size = vertex_size < kTailMaxSize ? 
kTailMaxSize : vertex_size; + + return 1 + vertex_block_count * vertex_size * (vertex_block_header_size + vertex_block_data_size) + tail_size; +} + +void meshopt_encodeVertexVersion(int version) +{ + assert(unsigned(version) <= 0); + + meshopt::gEncodeVertexVersion = version; +} + +int meshopt_decodeVertexBuffer(void* destination, size_t vertex_count, size_t vertex_size, const unsigned char* buffer, size_t buffer_size) +{ + using namespace meshopt; + + assert(vertex_size > 0 && vertex_size <= 256); + assert(vertex_size % 4 == 0); + + const unsigned char* (*decode)(const unsigned char*, const unsigned char*, unsigned char*, size_t, size_t, unsigned char[256]) = 0; + +#if defined(SIMD_SSE) && defined(SIMD_FALLBACK) + decode = (cpuid & (1 << 9)) ? decodeVertexBlockSimd : decodeVertexBlock; +#elif defined(SIMD_SSE) || defined(SIMD_AVX) || defined(SIMD_NEON) || defined(SIMD_WASM) + decode = decodeVertexBlockSimd; +#else + decode = decodeVertexBlock; +#endif + +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) + assert(gDecodeBytesGroupInitialized); + (void)gDecodeBytesGroupInitialized; +#endif + + unsigned char* vertex_data = static_cast<unsigned char*>(destination); + + const unsigned char* data = buffer; + const unsigned char* data_end = buffer + buffer_size; + + if (size_t(data_end - data) < 1 + vertex_size) + return -2; + + unsigned char data_header = *data++; + + if ((data_header & 0xf0) != kVertexHeader) + return -1; + + int version = data_header & 0x0f; + if (version > 0) + return -1; + + unsigned char last_vertex[256]; + memcpy(last_vertex, data_end - vertex_size, vertex_size); + + size_t vertex_block_size = getVertexBlockSize(vertex_size); + + size_t vertex_offset = 0; + + while (vertex_offset < vertex_count) + { + size_t block_size = (vertex_offset + vertex_block_size < vertex_count) ? 
vertex_block_size : vertex_count - vertex_offset; + + data = decode(data, data_end, vertex_data + vertex_offset * vertex_size, block_size, vertex_size, last_vertex); + if (!data) + return -2; + + vertex_offset += block_size; + } + + size_t tail_size = vertex_size < kTailMaxSize ? kTailMaxSize : vertex_size; + + if (size_t(data_end - data) != tail_size) + return -3; + + return 0; +} + +#undef SIMD_NEON +#undef SIMD_SSE +#undef SIMD_AVX +#undef SIMD_WASM +#undef SIMD_FALLBACK +#undef SIMD_TARGET diff --git a/thirdparty/meshoptimizer/vertexfilter.cpp b/thirdparty/meshoptimizer/vertexfilter.cpp new file mode 100644 index 0000000000..e7ad2c9d39 --- /dev/null +++ b/thirdparty/meshoptimizer/vertexfilter.cpp @@ -0,0 +1,825 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <math.h> + +// The block below auto-detects SIMD ISA that can be used on the target platform +#ifndef MESHOPTIMIZER_NO_SIMD + +// The SIMD implementation requires SSE2, which can be enabled unconditionally through compiler settings +#if defined(__SSE2__) +#define SIMD_SSE +#endif + +// MSVC supports compiling SSE2 code regardless of compile options; we assume all 32-bit CPUs support SSE2 +#if !defined(SIMD_SSE) && defined(_MSC_VER) && !defined(__clang__) && (defined(_M_IX86) || defined(_M_X64)) +#define SIMD_SSE +#endif + +// GCC/clang define these when NEON support is available +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define SIMD_NEON +#endif + +// On MSVC, we assume that ARM builds always target NEON-capable devices +#if !defined(SIMD_NEON) && defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) +#define SIMD_NEON +#endif + +// When targeting Wasm SIMD we can't use runtime cpuid checks so we unconditionally enable SIMD +#if defined(__wasm_simd128__) +#define SIMD_WASM +#endif + +#endif // !MESHOPTIMIZER_NO_SIMD + +#ifdef SIMD_SSE +#include <emmintrin.h> +#include <stdint.h> +#endif + +#ifdef 
_MSC_VER +#include <intrin.h> +#endif + +#ifdef SIMD_NEON +#if defined(_MSC_VER) && defined(_M_ARM64) +#include <arm64_neon.h> +#else +#include <arm_neon.h> +#endif +#endif + +#ifdef SIMD_WASM +#include <wasm_simd128.h> +#endif + +#ifdef SIMD_WASM +#define wasmx_unpacklo_v16x8(a, b) wasm_v16x8_shuffle(a, b, 0, 8, 1, 9, 2, 10, 3, 11) +#define wasmx_unpackhi_v16x8(a, b) wasm_v16x8_shuffle(a, b, 4, 12, 5, 13, 6, 14, 7, 15) +#define wasmx_unziplo_v32x4(a, b) wasm_v32x4_shuffle(a, b, 0, 2, 4, 6) +#define wasmx_unziphi_v32x4(a, b) wasm_v32x4_shuffle(a, b, 1, 3, 5, 7) +#endif + +namespace meshopt +{ + +#if !defined(SIMD_SSE) && !defined(SIMD_NEON) && !defined(SIMD_WASM) +template <typename T> +static void decodeFilterOct(T* data, size_t count) +{ + const float max = float((1 << (sizeof(T) * 8 - 1)) - 1); + + for (size_t i = 0; i < count; ++i) + { + // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count + float x = float(data[i * 4 + 0]); + float y = float(data[i * 4 + 1]); + float z = float(data[i * 4 + 2]) - fabsf(x) - fabsf(y); + + // fixup octahedral coordinates for z<0 + float t = (z >= 0.f) ? 0.f : z; + + x += (x >= 0.f) ? t : -t; + y += (y >= 0.f) ? t : -t; + + // compute normal length & scale + float l = sqrtf(x * x + y * y + z * z); + float s = max / l; + + // rounded signed float->int + int xf = int(x * s + (x >= 0.f ? 0.5f : -0.5f)); + int yf = int(y * s + (y >= 0.f ? 0.5f : -0.5f)); + int zf = int(z * s + (z >= 0.f ? 0.5f : -0.5f)); + + data[i * 4 + 0] = T(xf); + data[i * 4 + 1] = T(yf); + data[i * 4 + 2] = T(zf); + } +} + +static void decodeFilterQuat(short* data, size_t count) +{ + const float scale = 1.f / sqrtf(2.f); + + for (size_t i = 0; i < count; ++i) + { + // recover scale from the high byte of the component + int sf = data[i * 4 + 3] | 3; + float ss = scale / float(sf); + + // convert x/y/z to [-1..1] (scaled...) 
+ float x = float(data[i * 4 + 0]) * ss; + float y = float(data[i * 4 + 1]) * ss; + float z = float(data[i * 4 + 2]) * ss; + + // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors + float ww = 1.f - x * x - y * y - z * z; + float w = sqrtf(ww >= 0.f ? ww : 0.f); + + // rounded signed float->int + int xf = int(x * 32767.f + (x >= 0.f ? 0.5f : -0.5f)); + int yf = int(y * 32767.f + (y >= 0.f ? 0.5f : -0.5f)); + int zf = int(z * 32767.f + (z >= 0.f ? 0.5f : -0.5f)); + int wf = int(w * 32767.f + 0.5f); + + int qc = data[i * 4 + 3] & 3; + + // output order is dictated by input index + data[i * 4 + ((qc + 1) & 3)] = short(xf); + data[i * 4 + ((qc + 2) & 3)] = short(yf); + data[i * 4 + ((qc + 3) & 3)] = short(zf); + data[i * 4 + ((qc + 0) & 3)] = short(wf); + } +} + +static void decodeFilterExp(unsigned int* data, size_t count) +{ + for (size_t i = 0; i < count; ++i) + { + unsigned int v = data[i]; + + // decode mantissa and exponent + int m = int(v << 8) >> 8; + int e = int(v) >> 24; + + union + { + float f; + unsigned int ui; + } u; + + // optimized version of ldexp(float(m), e) + u.ui = unsigned(e + 127) << 23; + u.f = u.f * float(m); + + data[i] = u.ui; + } +} +#endif + +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) +inline uint64_t rotateleft64(uint64_t v, int x) +{ +#if defined(_MSC_VER) && !defined(__clang__) + return _rotl64(v, x); +// Apple's Clang 8 is actually vanilla Clang 3.9, there we need to look for +// version 11 instead: https://en.wikipedia.org/wiki/Xcode#Toolchain_versions +#elif defined(__clang__) && ((!defined(__apple_build_version__) && __clang_major__ >= 8) || __clang_major__ >= 11) + return __builtin_rotateleft64(v, x); +#else + return (v << (x & 63)) | (v >> ((64 - x) & 63)); +#endif +} +#endif + +#ifdef SIMD_SSE +static void decodeFilterOctSimd(signed char* data, size_t count) +{ + const __m128 sign = _mm_set1_ps(-0.f); + + for (size_t i = 0; i < count; i += 4) + { + __m128i n4 = 
_mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i * 4])); + + // sign-extends each of x,y in [x y ? ?] with arithmetic shifts + __m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 24), 24); + __m128i yf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 24); + + // unpack z; note that z is unsigned so we technically don't need to sign extend it + __m128i zf = _mm_srai_epi32(_mm_slli_epi32(n4, 8), 24); + + // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count + __m128 x = _mm_cvtepi32_ps(xf); + __m128 y = _mm_cvtepi32_ps(yf); + __m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y))); + + // fixup octahedral coordinates for z<0 + __m128 t = _mm_min_ps(z, _mm_setzero_ps()); + + x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign))); + y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign))); + + // compute normal length & scale + __m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))); + __m128 s = _mm_mul_ps(_mm_set1_ps(127.f), _mm_rsqrt_ps(ll)); + + // rounded signed float->int + __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s)); + __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s)); + __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s)); + + // combine xr/yr/zr into final value + __m128i res = _mm_and_si128(n4, _mm_set1_epi32(0xff000000)); + res = _mm_or_si128(res, _mm_and_si128(xr, _mm_set1_epi32(0xff))); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(yr, _mm_set1_epi32(0xff)), 8)); + res = _mm_or_si128(res, _mm_slli_epi32(_mm_and_si128(zr, _mm_set1_epi32(0xff)), 16)); + + _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[i * 4]), res); + } +} + +static void decodeFilterOctSimd(short* data, size_t count) +{ + const __m128 sign = _mm_set1_ps(-0.f); + + for (size_t i = 0; i < count; i += 4) + { + __m128 n4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4])); + __m128 n4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4])); + + // gather both x/y 
16-bit pairs in each 32-bit lane + __m128i n4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(2, 0, 2, 0))); + + // sign-extends each of x,y in [x y] with arithmetic shifts + __m128i xf = _mm_srai_epi32(_mm_slli_epi32(n4, 16), 16); + __m128i yf = _mm_srai_epi32(n4, 16); + + // unpack z; note that z is unsigned so we don't need to sign extend it + __m128i z4 = _mm_castps_si128(_mm_shuffle_ps(n4_0, n4_1, _MM_SHUFFLE(3, 1, 3, 1))); + __m128i zf = _mm_and_si128(z4, _mm_set1_epi32(0x7fff)); + + // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count + __m128 x = _mm_cvtepi32_ps(xf); + __m128 y = _mm_cvtepi32_ps(yf); + __m128 z = _mm_sub_ps(_mm_cvtepi32_ps(zf), _mm_add_ps(_mm_andnot_ps(sign, x), _mm_andnot_ps(sign, y))); + + // fixup octahedral coordinates for z<0 + __m128 t = _mm_min_ps(z, _mm_setzero_ps()); + + x = _mm_add_ps(x, _mm_xor_ps(t, _mm_and_ps(x, sign))); + y = _mm_add_ps(y, _mm_xor_ps(t, _mm_and_ps(y, sign))); + + // compute normal length & scale + __m128 ll = _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z))); + __m128 s = _mm_div_ps(_mm_set1_ps(32767.f), _mm_sqrt_ps(ll)); + + // rounded signed float->int + __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s)); + __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s)); + __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s)); + + // mix x/z and y/0 to make 16-bit unpack easier + __m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16)); + __m128i y0r = _mm_and_si128(yr, _mm_set1_epi32(0xffff)); + + // pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w + __m128i res_0 = _mm_unpacklo_epi16(xzr, y0r); + __m128i res_1 = _mm_unpackhi_epi16(xzr, y0r); + + // patch in .w + res_0 = _mm_or_si128(res_0, _mm_and_si128(_mm_castps_si128(n4_0), _mm_set1_epi64x(0xffff000000000000))); + res_1 = _mm_or_si128(res_1, _mm_and_si128(_mm_castps_si128(n4_1), _mm_set1_epi64x(0xffff000000000000))); + + 
_mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 0) * 4]), res_0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&data[(i + 2) * 4]), res_1); + } +} + +static void decodeFilterQuatSimd(short* data, size_t count) +{ + const float scale = 1.f / sqrtf(2.f); + + for (size_t i = 0; i < count; i += 4) + { + __m128 q4_0 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 0) * 4])); + __m128 q4_1 = _mm_loadu_ps(reinterpret_cast<float*>(&data[(i + 2) * 4])); + + // gather both x/y 16-bit pairs in each 32-bit lane + __m128i q4_xy = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(2, 0, 2, 0))); + __m128i q4_zc = _mm_castps_si128(_mm_shuffle_ps(q4_0, q4_1, _MM_SHUFFLE(3, 1, 3, 1))); + + // sign-extends each of x,y in [x y] with arithmetic shifts + __m128i xf = _mm_srai_epi32(_mm_slli_epi32(q4_xy, 16), 16); + __m128i yf = _mm_srai_epi32(q4_xy, 16); + __m128i zf = _mm_srai_epi32(_mm_slli_epi32(q4_zc, 16), 16); + __m128i cf = _mm_srai_epi32(q4_zc, 16); + + // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) + __m128i sf = _mm_or_si128(cf, _mm_set1_epi32(3)); + __m128 ss = _mm_div_ps(_mm_set1_ps(scale), _mm_cvtepi32_ps(sf)); + + // convert x/y/z to [-1..1] (scaled...) 
+ __m128 x = _mm_mul_ps(_mm_cvtepi32_ps(xf), ss); + __m128 y = _mm_mul_ps(_mm_cvtepi32_ps(yf), ss); + __m128 z = _mm_mul_ps(_mm_cvtepi32_ps(zf), ss); + + // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors + __m128 ww = _mm_sub_ps(_mm_set1_ps(1.f), _mm_add_ps(_mm_mul_ps(x, x), _mm_add_ps(_mm_mul_ps(y, y), _mm_mul_ps(z, z)))); + __m128 w = _mm_sqrt_ps(_mm_max_ps(ww, _mm_setzero_ps())); + + __m128 s = _mm_set1_ps(32767.f); + + // rounded signed float->int + __m128i xr = _mm_cvtps_epi32(_mm_mul_ps(x, s)); + __m128i yr = _mm_cvtps_epi32(_mm_mul_ps(y, s)); + __m128i zr = _mm_cvtps_epi32(_mm_mul_ps(z, s)); + __m128i wr = _mm_cvtps_epi32(_mm_mul_ps(w, s)); + + // mix x/z and w/y to make 16-bit unpack easier + __m128i xzr = _mm_or_si128(_mm_and_si128(xr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(zr, 16)); + __m128i wyr = _mm_or_si128(_mm_and_si128(wr, _mm_set1_epi32(0xffff)), _mm_slli_epi32(yr, 16)); + + // pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0) + __m128i res_0 = _mm_unpacklo_epi16(wyr, xzr); + __m128i res_1 = _mm_unpackhi_epi16(wyr, xzr); + + // store results to stack so that we can rotate using scalar instructions + uint64_t res[4]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(&res[0]), res_0); + _mm_storeu_si128(reinterpret_cast<__m128i*>(&res[2]), res_1); + + // rotate and store + uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]); + + out[0] = rotateleft64(res[0], data[(i + 0) * 4 + 3] << 4); + out[1] = rotateleft64(res[1], data[(i + 1) * 4 + 3] << 4); + out[2] = rotateleft64(res[2], data[(i + 2) * 4 + 3] << 4); + out[3] = rotateleft64(res[3], data[(i + 3) * 4 + 3] << 4); + } +} + +static void decodeFilterExpSimd(unsigned int* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + __m128i v = _mm_loadu_si128(reinterpret_cast<__m128i*>(&data[i])); + + // decode exponent into 2^x directly + __m128i ef = _mm_srai_epi32(v, 24); + __m128i es = _mm_slli_epi32(_mm_add_epi32(ef, 
_mm_set1_epi32(127)), 23); + + // decode 24-bit mantissa into floating-point value + __m128i mf = _mm_srai_epi32(_mm_slli_epi32(v, 8), 8); + __m128 m = _mm_cvtepi32_ps(mf); + + __m128 r = _mm_mul_ps(_mm_castsi128_ps(es), m); + + _mm_storeu_ps(reinterpret_cast<float*>(&data[i]), r); + } +} +#endif + +#if defined(SIMD_NEON) && !defined(__aarch64__) && !defined(_M_ARM64) +inline float32x4_t vsqrtq_f32(float32x4_t x) +{ + float32x4_t r = vrsqrteq_f32(x); + r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(r, x), r)); // refine rsqrt estimate + return vmulq_f32(r, x); +} + +inline float32x4_t vdivq_f32(float32x4_t x, float32x4_t y) +{ + float32x4_t r = vrecpeq_f32(y); + r = vmulq_f32(r, vrecpsq_f32(y, r)); // refine rcp estimate + return vmulq_f32(x, r); +} +#endif + +#ifdef SIMD_NEON +static void decodeFilterOctSimd(signed char* data, size_t count) +{ + const int32x4_t sign = vdupq_n_s32(0x80000000); + + for (size_t i = 0; i < count; i += 4) + { + int32x4_t n4 = vld1q_s32(reinterpret_cast<int32_t*>(&data[i * 4])); + + // sign-extends each of x,y in [x y ? ?] 
with arithmetic shifts + int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 24), 24); + int32x4_t yf = vshrq_n_s32(vshlq_n_s32(n4, 16), 24); + + // unpack z; note that z is unsigned so we technically don't need to sign extend it + int32x4_t zf = vshrq_n_s32(vshlq_n_s32(n4, 8), 24); + + // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count + float32x4_t x = vcvtq_f32_s32(xf); + float32x4_t y = vcvtq_f32_s32(yf); + float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y))); + + // fixup octahedral coordinates for z<0 + float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f)); + + x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign)))); + y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign)))); + + // compute normal length & scale + float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))); + float32x4_t rl = vrsqrteq_f32(ll); + float32x4_t s = vmulq_f32(vdupq_n_f32(127.f), rl); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const float32x4_t fsnap = vdupq_n_f32(3 << 22); + + int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap)); + int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap)); + int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap)); + + // combine xr/yr/zr into final value + int32x4_t res = vandq_s32(n4, vdupq_n_s32(0xff000000)); + res = vorrq_s32(res, vandq_s32(xr, vdupq_n_s32(0xff))); + res = vorrq_s32(res, vshlq_n_s32(vandq_s32(yr, vdupq_n_s32(0xff)), 8)); + res = vorrq_s32(res, vshlq_n_s32(vandq_s32(zr, vdupq_n_s32(0xff)), 16)); + + vst1q_s32(reinterpret_cast<int32_t*>(&data[i * 4]), res); + } +} + +static void 
decodeFilterOctSimd(short* data, size_t count) +{ + const int32x4_t sign = vdupq_n_s32(0x80000000); + + for (size_t i = 0; i < count; i += 4) + { + int32x4_t n4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4])); + int32x4_t n4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4])); + + // gather both x/y 16-bit pairs in each 32-bit lane + int32x4_t n4 = vuzpq_s32(n4_0, n4_1).val[0]; + + // sign-extends each of x,y in [x y] with arithmetic shifts + int32x4_t xf = vshrq_n_s32(vshlq_n_s32(n4, 16), 16); + int32x4_t yf = vshrq_n_s32(n4, 16); + + // unpack z; note that z is unsigned so we don't need to sign extend it + int32x4_t z4 = vuzpq_s32(n4_0, n4_1).val[1]; + int32x4_t zf = vandq_s32(z4, vdupq_n_s32(0x7fff)); + + // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count + float32x4_t x = vcvtq_f32_s32(xf); + float32x4_t y = vcvtq_f32_s32(yf); + float32x4_t z = vsubq_f32(vcvtq_f32_s32(zf), vaddq_f32(vabsq_f32(x), vabsq_f32(y))); + + // fixup octahedral coordinates for z<0 + float32x4_t t = vminq_f32(z, vdupq_n_f32(0.f)); + + x = vaddq_f32(x, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(x), sign)))); + y = vaddq_f32(y, vreinterpretq_f32_s32(veorq_s32(vreinterpretq_s32_f32(t), vandq_s32(vreinterpretq_s32_f32(y), sign)))); + + // compute normal length & scale + float32x4_t ll = vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z))); + float32x4_t rl = vrsqrteq_f32(ll); + rl = vmulq_f32(rl, vrsqrtsq_f32(vmulq_f32(rl, ll), rl)); // refine rsqrt estimate + float32x4_t s = vmulq_f32(vdupq_n_f32(32767.f), rl); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const float32x4_t fsnap = vdupq_n_f32(3 << 22); + + int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), 
fsnap)); + int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap)); + int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap)); + + // mix x/z and y/0 to make 16-bit unpack easier + int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16)); + int32x4_t y0r = vandq_s32(yr, vdupq_n_s32(0xffff)); + + // pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w + int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[0]); + int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(xzr), vreinterpretq_s16_s32(y0r)).val[1]); + + // patch in .w + res_0 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_0, res_0); + res_1 = vbslq_s32(vreinterpretq_u32_u64(vdupq_n_u64(0xffff000000000000)), n4_1, res_1); + + vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4]), res_0); + vst1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4]), res_1); + } +} + +static void decodeFilterQuatSimd(short* data, size_t count) +{ + const float scale = 1.f / sqrtf(2.f); + + for (size_t i = 0; i < count; i += 4) + { + int32x4_t q4_0 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 0) * 4])); + int32x4_t q4_1 = vld1q_s32(reinterpret_cast<int32_t*>(&data[(i + 2) * 4])); + + // gather both x/y 16-bit pairs in each 32-bit lane + int32x4_t q4_xy = vuzpq_s32(q4_0, q4_1).val[0]; + int32x4_t q4_zc = vuzpq_s32(q4_0, q4_1).val[1]; + + // sign-extends each of x,y in [x y] with arithmetic shifts + int32x4_t xf = vshrq_n_s32(vshlq_n_s32(q4_xy, 16), 16); + int32x4_t yf = vshrq_n_s32(q4_xy, 16); + int32x4_t zf = vshrq_n_s32(vshlq_n_s32(q4_zc, 16), 16); + int32x4_t cf = vshrq_n_s32(q4_zc, 16); + + // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) + int32x4_t sf = vorrq_s32(cf, vdupq_n_s32(3)); + float32x4_t ss = vdivq_f32(vdupq_n_f32(scale), vcvtq_f32_s32(sf)); + + // convert x/y/z to [-1..1] (scaled...) 
+ float32x4_t x = vmulq_f32(vcvtq_f32_s32(xf), ss); + float32x4_t y = vmulq_f32(vcvtq_f32_s32(yf), ss); + float32x4_t z = vmulq_f32(vcvtq_f32_s32(zf), ss); + + // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors + float32x4_t ww = vsubq_f32(vdupq_n_f32(1.f), vaddq_f32(vmulq_f32(x, x), vaddq_f32(vmulq_f32(y, y), vmulq_f32(z, z)))); + float32x4_t w = vsqrtq_f32(vmaxq_f32(ww, vdupq_n_f32(0.f))); + + float32x4_t s = vdupq_n_f32(32767.f); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const float32x4_t fsnap = vdupq_n_f32(3 << 22); + + int32x4_t xr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(x, s), fsnap)); + int32x4_t yr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(y, s), fsnap)); + int32x4_t zr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(z, s), fsnap)); + int32x4_t wr = vreinterpretq_s32_f32(vaddq_f32(vmulq_f32(w, s), fsnap)); + + // mix x/z and w/y to make 16-bit unpack easier + int32x4_t xzr = vorrq_s32(vandq_s32(xr, vdupq_n_s32(0xffff)), vshlq_n_s32(zr, 16)); + int32x4_t wyr = vorrq_s32(vandq_s32(wr, vdupq_n_s32(0xffff)), vshlq_n_s32(yr, 16)); + + // pack x/y/z/w using 16-bit unpacks; we pack wxyz by default (for qc=0) + int32x4_t res_0 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[0]); + int32x4_t res_1 = vreinterpretq_s32_s16(vzipq_s16(vreinterpretq_s16_s32(wyr), vreinterpretq_s16_s32(xzr)).val[1]); + + // rotate and store + uint64_t* out = (uint64_t*)&data[i * 4]; + + out[0] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 0), vgetq_lane_s32(cf, 0) << 4); + out[1] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_0), 1), vgetq_lane_s32(cf, 1) << 4); + out[2] = rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 0), vgetq_lane_s32(cf, 2) << 4); + out[3] = 
rotateleft64(vgetq_lane_u64(vreinterpretq_u64_s32(res_1), 1), vgetq_lane_s32(cf, 3) << 4); + } +} + +static void decodeFilterExpSimd(unsigned int* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + int32x4_t v = vld1q_s32(reinterpret_cast<int32_t*>(&data[i])); + + // decode exponent into 2^x directly + int32x4_t ef = vshrq_n_s32(v, 24); + int32x4_t es = vshlq_n_s32(vaddq_s32(ef, vdupq_n_s32(127)), 23); + + // decode 24-bit mantissa into floating-point value + int32x4_t mf = vshrq_n_s32(vshlq_n_s32(v, 8), 8); + float32x4_t m = vcvtq_f32_s32(mf); + + float32x4_t r = vmulq_f32(vreinterpretq_f32_s32(es), m); + + vst1q_f32(reinterpret_cast<float*>(&data[i]), r); + } +} +#endif + +#ifdef SIMD_WASM +static void decodeFilterOctSimd(signed char* data, size_t count) +{ + const v128_t sign = wasm_f32x4_splat(-0.f); + + for (size_t i = 0; i < count; i += 4) + { + v128_t n4 = wasm_v128_load(&data[i * 4]); + + // sign-extends each of x,y in [x y ? ?] with arithmetic shifts + v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 24), 24); + v128_t yf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 24); + + // unpack z; note that z is unsigned so we technically don't need to sign extend it + v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 8), 24); + + // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count + v128_t x = wasm_f32x4_convert_i32x4(xf); + v128_t y = wasm_f32x4_convert_i32x4(yf); + v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y))); + + // fixup octahedral coordinates for z<0 + // note: i32x4_min with 0 is equvalent to f32x4_min + v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0)); + + x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign))); + y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign))); + + // compute normal length & scale + v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))); + v128_t 
s = wasm_f32x4_div(wasm_f32x4_splat(127.f), wasm_f32x4_sqrt(ll)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 8 bits so we can omit the subtraction + const v128_t fsnap = wasm_f32x4_splat(3 << 22); + + v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap); + v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap); + v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap); + + // combine xr/yr/zr into final value + v128_t res = wasm_v128_and(n4, wasm_i32x4_splat(0xff000000)); + res = wasm_v128_or(res, wasm_v128_and(xr, wasm_i32x4_splat(0xff))); + res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(yr, wasm_i32x4_splat(0xff)), 8)); + res = wasm_v128_or(res, wasm_i32x4_shl(wasm_v128_and(zr, wasm_i32x4_splat(0xff)), 16)); + + wasm_v128_store(&data[i * 4], res); + } +} + +// 16-bit WASM variant: every element is 4 shorts [x y z w] and 4 elements are processed per iteration; x/y/z are overwritten in place with the reconstructed vector rescaled to length 32767, while w is carried over unchanged +static void decodeFilterOctSimd(short* data, size_t count) +{ + const v128_t sign = wasm_f32x4_splat(-0.f); + const v128_t zmask = wasm_i32x4_splat(0x7fff); + + for (size_t i = 0; i < count; i += 4) + { + v128_t n4_0 = wasm_v128_load(&data[(i + 0) * 4]); + v128_t n4_1 = wasm_v128_load(&data[(i + 2) * 4]); + + // gather both x/y 16-bit pairs in each 32-bit lane + v128_t n4 = wasmx_unziplo_v32x4(n4_0, n4_1); + + // sign-extends each of x,y in [x y] with arithmetic shifts + v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(n4, 16), 16); + v128_t yf = wasm_i32x4_shr(n4, 16); + + // unpack z; note that z is unsigned so we don't need to sign extend it + v128_t z4 = wasmx_unziphi_v32x4(n4_0, n4_1); + v128_t zf = wasm_v128_and(z4, zmask); + + // convert x and y to floats and reconstruct z; this assumes zf encodes 1.f at the same bit count + v128_t x = wasm_f32x4_convert_i32x4(xf); + v128_t y = wasm_f32x4_convert_i32x4(yf); + v128_t z = wasm_f32x4_sub(wasm_f32x4_convert_i32x4(zf), wasm_f32x4_add(wasm_f32x4_abs(x), wasm_f32x4_abs(y))); + + // fixup octahedral coordinates 
for z<0 + // note: i32x4_min with 0 is equivalent to f32x4_min + v128_t t = wasm_i32x4_min(z, wasm_i32x4_splat(0)); + + x = wasm_f32x4_add(x, wasm_v128_xor(t, wasm_v128_and(x, sign))); + y = wasm_f32x4_add(y, wasm_v128_xor(t, wasm_v128_and(y, sign))); + + // compute normal length & scale + v128_t ll = wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z))); + v128_t s = wasm_f32x4_div(wasm_f32x4_splat(32767.f), wasm_f32x4_sqrt(ll)); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const v128_t fsnap = wasm_f32x4_splat(3 << 22); + + v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap); + v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap); + v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap); + + // mix x/z and y/0 to make 16-bit unpack easier + v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16)); + v128_t y0r = wasm_v128_and(yr, wasm_i32x4_splat(0xffff)); + + // pack x/y/z using 16-bit unpacks; note that this has 0 where we should have .w + v128_t res_0 = wasmx_unpacklo_v16x8(xzr, y0r); + v128_t res_1 = wasmx_unpackhi_v16x8(xzr, y0r); + + // patch in .w + res_0 = wasm_v128_or(res_0, wasm_v128_and(n4_0, wasm_i64x2_splat(0xffff000000000000))); + res_1 = wasm_v128_or(res_1, wasm_v128_and(n4_1, wasm_i64x2_splat(0xffff000000000000))); + + wasm_v128_store(&data[(i + 0) * 4], res_0); + wasm_v128_store(&data[(i + 2) * 4], res_1); + } +} + +// WASM variant: every element is 4 shorts and 4 elements are processed per iteration; the first three shorts are scaled by a factor derived from the fourth, the missing component is rebuilt as sqrt(max(0, 1 - x*x - y*y - z*z)), all four are quantized to 16 bits, and each 64-bit result is rotated by an amount derived from the fourth input short before being stored in place +static void decodeFilterQuatSimd(short* data, size_t count) +{ + const float scale = 1.f / sqrtf(2.f); + + for (size_t i = 0; i < count; i += 4) + { + v128_t q4_0 = wasm_v128_load(&data[(i + 0) * 4]); + v128_t q4_1 = wasm_v128_load(&data[(i + 2) * 4]); + + // gather both x/y 16-bit pairs in each 32-bit lane + v128_t q4_xy = wasmx_unziplo_v32x4(q4_0, q4_1); + 
v128_t q4_zc = wasmx_unziphi_v32x4(q4_0, q4_1); + + // sign-extends each of x,y in [x y] with arithmetic shifts + v128_t xf = wasm_i32x4_shr(wasm_i32x4_shl(q4_xy, 16), 16); + v128_t yf = wasm_i32x4_shr(q4_xy, 16); + v128_t zf = wasm_i32x4_shr(wasm_i32x4_shl(q4_zc, 16), 16); + v128_t cf = wasm_i32x4_shr(q4_zc, 16); + + // get a floating-point scaler using zc with bottom 2 bits set to 1 (which represents 1.f) + v128_t sf = wasm_v128_or(cf, wasm_i32x4_splat(3)); + v128_t ss = wasm_f32x4_div(wasm_f32x4_splat(scale), wasm_f32x4_convert_i32x4(sf)); + + // convert x/y/z to [-1..1] (scaled...) + v128_t x = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(xf), ss); + v128_t y = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(yf), ss); + v128_t z = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(zf), ss); + + // reconstruct w as a square root; we clamp to 0.f to avoid NaN due to precision errors + // note: i32x4_max with 0 is equivalent to f32x4_max + v128_t ww = wasm_f32x4_sub(wasm_f32x4_splat(1.f), wasm_f32x4_add(wasm_f32x4_mul(x, x), wasm_f32x4_add(wasm_f32x4_mul(y, y), wasm_f32x4_mul(z, z)))); + v128_t w = wasm_f32x4_sqrt(wasm_i32x4_max(ww, wasm_i32x4_splat(0))); + + v128_t s = wasm_f32x4_splat(32767.f); + + // fast rounded signed float->int: addition triggers renormalization after which mantissa stores the integer value + // note: the result is offset by 0x4B40_0000, but we only need the low 16 bits so we can omit the subtraction + const v128_t fsnap = wasm_f32x4_splat(3 << 22); + + v128_t xr = wasm_f32x4_add(wasm_f32x4_mul(x, s), fsnap); + v128_t yr = wasm_f32x4_add(wasm_f32x4_mul(y, s), fsnap); + v128_t zr = wasm_f32x4_add(wasm_f32x4_mul(z, s), fsnap); + v128_t wr = wasm_f32x4_add(wasm_f32x4_mul(w, s), fsnap); + + // mix x/z and w/y to make 16-bit unpack easier + v128_t xzr = wasm_v128_or(wasm_v128_and(xr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(zr, 16)); + v128_t wyr = wasm_v128_or(wasm_v128_and(wr, wasm_i32x4_splat(0xffff)), wasm_i32x4_shl(yr, 16)); + + // pack x/y/z/w using 16-bit 
unpacks; we pack wxyz by default (for qc=0) + v128_t res_0 = wasmx_unpacklo_v16x8(wyr, xzr); + v128_t res_1 = wasmx_unpackhi_v16x8(wyr, xzr); + + // compute component index shifted left by 4 (and moved into i32x4 slot) + // TODO: volatile here works around LLVM mis-optimizing code; https://github.com/emscripten-core/emscripten/issues/11449 + volatile v128_t cm = wasm_i32x4_shl(cf, 4); + + // rotate and store + uint64_t* out = reinterpret_cast<uint64_t*>(&data[i * 4]); + + out[0] = rotateleft64(wasm_i64x2_extract_lane(res_0, 0), wasm_i32x4_extract_lane(cm, 0)); + out[1] = rotateleft64(wasm_i64x2_extract_lane(res_0, 1), wasm_i32x4_extract_lane(cm, 1)); + out[2] = rotateleft64(wasm_i64x2_extract_lane(res_1, 0), wasm_i32x4_extract_lane(cm, 2)); + out[3] = rotateleft64(wasm_i64x2_extract_lane(res_1, 1), wasm_i32x4_extract_lane(cm, 3)); + } +} + +// WASM variant: every 32-bit word holds a signed 8-bit exponent in its top byte and a signed 24-bit mantissa below it; 4 words are processed per iteration and each is replaced in place by the float value mantissa * 2^exponent +static void decodeFilterExpSimd(unsigned int* data, size_t count) +{ + for (size_t i = 0; i < count; i += 4) + { + v128_t v = wasm_v128_load(&data[i]); + + // decode exponent into 2^x directly + v128_t ef = wasm_i32x4_shr(v, 24); + v128_t es = wasm_i32x4_shl(wasm_i32x4_add(ef, wasm_i32x4_splat(127)), 23); + + // decode 24-bit mantissa into floating-point value + v128_t mf = wasm_i32x4_shr(wasm_i32x4_shl(v, 8), 8); + v128_t m = wasm_f32x4_convert_i32x4(mf); + + v128_t r = wasm_f32x4_mul(es, m); + + wasm_v128_store(&data[i], r); + } +} +#endif + +} // namespace meshopt + +// Public decode-filter entry points (Oct, Quat, Exp): each one asserts its layout requirements (vertex_count must be a multiple of 4, vertex_size is checked per filter), then decodes the buffer in place through the SIMD implementation when SIMD_SSE, SIMD_NEON or SIMD_WASM is defined and through the scalar implementation otherwise +void meshopt_decodeFilterOct(void* buffer, size_t vertex_count, size_t vertex_size) +{ + using namespace meshopt; + + assert(vertex_count % 4 == 0); + assert(vertex_size == 4 || vertex_size == 8); + +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) + if (vertex_size == 4) + decodeFilterOctSimd(static_cast<signed char*>(buffer), vertex_count); + else + decodeFilterOctSimd(static_cast<short*>(buffer), vertex_count); +#else + if (vertex_size == 4) + decodeFilterOct(static_cast<signed char*>(buffer), vertex_count); + else + 
decodeFilterOct(static_cast<short*>(buffer), vertex_count); +#endif +} + +void meshopt_decodeFilterQuat(void* buffer, size_t vertex_count, size_t vertex_size) +{ + using namespace meshopt; + + assert(vertex_count % 4 == 0); + assert(vertex_size == 8); + (void)vertex_size; + +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) + decodeFilterQuatSimd(static_cast<short*>(buffer), vertex_count); +#else + decodeFilterQuat(static_cast<short*>(buffer), vertex_count); +#endif +} + +void meshopt_decodeFilterExp(void* buffer, size_t vertex_count, size_t vertex_size) +{ + using namespace meshopt; + + assert(vertex_count % 4 == 0); + assert(vertex_size % 4 == 0); + +#if defined(SIMD_SSE) || defined(SIMD_NEON) || defined(SIMD_WASM) + decodeFilterExpSimd(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4)); +#else + decodeFilterExp(static_cast<unsigned int*>(buffer), vertex_count * (vertex_size / 4)); +#endif +} + +#undef SIMD_SSE +#undef SIMD_NEON +#undef SIMD_WASM diff --git a/thirdparty/meshoptimizer/vfetchanalyzer.cpp b/thirdparty/meshoptimizer/vfetchanalyzer.cpp new file mode 100644 index 0000000000..51dca873f8 --- /dev/null +++ b/thirdparty/meshoptimizer/vfetchanalyzer.cpp @@ -0,0 +1,58 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <string.h> + +// Replays the index buffer against a simulated direct-mapped cache (64-byte lines, 128 KB total) and returns the bytes fetched together with the overfetch ratio, i.e. bytes fetched divided by the byte size of the distinct vertices referenced (0 when no vertex is referenced) +meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const unsigned int* indices, size_t index_count, size_t vertex_count, size_t vertex_size) +{ + assert(index_count % 3 == 0); + assert(vertex_size > 0 && vertex_size <= 256); + + meshopt_Allocator allocator; + + meshopt_VertexFetchStatistics result = {}; + + unsigned char* vertex_visited = allocator.allocate<unsigned char>(vertex_count); + memset(vertex_visited, 0, vertex_count); + + const size_t kCacheLine = 64; + const size_t kCacheSize = 128 * 1024; + + // simple direct mapped cache; on typical mesh data this is close to 4-way 
cache, and this model is a gross approximation anyway + size_t cache[kCacheSize / kCacheLine] = {}; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + vertex_visited[index] = 1; + + size_t start_address = index * vertex_size; + size_t end_address = start_address + vertex_size; + + size_t start_tag = start_address / kCacheLine; + size_t end_tag = (end_address + kCacheLine - 1) / kCacheLine; + + assert(start_tag < end_tag); + + for (size_t tag = start_tag; tag < end_tag; ++tag) + { + size_t line = tag % (sizeof(cache) / sizeof(cache[0])); + + // we store +1 since cache is filled with 0 by default + result.bytes_fetched += (cache[line] != tag + 1) * kCacheLine; + cache[line] = tag + 1; + } + } + + size_t unique_vertex_count = 0; + + for (size_t i = 0; i < vertex_count; ++i) + unique_vertex_count += vertex_visited[i]; + + result.overfetch = unique_vertex_count == 0 ? 0 : float(result.bytes_fetched) / float(unique_vertex_count * vertex_size); + + return result; +} diff --git a/thirdparty/meshoptimizer/vfetchoptimizer.cpp b/thirdparty/meshoptimizer/vfetchoptimizer.cpp new file mode 100644 index 0000000000..465d6df5ca --- /dev/null +++ b/thirdparty/meshoptimizer/vfetchoptimizer.cpp @@ -0,0 +1,74 @@ +// This file is part of meshoptimizer library; see meshoptimizer.h for version/license details +#include "meshoptimizer.h" + +#include <assert.h> +#include <string.h> + +// Fills destination (vertex_count entries) with a remap table that numbers vertices in the order the index buffer first references them; entries for vertices that are never referenced stay ~0u; returns the number of referenced vertices +size_t meshopt_optimizeVertexFetchRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count) +{ + assert(index_count % 3 == 0); + + memset(destination, -1, vertex_count * sizeof(unsigned int)); + + unsigned int next_vertex = 0; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + if (destination[index] == ~0u) + { + destination[index] = next_vertex++; + } + } + + assert(next_vertex <= vertex_count); + + return next_vertex; +} + 
+size_t meshopt_optimizeVertexFetch(void* destination, unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size) +{ + assert(index_count % 3 == 0); + assert(vertex_size > 0 && vertex_size <= 256); + + meshopt_Allocator allocator; + + // support in-place optimization + if (destination == vertices) + { + unsigned char* vertices_copy = allocator.allocate<unsigned char>(vertex_count * vertex_size); + memcpy(vertices_copy, vertices, vertex_count * vertex_size); + vertices = vertices_copy; + } + + // build vertex remap table + unsigned int* vertex_remap = allocator.allocate<unsigned int>(vertex_count); + memset(vertex_remap, -1, vertex_count * sizeof(unsigned int)); + + unsigned int next_vertex = 0; + + for (size_t i = 0; i < index_count; ++i) + { + unsigned int index = indices[i]; + assert(index < vertex_count); + + unsigned int& remap = vertex_remap[index]; + + if (remap == ~0u) // vertex was not added to destination VB + { + // add vertex + memcpy(static_cast<unsigned char*>(destination) + next_vertex * vertex_size, static_cast<const unsigned char*>(vertices) + index * vertex_size, vertex_size); + + remap = next_vertex++; + } + + // modify indices in place + indices[i] = remap; + } + + assert(next_vertex <= vertex_count); + + return next_vertex; +} |