summaryrefslogtreecommitdiff
path: root/thirdparty/astcenc/astcenc_find_best_partitioning.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/astcenc/astcenc_find_best_partitioning.cpp')
-rw-r--r--thirdparty/astcenc/astcenc_find_best_partitioning.cpp780
1 files changed, 780 insertions, 0 deletions
diff --git a/thirdparty/astcenc/astcenc_find_best_partitioning.cpp b/thirdparty/astcenc/astcenc_find_best_partitioning.cpp
new file mode 100644
index 0000000000..ffde3c7060
--- /dev/null
+++ b/thirdparty/astcenc/astcenc_find_best_partitioning.cpp
@@ -0,0 +1,780 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2023 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions for finding best partition for a block.
+ *
+ * The partition search operates in two stages. The first pass uses kmeans clustering to group
+ * texels into an ideal partitioning for the requested partition count, and then compares that
+ * against the 1024 partitionings generated by the ASTC partition hash function. The generated
+ * partitions are then ranked by the number of texels in the wrong partition, compared to the ideal
+ * clustering. All 1024 partitions are tested for similarity and ranked, apart from duplicates and
+ * partitionings that actually generate fewer than the requested partition count, but only the top
+ * N candidates are actually put through a more detailed search. N is determined by the compressor
+ * quality preset.
+ *
+ * For the detailed search, each candidate is checked against two possible encoding methods:
+ *
+ * - The best partitioning assuming different chroma colors (RGB + RGB or RGB + delta endpoints).
+ * - The best partitioning assuming same chroma colors (RGB + scale endpoints).
+ *
+ * This is implemented by computing the compute mean color and dominant direction for each
+ * partition. This defines two lines, both of which go through the mean color value.
+ *
+ * - One line has a direction defined by the dominant direction; this is used to assess the error
+ * from using an uncorrelated color representation.
+ * - The other line goes through (0,0,0,1) and is used to assess the error from using a same chroma
+ * (RGB + scale) color representation.
+ *
+ * The best candidate is selected by computing the squared-errors that result from using these
+ * lines for endpoint selection.
+ */
+
+#include <limits>
+#include "astcenc_internal.h"
+
+/**
+ * @brief Pick some initial kmeans cluster centers.
+ *
+ * @param blk The image block color data to compress.
+ * @param texel_count The number of texels in the block.
+ * @param partition_count The number of partitions in the block.
+ * @param[out] cluster_centers The initial partition cluster center colors.
+ */
+static void kmeans_init(
+ const image_block& blk,
+ unsigned int texel_count,
+ unsigned int partition_count,
+ vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS]
+) {
+ promise(texel_count > 0);
+ promise(partition_count > 0);
+
+ unsigned int clusters_selected = 0;
+ float distances[BLOCK_MAX_TEXELS];
+
+ // Pick a random sample as first cluster center; 145897 from random.org
+ unsigned int sample = 145897 % texel_count;
+ vfloat4 center_color = blk.texel(sample);
+ cluster_centers[clusters_selected] = center_color;
+ clusters_selected++;
+
+ // Compute the distance to the first cluster center
+ float distance_sum = 0.0f;
+ for (unsigned int i = 0; i < texel_count; i++)
+ {
+ vfloat4 color = blk.texel(i);
+ vfloat4 diff = color - center_color;
+ float distance = dot_s(diff * diff, blk.channel_weight);
+ distance_sum += distance;
+ distances[i] = distance;
+ }
+
+ // More numbers from random.org for weighted-random center selection
+ const float cluster_cutoffs[9] {
+ 0.626220f, 0.932770f, 0.275454f,
+ 0.318558f, 0.240113f, 0.009190f,
+ 0.347661f, 0.731960f, 0.156391f
+ };
+
+ unsigned int cutoff = (clusters_selected - 1) + 3 * (partition_count - 2);
+
+ // Pick the remaining samples as needed
+ while (true)
+ {
+ // Pick the next center in a weighted-random fashion.
+ float summa = 0.0f;
+ float distance_cutoff = distance_sum * cluster_cutoffs[cutoff++];
+ for (sample = 0; sample < texel_count; sample++)
+ {
+ summa += distances[sample];
+ if (summa >= distance_cutoff)
+ {
+ break;
+ }
+ }
+
+ // Clamp to a valid range and store the selected cluster center
+ sample = astc::min(sample, texel_count - 1);
+
+ center_color = blk.texel(sample);
+ cluster_centers[clusters_selected++] = center_color;
+ if (clusters_selected >= partition_count)
+ {
+ break;
+ }
+
+ // Compute the distance to the new cluster center, keep the min dist
+ distance_sum = 0.0f;
+ for (unsigned int i = 0; i < texel_count; i++)
+ {
+ vfloat4 color = blk.texel(i);
+ vfloat4 diff = color - center_color;
+ float distance = dot_s(diff * diff, blk.channel_weight);
+ distance = astc::min(distance, distances[i]);
+ distance_sum += distance;
+ distances[i] = distance;
+ }
+ }
+}
+
+/**
+ * @brief Assign texels to clusters, based on a set of chosen center points.
+ *
+ * @param blk The image block color data to compress.
+ * @param texel_count The number of texels in the block.
+ * @param partition_count The number of partitions in the block.
+ * @param cluster_centers The partition cluster center colors.
+ * @param[out] partition_of_texel The partition assigned for each texel.
+ */
+static void kmeans_assign(
+ const image_block& blk,
+ unsigned int texel_count,
+ unsigned int partition_count,
+ const vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
+ uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
+) {
+ promise(texel_count > 0);
+ promise(partition_count > 0);
+
+ uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
+
+ // Find the best partition for every texel
+ for (unsigned int i = 0; i < texel_count; i++)
+ {
+ float best_distance = std::numeric_limits<float>::max();
+ unsigned int best_partition = 0;
+
+ vfloat4 color = blk.texel(i);
+ for (unsigned int j = 0; j < partition_count; j++)
+ {
+ vfloat4 diff = color - cluster_centers[j];
+ float distance = dot_s(diff * diff, blk.channel_weight);
+ if (distance < best_distance)
+ {
+ best_distance = distance;
+ best_partition = j;
+ }
+ }
+
+ partition_of_texel[i] = static_cast<uint8_t>(best_partition);
+ partition_texel_count[best_partition]++;
+ }
+
+ // It is possible to get a situation where a partition ends up without any texels. In this case,
+ // assign texel N to partition N. This is silly, but ensures that every partition retains at
+ // least one texel. Reassigning a texel in this manner may cause another partition to go empty,
+ // so if we actually did a reassignment, run the whole loop over again.
+ bool problem_case;
+ do
+ {
+ problem_case = false;
+ for (unsigned int i = 0; i < partition_count; i++)
+ {
+ if (partition_texel_count[i] == 0)
+ {
+ partition_texel_count[partition_of_texel[i]]--;
+ partition_texel_count[i]++;
+ partition_of_texel[i] = static_cast<uint8_t>(i);
+ problem_case = true;
+ }
+ }
+ } while (problem_case);
+}
+
+/**
+ * @brief Compute new cluster centers based on their center of gravity.
+ *
+ * @param blk The image block color data to compress.
+ * @param texel_count The number of texels in the block.
+ * @param partition_count The number of partitions in the block.
+ * @param[out] cluster_centers The new cluster center colors.
+ * @param partition_of_texel The partition assigned for each texel.
+ */
+static void kmeans_update(
+ const image_block& blk,
+ unsigned int texel_count,
+ unsigned int partition_count,
+ vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS],
+ const uint8_t partition_of_texel[BLOCK_MAX_TEXELS]
+) {
+ promise(texel_count > 0);
+ promise(partition_count > 0);
+
+ vfloat4 color_sum[BLOCK_MAX_PARTITIONS] {
+ vfloat4::zero(),
+ vfloat4::zero(),
+ vfloat4::zero(),
+ vfloat4::zero()
+ };
+
+ uint8_t partition_texel_count[BLOCK_MAX_PARTITIONS] { 0 };
+
+ // Find the center-of-gravity in each cluster
+ for (unsigned int i = 0; i < texel_count; i++)
+ {
+ uint8_t partition = partition_of_texel[i];
+ color_sum[partition] += blk.texel(i);
+ partition_texel_count[partition]++;
+ }
+
+ // Set the center of gravity to be the new cluster center
+ for (unsigned int i = 0; i < partition_count; i++)
+ {
+ float scale = 1.0f / static_cast<float>(partition_texel_count[i]);
+ cluster_centers[i] = color_sum[i] * scale;
+ }
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 2-partition mode.
+ *
+ * @param a The texel assignment bitvector for the block.
+ * @param b The texel assignment bitvector for the partition table.
+ *
+ * @return The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch2(
+ const uint64_t a[2],
+ const uint64_t b[2]
+) {
+ int v1 = popcount(a[0] ^ b[0]) + popcount(a[1] ^ b[1]);
+ int v2 = popcount(a[0] ^ b[1]) + popcount(a[1] ^ b[0]);
+ return astc::min(v1, v2);
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 3-partition mode.
+ *
+ * @param a The texel assignment bitvector for the block.
+ * @param b The texel assignment bitvector for the partition table.
+ *
+ * @return The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch3(
+ const uint64_t a[3],
+ const uint64_t b[3]
+) {
+ int p00 = popcount(a[0] ^ b[0]);
+ int p01 = popcount(a[0] ^ b[1]);
+ int p02 = popcount(a[0] ^ b[2]);
+
+ int p10 = popcount(a[1] ^ b[0]);
+ int p11 = popcount(a[1] ^ b[1]);
+ int p12 = popcount(a[1] ^ b[2]);
+
+ int p20 = popcount(a[2] ^ b[0]);
+ int p21 = popcount(a[2] ^ b[1]);
+ int p22 = popcount(a[2] ^ b[2]);
+
+ int s0 = p11 + p22;
+ int s1 = p12 + p21;
+ int v0 = astc::min(s0, s1) + p00;
+
+ int s2 = p10 + p22;
+ int s3 = p12 + p20;
+ int v1 = astc::min(s2, s3) + p01;
+
+ int s4 = p10 + p21;
+ int s5 = p11 + p20;
+ int v2 = astc::min(s4, s5) + p02;
+
+ return astc::min(v0, v1, v2);
+}
+
+/**
+ * @brief Compute bit-mismatch for partitioning in 4-partition mode.
+ *
+ * @param a The texel assignment bitvector for the block.
+ * @param b The texel assignment bitvector for the partition table.
+ *
+ * @return The number of bit mismatches.
+ */
+static inline unsigned int partition_mismatch4(
+ const uint64_t a[4],
+ const uint64_t b[4]
+) {
+ int p00 = popcount(a[0] ^ b[0]);
+ int p01 = popcount(a[0] ^ b[1]);
+ int p02 = popcount(a[0] ^ b[2]);
+ int p03 = popcount(a[0] ^ b[3]);
+
+ int p10 = popcount(a[1] ^ b[0]);
+ int p11 = popcount(a[1] ^ b[1]);
+ int p12 = popcount(a[1] ^ b[2]);
+ int p13 = popcount(a[1] ^ b[3]);
+
+ int p20 = popcount(a[2] ^ b[0]);
+ int p21 = popcount(a[2] ^ b[1]);
+ int p22 = popcount(a[2] ^ b[2]);
+ int p23 = popcount(a[2] ^ b[3]);
+
+ int p30 = popcount(a[3] ^ b[0]);
+ int p31 = popcount(a[3] ^ b[1]);
+ int p32 = popcount(a[3] ^ b[2]);
+ int p33 = popcount(a[3] ^ b[3]);
+
+ int mx23 = astc::min(p22 + p33, p23 + p32);
+ int mx13 = astc::min(p21 + p33, p23 + p31);
+ int mx12 = astc::min(p21 + p32, p22 + p31);
+ int mx03 = astc::min(p20 + p33, p23 + p30);
+ int mx02 = astc::min(p20 + p32, p22 + p30);
+ int mx01 = astc::min(p21 + p30, p20 + p31);
+
+ int v0 = p00 + astc::min(p11 + mx23, p12 + mx13, p13 + mx12);
+ int v1 = p01 + astc::min(p10 + mx23, p12 + mx03, p13 + mx02);
+ int v2 = p02 + astc::min(p11 + mx03, p10 + mx13, p13 + mx01);
+ int v3 = p03 + astc::min(p11 + mx02, p12 + mx01, p10 + mx12);
+
+ return astc::min(v0, v1, v2, v3);
+}
+
+using mismatch_dispatch = unsigned int (*)(const uint64_t*, const uint64_t*);
+
+/**
+ * @brief Count the partition table mismatches vs the data clustering.
+ *
+ * @param bsd The block size information.
+ * @param partition_count The number of partitions in the block.
+ * @param bitmaps The block texel partition assignment patterns.
+ * @param[out] mismatch_counts The array storing per partitioning mismatch counts.
+ */
+static void count_partition_mismatch_bits(
+ const block_size_descriptor& bsd,
+ unsigned int partition_count,
+ const uint64_t bitmaps[BLOCK_MAX_PARTITIONS],
+ unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS]
+) {
+ unsigned int active_count = bsd.partitioning_count_selected[partition_count - 1];
+ promise(active_count > 0);
+
+ if (partition_count == 2)
+ {
+ for (unsigned int i = 0; i < active_count; i++)
+ {
+ mismatch_counts[i] = partition_mismatch2(bitmaps, bsd.coverage_bitmaps_2[i]);
+ }
+ }
+ else if (partition_count == 3)
+ {
+ for (unsigned int i = 0; i < active_count; i++)
+ {
+ mismatch_counts[i] = partition_mismatch3(bitmaps, bsd.coverage_bitmaps_3[i]);
+ }
+ }
+ else
+ {
+ for (unsigned int i = 0; i < active_count; i++)
+ {
+ mismatch_counts[i] = partition_mismatch4(bitmaps, bsd.coverage_bitmaps_4[i]);
+ }
+ }
+}
+
+/**
+ * @brief Use counting sort on the mismatch array to sort partition candidates.
+ *
+ * @param partitioning_count The number of packed partitionings.
+ * @param mismatch_count Partitioning mismatch counts, in index order.
+ * @param[out] partition_ordering Partition index values, in mismatch order.
+ *
+ * @return The number of active partitions in this selection.
+ */
+static unsigned int get_partition_ordering_by_mismatch_bits(
+ unsigned int partitioning_count,
+ const unsigned int mismatch_count[BLOCK_MAX_PARTITIONINGS],
+ unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+ promise(partitioning_count > 0);
+ unsigned int mscount[256] { 0 };
+
+ // Create the histogram of mismatch counts
+ for (unsigned int i = 0; i < partitioning_count; i++)
+ {
+ mscount[mismatch_count[i]]++;
+ }
+
+ unsigned int active_count = partitioning_count - mscount[255];
+
+ // Create a running sum from the histogram array
+ // Cells store previous values only; i.e. exclude self after sum
+ unsigned int summa = 0;
+ for (unsigned int i = 0; i < 256; i++)
+ {
+ unsigned int cnt = mscount[i];
+ mscount[i] = summa;
+ summa += cnt;
+ }
+
+ // Use the running sum as the index, incrementing after read to allow
+ // sequential entries with the same count
+ for (unsigned int i = 0; i < partitioning_count; i++)
+ {
+ unsigned int idx = mscount[mismatch_count[i]]++;
+ partition_ordering[idx] = i;
+ }
+
+ return active_count;
+}
+
+/**
+ * @brief Use k-means clustering to compute a partition ordering for a block..
+ *
+ * @param bsd The block size information.
+ * @param blk The image block color data to compress.
+ * @param partition_count The desired number of partitions in the block.
+ * @param[out] partition_ordering The list of recommended partition indices, in priority order.
+ *
+ * @return The number of active partitionings in this selection.
+ */
+static unsigned int compute_kmeans_partition_ordering(
+ const block_size_descriptor& bsd,
+ const image_block& blk,
+ unsigned int partition_count,
+ unsigned int partition_ordering[BLOCK_MAX_PARTITIONINGS]
+) {
+ vfloat4 cluster_centers[BLOCK_MAX_PARTITIONS];
+ uint8_t texel_partitions[BLOCK_MAX_TEXELS];
+
+ // Use three passes of k-means clustering to partition the block data
+ for (unsigned int i = 0; i < 3; i++)
+ {
+ if (i == 0)
+ {
+ kmeans_init(blk, bsd.texel_count, partition_count, cluster_centers);
+ }
+ else
+ {
+ kmeans_update(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
+ }
+
+ kmeans_assign(blk, bsd.texel_count, partition_count, cluster_centers, texel_partitions);
+ }
+
+ // Construct the block bitmaps of texel assignments to each partition
+ uint64_t bitmaps[BLOCK_MAX_PARTITIONS] { 0 };
+ unsigned int texels_to_process = astc::min(bsd.texel_count, BLOCK_MAX_KMEANS_TEXELS);
+ promise(texels_to_process > 0);
+ for (unsigned int i = 0; i < texels_to_process; i++)
+ {
+ unsigned int idx = bsd.kmeans_texels[i];
+ bitmaps[texel_partitions[idx]] |= 1ULL << i;
+ }
+
+ // Count the mismatch between the block and the format's partition tables
+ unsigned int mismatch_counts[BLOCK_MAX_PARTITIONINGS];
+ count_partition_mismatch_bits(bsd, partition_count, bitmaps, mismatch_counts);
+
+ // Sort the partitions based on the number of mismatched bits
+ return get_partition_ordering_by_mismatch_bits(
+ bsd.partitioning_count_selected[partition_count - 1],
+ mismatch_counts, partition_ordering);
+}
+
+/**
+ * @brief Insert a partitioning into an order list of results, sorted by error.
+ *
+ * @param max_values The max number of entries in the best result arrays.
+ * @param this_error The error of the new entry.
+ * @param this_partition The partition ID of the new entry.
+ * @param[out] best_errors The array of best error values.
+ * @param[out] best_partitions The array of best partition values.
+ */
+static void insert_result(
+ unsigned int max_values,
+ float this_error,
+ unsigned int this_partition,
+ float* best_errors,
+ unsigned int* best_partitions)
+{
+ promise(max_values > 0);
+
+ // Don't bother searching if the current worst error beats the new error
+ if (this_error >= best_errors[max_values - 1])
+ {
+ return;
+ }
+
+ // Else insert into the list in error-order
+ for (unsigned int i = 0; i < max_values; i++)
+ {
+ // Existing result is better - move on ...
+ if (this_error > best_errors[i])
+ {
+ continue;
+ }
+
+ // Move existing results down one
+ for (unsigned int j = max_values - 1; j > i; j--)
+ {
+ best_errors[j] = best_errors[j - 1];
+ best_partitions[j] = best_partitions[j - 1];
+ }
+
+ // Insert new result
+ best_errors[i] = this_error;
+ best_partitions[i] = this_partition;
+ break;
+ }
+}
+
+/* See header for documentation. */
+unsigned int find_best_partition_candidates(
+ const block_size_descriptor& bsd,
+ const image_block& blk,
+ unsigned int partition_count,
+ unsigned int partition_search_limit,
+ unsigned int best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES],
+ unsigned int requested_candidates
+) {
+ // Constant used to estimate quantization error for a given partitioning; the optimal value for
+ // this depends on bitrate. These values have been determined empirically.
+ unsigned int texels_per_block = bsd.texel_count;
+ float weight_imprecision_estim = 0.055f;
+ if (texels_per_block <= 20)
+ {
+ weight_imprecision_estim = 0.03f;
+ }
+ else if (texels_per_block <= 31)
+ {
+ weight_imprecision_estim = 0.04f;
+ }
+ else if (texels_per_block <= 41)
+ {
+ weight_imprecision_estim = 0.05f;
+ }
+
+ promise(partition_count > 0);
+ promise(partition_search_limit > 0);
+
+ weight_imprecision_estim = weight_imprecision_estim * weight_imprecision_estim;
+
+ unsigned int partition_sequence[BLOCK_MAX_PARTITIONINGS];
+ unsigned int sequence_len = compute_kmeans_partition_ordering(bsd, blk, partition_count, partition_sequence);
+ partition_search_limit = astc::min(partition_search_limit, sequence_len);
+ requested_candidates = astc::min(partition_search_limit, requested_candidates);
+
+ bool uses_alpha = !blk.is_constant_channel(3);
+
+ // Partitioning errors assuming uncorrelated-chrominance endpoints
+ float uncor_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
+ unsigned int uncor_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+ // Partitioning errors assuming same-chrominance endpoints
+ float samec_best_errors[TUNE_MAX_PARTITIONING_CANDIDATES];
+ unsigned int samec_best_partitions[TUNE_MAX_PARTITIONING_CANDIDATES];
+
+ for (unsigned int i = 0; i < requested_candidates; i++)
+ {
+ uncor_best_errors[i] = ERROR_CALC_DEFAULT;
+ samec_best_errors[i] = ERROR_CALC_DEFAULT;
+ }
+
+ if (uses_alpha)
+ {
+ for (unsigned int i = 0; i < partition_search_limit; i++)
+ {
+ unsigned int partition = partition_sequence[i];
+ const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
+
+ // Compute weighting to give to each component in each partition
+ partition_metrics pms[BLOCK_MAX_PARTITIONS];
+
+ compute_avgs_and_dirs_4_comp(pi, blk, pms);
+
+ line4 uncor_lines[BLOCK_MAX_PARTITIONS];
+ line4 samec_lines[BLOCK_MAX_PARTITIONS];
+
+ processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS];
+ processed_line4 samec_plines[BLOCK_MAX_PARTITIONS];
+
+ float uncor_line_lens[BLOCK_MAX_PARTITIONS];
+ float samec_line_lens[BLOCK_MAX_PARTITIONS];
+
+ for (unsigned int j = 0; j < partition_count; j++)
+ {
+ partition_metrics& pm = pms[j];
+
+ uncor_lines[j].a = pm.avg;
+ uncor_lines[j].b = normalize_safe(pm.dir, unit4());
+
+ uncor_plines[j].amod = uncor_lines[j].a - uncor_lines[j].b * dot(uncor_lines[j].a, uncor_lines[j].b);
+ uncor_plines[j].bs = uncor_lines[j].b;
+
+ samec_lines[j].a = vfloat4::zero();
+ samec_lines[j].b = normalize_safe(pm.avg, unit4());
+
+ samec_plines[j].amod = vfloat4::zero();
+ samec_plines[j].bs = samec_lines[j].b;
+ }
+
+ float uncor_error = 0.0f;
+ float samec_error = 0.0f;
+
+ compute_error_squared_rgba(pi,
+ blk,
+ uncor_plines,
+ samec_plines,
+ uncor_line_lens,
+ samec_line_lens,
+ uncor_error,
+ samec_error);
+
+ // Compute an estimate of error introduced by weight quantization imprecision.
+ // This error is computed as follows, for each partition
+ // 1: compute the principal-axis vector (full length) in error-space
+ // 2: convert the principal-axis vector to regular RGB-space
+ // 3: scale the vector by a constant that estimates average quantization error
+ // 4: for each texel, square the vector, then do a dot-product with the texel's
+ // error weight; sum up the results across all texels.
+ // 4(optimized): square the vector once, then do a dot-product with the average
+ // texel error, then multiply by the number of texels.
+
+ for (unsigned int j = 0; j < partition_count; j++)
+ {
+ float tpp = static_cast<float>(pi.partition_texel_count[j]);
+ vfloat4 error_weights(tpp * weight_imprecision_estim);
+
+ vfloat4 uncor_vector = uncor_lines[j].b * uncor_line_lens[j];
+ vfloat4 samec_vector = samec_lines[j].b * samec_line_lens[j];
+
+ uncor_error += dot_s(uncor_vector * uncor_vector, error_weights);
+ samec_error += dot_s(samec_vector * samec_vector, error_weights);
+ }
+
+ insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+ insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+ }
+ }
+ else
+ {
+ for (unsigned int i = 0; i < partition_search_limit; i++)
+ {
+ unsigned int partition = partition_sequence[i];
+ const auto& pi = bsd.get_raw_partition_info(partition_count, partition);
+
+ // Compute weighting to give to each component in each partition
+ partition_metrics pms[BLOCK_MAX_PARTITIONS];
+ compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
+
+ partition_lines3 plines[BLOCK_MAX_PARTITIONS];
+
+ for (unsigned int j = 0; j < partition_count; j++)
+ {
+ partition_metrics& pm = pms[j];
+ partition_lines3& pl = plines[j];
+
+ pl.uncor_line.a = pm.avg;
+ pl.uncor_line.b = normalize_safe(pm.dir, unit3());
+
+ pl.samec_line.a = vfloat4::zero();
+ pl.samec_line.b = normalize_safe(pm.avg, unit3());
+
+ pl.uncor_pline.amod = pl.uncor_line.a - pl.uncor_line.b * dot3(pl.uncor_line.a, pl.uncor_line.b);
+ pl.uncor_pline.bs = pl.uncor_line.b;
+
+ pl.samec_pline.amod = vfloat4::zero();
+ pl.samec_pline.bs = pl.samec_line.b;
+ }
+
+ float uncor_error = 0.0f;
+ float samec_error = 0.0f;
+
+ compute_error_squared_rgb(pi,
+ blk,
+ plines,
+ uncor_error,
+ samec_error);
+
+ // Compute an estimate of error introduced by weight quantization imprecision.
+ // This error is computed as follows, for each partition
+ // 1: compute the principal-axis vector (full length) in error-space
+ // 2: convert the principal-axis vector to regular RGB-space
+ // 3: scale the vector by a constant that estimates average quantization error
+ // 4: for each texel, square the vector, then do a dot-product with the texel's
+ // error weight; sum up the results across all texels.
+ // 4(optimized): square the vector once, then do a dot-product with the average
+ // texel error, then multiply by the number of texels.
+
+ for (unsigned int j = 0; j < partition_count; j++)
+ {
+ partition_lines3& pl = plines[j];
+
+ float tpp = static_cast<float>(pi.partition_texel_count[j]);
+ vfloat4 error_weights(tpp * weight_imprecision_estim);
+
+ vfloat4 uncor_vector = pl.uncor_line.b * pl.uncor_line_len;
+ vfloat4 samec_vector = pl.samec_line.b * pl.samec_line_len;
+
+ uncor_error += dot3_s(uncor_vector * uncor_vector, error_weights);
+ samec_error += dot3_s(samec_vector * samec_vector, error_weights);
+ }
+
+ insert_result(requested_candidates, uncor_error, partition, uncor_best_errors, uncor_best_partitions);
+ insert_result(requested_candidates, samec_error, partition, samec_best_errors, samec_best_partitions);
+ }
+ }
+
+ bool best_is_uncor = uncor_best_partitions[0] > samec_best_partitions[0];
+
+ unsigned int interleave[2 * TUNE_MAX_PARTITIONING_CANDIDATES];
+ for (unsigned int i = 0; i < requested_candidates; i++)
+ {
+ if (best_is_uncor)
+ {
+ interleave[2 * i] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+ interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+ }
+ else
+ {
+ interleave[2 * i] = bsd.get_raw_partition_info(partition_count, samec_best_partitions[i]).partition_index;
+ interleave[2 * i + 1] = bsd.get_raw_partition_info(partition_count, uncor_best_partitions[i]).partition_index;
+ }
+ }
+
+ uint64_t bitmasks[1024/64] { 0 };
+ unsigned int emitted = 0;
+
+ // Deduplicate the first "requested" entries
+ for (unsigned int i = 0; i < requested_candidates * 2; i++)
+ {
+ unsigned int partition = interleave[i];
+
+ unsigned int word = partition / 64;
+ unsigned int bit = partition % 64;
+
+ bool written = bitmasks[word] & (1ull << bit);
+
+ if (!written)
+ {
+ best_partitions[emitted] = partition;
+ bitmasks[word] |= 1ull << bit;
+ emitted++;
+
+ if (emitted == requested_candidates)
+ {
+ break;
+ }
+ }
+ }
+
+ return emitted;
+}
+
+#endif