Diffstat (limited to 'thirdparty/basis_universal/encoder/basisu_frontend.cpp')
-rw-r--r--  thirdparty/basis_universal/encoder/basisu_frontend.cpp  1789
1 file changed, 1143 insertions, 646 deletions
diff --git a/thirdparty/basis_universal/encoder/basisu_frontend.cpp b/thirdparty/basis_universal/encoder/basisu_frontend.cpp
index 324fc8e447..00210e6679 100644
--- a/thirdparty/basis_universal/encoder/basisu_frontend.cpp
+++ b/thirdparty/basis_universal/encoder/basisu_frontend.cpp
@@ -19,6 +19,7 @@
//
#include "../transcoder/basisu.h"
#include "basisu_frontend.h"
+#include "basisu_opencl.h"
#include <unordered_set>
#include <unordered_map>
@@ -43,61 +44,15 @@ namespace basisu
// TODO - How to handle internal verifies in the basisu lib
static inline void handle_verify_failure(int line)
{
- fprintf(stderr, "ERROR: basisu_frontend: verify check failed at line %i!\n", line);
+ error_printf("basisu_frontend: verify check failed at line %i!\n", line);
abort();
}
bool basisu_frontend::init(const params &p)
{
-#if 0
- // HACK HACK
- FILE* pFile;
- fopen_s(&pFile, "tv.bin", "rb");
- if (pFile)
- {
- debug_printf("Using tv.bin\n");
-
- fseek(pFile, 0, SEEK_END);
- uint32_t size = ftell(pFile);
- fseek(pFile, 0, SEEK_SET);
-
- uint32_t tv = size / sizeof(vec6F_quantizer::training_vec_with_weight);
-
- basisu::vector<vec6F_quantizer::training_vec_with_weight> v(tv);
- fread(&v[0], 1, sizeof(v[0]) * tv, pFile);
-
- for (uint32_t i = 0; i < tv; i++)
- m_endpoint_clusterizer.add_training_vec(v[i].first, v[i].second);
-
- m_endpoint_clusterizer.generate(16128);
- basisu::vector<uint_vec> codebook;
- m_endpoint_clusterizer.retrieve(codebook);
-
- printf("Generated %u entries\n", (uint32_t)codebook.size());
-
- fclose(pFile);
- exit(0);
- }
-#endif
-
- if (p.m_use_hybrid_selector_codebooks)
- {
- if (!p.m_pGlobal_sel_codebook)
- {
- debug_printf("basisu_frontend::init: No global sel codebook!\n");
- assert(0);
- return false;
- }
- }
-
- debug_printf("basisu_frontend::init: Multithreaded: %u, NumEndpointClusters: %u, NumSelectorClusters: %u, Perceptual: %u, CompressionLevel: %u\n",
- p.m_multithreaded, p.m_max_endpoint_clusters, p.m_max_selector_clusters, p.m_perceptual, p.m_compression_level);
-
- debug_printf("Global sel codebook pal bits: %u, Global sel codebook mod bits: %u, Use hybrid selector codebook: %u, Hybrid codebook quality thresh: %f\n",
- p.m_num_global_sel_codebook_pal_bits,
- p.m_num_global_sel_codebook_mod_bits,
- p.m_use_hybrid_selector_codebooks,
- p.m_hybrid_codebook_quality_thresh);
+ debug_printf("basisu_frontend::init: Multithreaded: %u, Job pool total threads: %u, NumEndpointClusters: %u, NumSelectorClusters: %u, Perceptual: %u, CompressionLevel: %u\n",
+ p.m_multithreaded, p.m_pJob_pool ? p.m_pJob_pool->get_total_threads() : 0,
+ p.m_max_endpoint_clusters, p.m_max_selector_clusters, p.m_perceptual, p.m_compression_level);
if ((p.m_max_endpoint_clusters < 1) || (p.m_max_endpoint_clusters > cMaxEndpointClusters))
return false;
@@ -106,8 +61,22 @@ namespace basisu
m_source_blocks.resize(0);
append_vector(m_source_blocks, p.m_pSource_blocks, p.m_num_source_blocks);
-
+
m_params = p;
+
+ if (m_params.m_pOpenCL_context)
+ {
+ BASISU_ASSUME(sizeof(cl_pixel_block) == sizeof(pixel_block));
+
+ // Upload the RGBA pixel blocks a single time.
+ if (!opencl_set_pixel_blocks(m_params.m_pOpenCL_context, m_source_blocks.size(), (cl_pixel_block*)m_source_blocks.data()))
+ {
+ // This is not fatal, we just won't use OpenCL.
+ error_printf("basisu_frontend::init: opencl_set_pixel_blocks() failed\n");
+ m_params.m_pOpenCL_context = nullptr;
+ m_opencl_failed = true;
+ }
+ }
m_encoded_blocks.resize(m_params.m_num_source_blocks);
memset(&m_encoded_blocks[0], 0, m_encoded_blocks.size() * sizeof(m_encoded_blocks[0]));
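
The hunk above establishes the fallback pattern used throughout this change: try the OpenCL path once, and on any failure clear m_pOpenCL_context, set m_opencl_failed, and continue on the CPU path. Below is a minimal sketch of that shape, with hypothetical gpu_upload()/gpu_encode() helpers standing in for the basisu_opencl.h entry points (they are assumptions, not the library's API):

// Sketch of the "try GPU once, fall back to CPU" pattern used by this change.
// gpu_upload()/gpu_encode() are hypothetical stand-ins, not the real basisu_opencl.h API.
#include <cstdio>
#include <vector>

struct encoder_state
{
    void* m_pOpenCL_context = nullptr; // null means "CPU only"
    bool  m_opencl_failed = false;
};

static bool gpu_upload(void* ctx, const std::vector<int>& blocks) { (void)ctx; return !blocks.empty(); }
static bool gpu_encode(void* ctx) { (void)ctx; return true; }
static void cpu_encode() { std::printf("CPU path\n"); }

static void encode(encoder_state& st, const std::vector<int>& blocks)
{
    bool use_cpu = true;

    if (st.m_pOpenCL_context)
    {
        // Upload the source blocks a single time; later stages reuse them.
        if (gpu_upload(st.m_pOpenCL_context, blocks) && gpu_encode(st.m_pOpenCL_context))
        {
            use_cpu = false;
        }
        else
        {
            // Not fatal: disable OpenCL for the rest of the run and remember why.
            std::fprintf(stderr, "OpenCL path failed, falling back to CPU\n");
            st.m_pOpenCL_context = nullptr;
            st.m_opencl_failed = true;
        }
    }

    if (use_cpu)
        cpu_encode();
}

int main()
{
    encoder_state st;
    std::vector<int> blocks(16);
    encode(st, blocks); // no context set, so this exercises the CPU path
    return 0;
}
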
@@ -194,8 +163,12 @@ namespace basisu
m_total_blocks = m_params.m_num_source_blocks;
m_total_pixels = m_total_blocks * cPixelBlockTotalPixels;
+ // Encode the initial high quality ETC1S texture
+
init_etc1_images();
+ // First quantize the ETC1S endpoints
+
if (m_params.m_pGlobal_codebooks)
{
init_global_codebooks();
@@ -205,16 +178,26 @@ namespace basisu
init_endpoint_training_vectors();
generate_endpoint_clusters();
-
+
for (uint32_t refine_endpoint_step = 0; refine_endpoint_step < m_num_endpoint_codebook_iterations; refine_endpoint_step++)
{
- BASISU_FRONTEND_VERIFY(check_etc1s_constraints());
+ if (m_params.m_validate)
+ {
+ BASISU_FRONTEND_VERIFY(check_etc1s_constraints());
+
+ BASISU_FRONTEND_VERIFY(validate_endpoint_cluster_hierarchy(false));
+ }
if (refine_endpoint_step)
{
introduce_new_endpoint_clusters();
}
+ if (m_params.m_validate)
+ {
+ BASISU_FRONTEND_VERIFY(validate_endpoint_cluster_hierarchy(false));
+ }
+
generate_endpoint_codebook(refine_endpoint_step);
if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization))
@@ -236,7 +219,7 @@ namespace basisu
if ((m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) && (!refine_endpoint_step) && (m_num_endpoint_codebook_iterations == 1))
{
eliminate_redundant_or_empty_endpoint_clusters();
- generate_endpoint_codebook(refine_endpoint_step);
+ generate_endpoint_codebook(basisu::maximum(1U, refine_endpoint_step));
}
if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization))
@@ -250,22 +233,37 @@ namespace basisu
dump_endpoint_clusterization_visualization(buf, true);
}
}
+
+ if (m_params.m_validate)
+ {
+ BASISU_FRONTEND_VERIFY(validate_endpoint_cluster_hierarchy(false));
+ }
eliminate_redundant_or_empty_endpoint_clusters();
+ if (m_params.m_validate)
+ {
+ BASISU_FRONTEND_VERIFY(validate_endpoint_cluster_hierarchy(false));
+ }
+
if (m_params.m_debug_stats)
debug_printf("Total endpoint clusters: %u\n", (uint32_t)m_endpoint_clusters.size());
if (early_out)
break;
}
-
- BASISU_FRONTEND_VERIFY(check_etc1s_constraints());
+
+ if (m_params.m_validate)
+ {
+ BASISU_FRONTEND_VERIFY(check_etc1s_constraints());
+ }
generate_block_endpoint_clusters();
create_initial_packed_texture();
+ // Now quantize the ETC1S selectors
+
generate_selector_clusters();
if (m_use_hierarchical_selector_codebooks)
@@ -276,12 +274,12 @@ namespace basisu
create_optimized_selector_codebook(0);
find_optimal_selector_clusters_for_each_block();
-
+
introduce_special_selector_clusters();
}
else
{
- const uint32_t num_refine_selector_steps = m_params.m_pGlobal_sel_codebook ? 1 : m_num_selector_codebook_iterations;
+ const uint32_t num_refine_selector_steps = m_num_selector_codebook_iterations;
for (uint32_t refine_selector_steps = 0; refine_selector_steps < num_refine_selector_steps; refine_selector_steps++)
{
create_optimized_selector_codebook(refine_selector_steps);
@@ -289,7 +287,7 @@ namespace basisu
find_optimal_selector_clusters_for_each_block();
introduce_special_selector_clusters();
-
+
if ((m_params.m_compression_level >= 4) || (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))
{
if (!refine_block_endpoints_given_selectors())
@@ -297,7 +295,7 @@ namespace basisu
}
}
}
-
+
optimize_selector_codebook();
if (m_params.m_debug_stats)
@@ -574,9 +572,6 @@ namespace basisu
{
debug_printf("introduce_special_selector_clusters\n");
- if (m_params.m_pGlobal_sel_codebook)
- return;
-
uint32_t total_blocks_relocated = 0;
const uint32_t initial_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size();
@@ -722,23 +717,15 @@ namespace basisu
}
basisu::vector<etc_block> new_optimized_cluster_selectors(m_optimized_cluster_selectors.size() ? total_new_entries : 0);
- basist::etc1_global_selector_codebook_entry_id_vec new_optimized_cluster_selector_global_cb_ids(m_optimized_cluster_selector_global_cb_ids.size() ? total_new_entries : 0);
basisu::vector<uint_vec> new_selector_cluster_indices(m_selector_cluster_block_indices.size() ? total_new_entries : 0);
- bool_vec new_selector_cluster_uses_global_cb(m_selector_cluster_uses_global_cb.size() ? total_new_entries : 0);
for (uint32_t i = 0; i < total_new_entries; i++)
{
if (m_optimized_cluster_selectors.size())
new_optimized_cluster_selectors[i] = m_optimized_cluster_selectors[new_to_old[i]];
- if (m_optimized_cluster_selector_global_cb_ids.size())
- new_optimized_cluster_selector_global_cb_ids[i] = m_optimized_cluster_selector_global_cb_ids[new_to_old[i]];
-
//if (m_selector_cluster_block_indices.size())
// new_selector_cluster_indices[i] = m_selector_cluster_block_indices[new_to_old[i]];
-
- if (m_selector_cluster_uses_global_cb.size())
- new_selector_cluster_uses_global_cb[i] = m_selector_cluster_uses_global_cb[new_to_old[i]];
}
for (uint32_t i = 0; i < m_block_selector_cluster_index.size(); i++)
@@ -747,9 +734,7 @@ namespace basisu
}
m_optimized_cluster_selectors.swap(new_optimized_cluster_selectors);
- m_optimized_cluster_selector_global_cb_ids.swap(new_optimized_cluster_selector_global_cb_ids);
m_selector_cluster_block_indices.swap(new_selector_cluster_indices);
- m_selector_cluster_uses_global_cb.swap(new_selector_cluster_uses_global_cb);
// This isn't strictly necessary - doing it for completeness/future sanity.
if (m_selector_clusters_within_each_parent_cluster.size())
@@ -771,66 +756,93 @@ namespace basisu
m_etc1_blocks_etc1s.resize(m_total_blocks);
- const uint32_t N = 4096;
- for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+ bool use_cpu = true;
+
+ if (m_params.m_pOpenCL_context)
+ {
+ uint32_t total_perms = 64;
+ if (m_params.m_compression_level == 0)
+ total_perms = 4;
+ else if (m_params.m_compression_level == 1)
+ total_perms = 16;
+ else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+ total_perms = OPENCL_ENCODE_ETC1S_MAX_PERMS;
+
+ bool status = opencl_encode_etc1s_blocks(m_params.m_pOpenCL_context, m_etc1_blocks_etc1s.data(), m_params.m_perceptual, total_perms);
+ if (status)
+ use_cpu = false;
+ else
+ {
+ error_printf("basisu_frontend::init_etc1_images: opencl_encode_etc1s_blocks() failed! Using CPU.\n");
+ m_params.m_pOpenCL_context = nullptr;
+ m_opencl_failed = true;
+ }
+ }
+
+ if (use_cpu)
{
- const uint32_t first_index = block_index_iter;
- const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
+ const uint32_t N = 4096;
+ for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+ {
+ const uint32_t first_index = block_index_iter;
+ const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->add_job( [this, first_index, last_index] {
+ m_params.m_pJob_pool->add_job([this, first_index, last_index] {
#endif
- for (uint32_t block_index = first_index; block_index < last_index; block_index++)
- {
- const pixel_block &source_blk = get_source_pixel_block(block_index);
+ for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+ {
+ const pixel_block& source_blk = get_source_pixel_block(block_index);
- etc1_optimizer optimizer;
- etc1_optimizer::params optimizer_params;
- etc1_optimizer::results optimizer_results;
-
- if (m_params.m_compression_level == 0)
- optimizer_params.m_quality = cETCQualityFast;
- else if (m_params.m_compression_level == 1)
- optimizer_params.m_quality = cETCQualityMedium;
- else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
- optimizer_params.m_quality = cETCQualityUber;
-
- optimizer_params.m_num_src_pixels = 16;
- optimizer_params.m_pSrc_pixels = source_blk.get_ptr();
- optimizer_params.m_perceptual = m_params.m_perceptual;
+ etc1_optimizer optimizer;
+ etc1_optimizer::params optimizer_params;
+ etc1_optimizer::results optimizer_results;
- uint8_t selectors[16];
- optimizer_results.m_pSelectors = selectors;
- optimizer_results.m_n = 16;
+ if (m_params.m_compression_level == 0)
+ optimizer_params.m_quality = cETCQualityFast;
+ else if (m_params.m_compression_level == 1)
+ optimizer_params.m_quality = cETCQualityMedium;
+ else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+ optimizer_params.m_quality = cETCQualityUber;
- optimizer.init(optimizer_params, optimizer_results);
- if (!optimizer.compute())
- BASISU_FRONTEND_VERIFY(false);
+ optimizer_params.m_num_src_pixels = 16;
+ optimizer_params.m_pSrc_pixels = source_blk.get_ptr();
+ optimizer_params.m_perceptual = m_params.m_perceptual;
- etc_block &blk = m_etc1_blocks_etc1s[block_index];
+ uint8_t selectors[16];
+ optimizer_results.m_pSelectors = selectors;
+ optimizer_results.m_n = 16;
- memset(&blk, 0, sizeof(blk));
- blk.set_block_color5_etc1s(optimizer_results.m_block_color_unscaled);
- blk.set_inten_tables_etc1s(optimizer_results.m_block_inten_table);
- blk.set_flip_bit(true);
+ optimizer.init(optimizer_params, optimizer_results);
+ if (!optimizer.compute())
+ BASISU_FRONTEND_VERIFY(false);
- for (uint32_t y = 0; y < 4; y++)
- for (uint32_t x = 0; x < 4; x++)
- blk.set_selector(x, y, selectors[x + y * 4]);
- }
+ etc_block& blk = m_etc1_blocks_etc1s[block_index];
+
+ memset(&blk, 0, sizeof(blk));
+ blk.set_block_color5_etc1s(optimizer_results.m_block_color_unscaled);
+ blk.set_inten_tables_etc1s(optimizer_results.m_block_inten_table);
+ blk.set_flip_bit(true);
+
+ for (uint32_t y = 0; y < 4; y++)
+ for (uint32_t x = 0; x < 4; x++)
+ blk.set_selector(x, y, selectors[x + y * 4]);
+ }
#ifndef __EMSCRIPTEN__
- } );
+ });
#endif
- }
+ }
#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->wait_for_all();
+ m_params.m_pJob_pool->wait_for_all();
#endif
- debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
+ } // use_cpu
+
+ debug_printf("init_etc1_images: Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
}
void basisu_frontend::init_endpoint_training_vectors()
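
The CPU fallback in init_etc1_images() above partitions the blocks into ranges of N = 4096 and submits one job per range to the job pool, waiting for all jobs before reading the results. A sketch of that chunked-submission shape, using std::async in place of basisu's job_pool (an assumption made only to keep the example self-contained):

// Chunked work submission: one job per range of N blocks, then wait for all.
#include <algorithm>
#include <cstdint>
#include <future>
#include <vector>

int main()
{
    const uint32_t total_blocks = 10000;
    std::vector<uint32_t> results(total_blocks);

    const uint32_t N = 4096; // same chunk size as init_etc1_images()
    std::vector<std::future<void>> jobs;

    for (uint32_t block_index_iter = 0; block_index_iter < total_blocks; block_index_iter += N)
    {
        const uint32_t first_index = block_index_iter;
        const uint32_t last_index = std::min(total_blocks, first_index + N);

        jobs.push_back(std::async(std::launch::async, [&results, first_index, last_index] {
            for (uint32_t block_index = first_index; block_index < last_index; block_index++)
                results[block_index] = block_index * 2u; // placeholder for the per-block ETC1S encode
        }));
    }

    for (auto& j : jobs)
        j.wait(); // equivalent of m_pJob_pool->wait_for_all()

    return (results[123] == 246u) ? 0 : 1;
}
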
@@ -889,13 +901,15 @@ namespace basisu
const uint32_t parent_codebook_size = (m_params.m_max_endpoint_clusters >= 256) ? BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE : 0;
uint32_t max_threads = 0;
max_threads = m_params.m_multithreaded ? minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;
+ if (m_params.m_pJob_pool)
+ max_threads = minimum<int>((int)m_params.m_pJob_pool->get_total_threads(), max_threads);
- debug_printf("Using %u threads to create codebook\n", max_threads);
+ debug_printf("max_threads: %u\n", max_threads);
bool status = generate_hierarchical_codebook_threaded(m_endpoint_clusterizer,
m_params.m_max_endpoint_clusters, m_use_hierarchical_endpoint_codebooks ? parent_codebook_size : 0,
m_endpoint_clusters,
m_endpoint_parent_clusters,
- max_threads, m_params.m_pJob_pool);
+ max_threads, m_params.m_pJob_pool, true);
BASISU_FRONTEND_VERIFY(status);
if (m_use_hierarchical_endpoint_codebooks)
@@ -940,6 +954,9 @@ namespace basisu
for (uint32_t j = 0; j < cluster.size(); j++)
{
const uint32_t block_index = cluster[j] >> 1;
+
+ BASISU_FRONTEND_VERIFY(block_index < m_block_parent_endpoint_cluster.size());
+
if (!j)
{
parent_cluster_index = m_block_parent_endpoint_cluster[block_index];
@@ -956,6 +973,7 @@ namespace basisu
debug_printf("Total endpoint clusters: %u, parent clusters: %u\n", (uint32_t)m_endpoint_clusters.size(), (uint32_t)m_endpoint_parent_clusters.size());
}
+ // Iterate through each array of endpoint cluster block indices and set the m_block_endpoint_clusters_indices[][] array to indicate which cluster index each block uses.
void basisu_frontend::generate_block_endpoint_clusters()
{
m_block_endpoint_clusters_indices.resize(m_total_blocks);
@@ -974,11 +992,14 @@ namespace basisu
} // cluster_indices_iter
}
- for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+ if (m_params.m_validate)
{
- uint32_t cluster_0 = m_block_endpoint_clusters_indices[block_index][0];
- uint32_t cluster_1 = m_block_endpoint_clusters_indices[block_index][1];
- BASISU_FRONTEND_VERIFY(cluster_0 == cluster_1);
+ for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+ {
+ uint32_t cluster_0 = m_block_endpoint_clusters_indices[block_index][0];
+ uint32_t cluster_1 = m_block_endpoint_clusters_indices[block_index][1];
+ BASISU_FRONTEND_VERIFY(cluster_0 == cluster_1);
+ }
}
}
@@ -989,6 +1010,7 @@ namespace basisu
m_endpoint_clusters_within_each_parent_cluster.resize(0);
m_endpoint_clusters_within_each_parent_cluster.resize(m_endpoint_parent_clusters.size());
+ // Note: It's possible that some blocks got moved into the same cluster, but live in different parent clusters.
for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
{
const uint32_t cluster_index = m_block_endpoint_clusters_indices[block_index][0];
@@ -1125,6 +1147,8 @@ namespace basisu
std::unordered_set<uint32_t> ignore_cluster;
+ uint32_t total_new_clusters = 0;
+
while (num_new_endpoint_clusters)
{
if (m_subblock_endpoint_quant_err_vec.size() == 0)
@@ -1173,10 +1197,14 @@ namespace basisu
cluster_sizes[subblock_to_move.m_cluster_index] -= 2;
ignore_cluster.insert(subblock_to_move.m_cluster_index);
+
+ total_new_clusters++;
num_new_endpoint_clusters--;
}
+ debug_printf("Introduced %i new endpoint clusters\n", total_new_clusters);
+
for (uint32_t i = 0; i < num_orig_endpoint_clusters; i++)
{
uint_vec &cluster_indices = m_endpoint_clusters[i];
@@ -1200,150 +1228,433 @@ namespace basisu
generate_block_endpoint_clusters();
}
+ struct color_rgba_hasher
+ {
+ inline std::size_t operator()(const color_rgba& k) const
+ {
+ uint32_t v = *(const uint32_t*)&k;
+
+ //return bitmix32(v);
+
+ //v ^= (v << 10);
+ //v ^= (v >> 12);
+
+ return v;
+ }
+ };
+
// Given each endpoint cluster, gather all the block pixels which are in that cluster and compute optimized ETC1S endpoints for them.
// TODO: Don't optimize endpoint clusters which haven't changed.
+ // If step>=1, we check to ensure the new endpoint values actually decrease quantization error.
void basisu_frontend::generate_endpoint_codebook(uint32_t step)
{
debug_printf("generate_endpoint_codebook\n");
+
+ interval_timer tm;
+ tm.start();
m_endpoint_cluster_etc_params.resize(m_endpoint_clusters.size());
- const uint32_t N = 128;
- for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)
+ bool use_cpu = true;
+ // TODO: Get this working when step>0
+ if (m_params.m_pOpenCL_context && !step)
{
- const uint32_t first_index = cluster_index_iter;
- const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);
+ const uint32_t total_clusters = m_endpoint_clusters.size();
-#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->add_job( [this, first_index, last_index, step ] {
-#endif
+ basisu::vector<cl_pixel_cluster> pixel_clusters(total_clusters);
+
+ std::vector<color_rgba> input_pixels;
+ input_pixels.reserve(m_total_blocks * 16);
- for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+ std::vector<uint32_t> pixel_weights;
+ pixel_weights.reserve(m_total_blocks * 16);
+
+ uint_vec cluster_sizes(total_clusters);
+
+ //typedef basisu::hash_map<color_rgba, uint32_t, color_rgba_hasher> color_hasher_type;
+ //color_hasher_type color_hasher;
+ //color_hasher.reserve(2048);
+
+ interval_timer hash_tm;
+ hash_tm.start();
+
+ basisu::vector<uint32_t> colors, colors2;
+ colors.reserve(65536);
+ colors2.reserve(65536);
+
+ for (uint32_t cluster_index = 0; cluster_index < m_endpoint_clusters.size(); cluster_index++)
+ {
+ const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+ assert((cluster_indices.size() & 1) == 0);
+
+#if 0
+ uint64_t first_pixel_index = input_pixels.size();
+ const uint32_t total_pixels = 16 * (cluster_indices.size() / 2);
+
+ input_pixels.resize(input_pixels.size() + total_pixels);
+ pixel_weights.resize(pixel_weights.size() + total_pixels);
+
+ uint64_t dst_ofs = first_pixel_index;
+
+ uint64_t total_r = 0, total_g = 0, total_b = 0;
+ for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
{
- const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+ const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
+ if (subblock_index)
+ continue;
- BASISU_FRONTEND_VERIFY(cluster_indices.size());
+ const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+ const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
- const uint32_t total_pixels = (uint32_t)cluster_indices.size() * 8;
+ for (uint32_t i = 0; i < 16; i++)
+ {
+ input_pixels[dst_ofs] = pBlock_pixels[i];
+ pixel_weights[dst_ofs] = 1;
+ dst_ofs++;
- basisu::vector<color_rgba> cluster_pixels(total_pixels);
+ total_r += pBlock_pixels[i].r;
+ total_g += pBlock_pixels[i].g;
+ total_b += pBlock_pixels[i].b;
+ }
+ }
- for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
+ //printf("%i %f %f %f\n", cluster_index, total_r / (float)total_pixels, total_g / (float)total_pixels, total_b / (float)total_pixels);
+
+ pixel_clusters[cluster_index].m_first_pixel_index = first_pixel_index;
+ pixel_clusters[cluster_index].m_total_pixels = total_pixels;
+ cluster_sizes[cluster_index] = total_pixels;
+#elif 1
+ colors.resize(cluster_indices.size() * 8);
+ colors2.resize(cluster_indices.size() * 8);
+ uint32_t dst_ofs = 0;
+
+ for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
+ {
+ const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
+ if (subblock_index)
+ continue;
+
+ const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+ const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+
+ memcpy(colors.data() + dst_ofs, pBlock_pixels, sizeof(color_rgba) * 16);
+ dst_ofs += 16;
+
+ } // cluster_indices_iter
+
+ uint32_t* pSorted = radix_sort(colors.size(), colors.data(), colors2.data(), 0, 3);
+
+ const uint64_t first_pixel_index = input_pixels.size();
+
+ uint32_t prev_color = 0, cur_weight = 0;
+
+ for (uint32_t i = 0; i < colors.size(); i++)
+ {
+ uint32_t cur_color = pSorted[i];
+ if (cur_color == prev_color)
{
- const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
- const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
+ if (++cur_weight == 0)
+ cur_weight--;
+ }
+ else
+ {
+ if (cur_weight)
+ {
+ input_pixels.push_back(*(const color_rgba*)&prev_color);
+ pixel_weights.push_back(cur_weight);
+ }
- const bool flipped = true;
+ prev_color = cur_color;
+ cur_weight = 1;
+ }
+ }
- const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+ if (cur_weight)
+ {
+ input_pixels.push_back(*(const color_rgba*)&prev_color);
+ pixel_weights.push_back(cur_weight);
+ }
- for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)
+ uint32_t total_unique_pixels = (uint32_t)(input_pixels.size() - first_pixel_index);
+
+ pixel_clusters[cluster_index].m_first_pixel_index = first_pixel_index;
+ pixel_clusters[cluster_index].m_total_pixels = total_unique_pixels;
+
+ cluster_sizes[cluster_index] = total_unique_pixels;
+#else
+ color_hasher.reset();
+
+ for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
+ {
+ const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
+ if (subblock_index)
+ continue;
+
+ const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+ const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+
+ uint32_t *pPrev_weight = nullptr;
+ color_rgba prev_color;
+
+ {
+ color_rgba cur_color = pBlock_pixels[0];
+ auto res = color_hasher.insert(cur_color, 0);
+
+ uint32_t& weight = (res.first)->second;
+ if (weight != UINT32_MAX)
+ weight++;
+
+ prev_color = cur_color;
+ pPrev_weight = &(res.first)->second;
+ }
+
+ for (uint32_t i = 1; i < 16; i++)
+ {
+ color_rgba cur_color = pBlock_pixels[i];
+
+ if (cur_color == prev_color)
+ {
+ if (*pPrev_weight != UINT32_MAX)
+ *pPrev_weight = *pPrev_weight + 1;
+ }
+ else
{
- const color_rgba &c = pBlock_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];
- cluster_pixels[cluster_indices_iter * 8 + pixel_index] = c;
+ auto res = color_hasher.insert(cur_color, 0);
+
+ uint32_t& weight = (res.first)->second;
+ if (weight != UINT32_MAX)
+ weight++;
+
+ prev_color = cur_color;
+ pPrev_weight = &(res.first)->second;
}
}
- endpoint_cluster_etc_params new_subblock_params;
+ } // cluster_indices_iter
+
+ const uint64_t first_pixel_index = input_pixels.size();
+ uint32_t total_unique_pixels = color_hasher.size();
+
+ pixel_clusters[cluster_index].m_first_pixel_index = first_pixel_index;
+ pixel_clusters[cluster_index].m_total_pixels = total_unique_pixels;
+
+ input_pixels.resize(first_pixel_index + total_unique_pixels);
+ pixel_weights.resize(first_pixel_index + total_unique_pixels);
- {
- etc1_optimizer optimizer;
- etc1_solution_coordinates solutions[2];
+ uint32_t j = 0;
+
+ for (auto it = color_hasher.begin(); it != color_hasher.end(); ++it, ++j)
+ {
+ input_pixels[first_pixel_index + j] = it->first;
+ pixel_weights[first_pixel_index + j] = it->second;
+ }
- etc1_optimizer::params cluster_optimizer_params;
- cluster_optimizer_params.m_num_src_pixels = total_pixels;
- cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];
+ cluster_sizes[cluster_index] = total_unique_pixels;
+#endif
- cluster_optimizer_params.m_use_color4 = false;
- cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
+ } // cluster_index
- if (m_params.m_compression_level <= 1)
- cluster_optimizer_params.m_quality = cETCQualityMedium;
- else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
- cluster_optimizer_params.m_quality = cETCQualityUber;
+ debug_printf("Total hash time: %3.3f secs\n", hash_tm.get_elapsed_secs());
- etc1_optimizer::results cluster_optimizer_results;
+ debug_printf("Total unique colors: %llu\n", input_pixels.size());
- basisu::vector<uint8_t> cluster_selectors(total_pixels);
- cluster_optimizer_results.m_n = total_pixels;
- cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
+ uint_vec sorted_cluster_indices_new_to_old(total_clusters);
+ indirect_sort(total_clusters, sorted_cluster_indices_new_to_old.data(), cluster_sizes.data());
+ //for (uint32_t i = 0; i < total_clusters; i++)
+ // sorted_cluster_indices_new_to_old[i] = i;
- optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
+ uint_vec sorted_cluster_indices_old_to_new(total_clusters);
+ for (uint32_t i = 0; i < total_clusters; i++)
+ sorted_cluster_indices_old_to_new[sorted_cluster_indices_new_to_old[i]] = i;
- if (!optimizer.compute())
- BASISU_FRONTEND_VERIFY(false);
+ basisu::vector<cl_pixel_cluster> sorted_pixel_clusters(total_clusters);
+ for (uint32_t i = 0; i < total_clusters; i++)
+ sorted_pixel_clusters[i] = pixel_clusters[sorted_cluster_indices_new_to_old[i]];
- new_subblock_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
- new_subblock_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
- new_subblock_params.m_color_error[0] = cluster_optimizer_results.m_error;
- }
+ uint32_t total_perms = 64;
+ if (m_params.m_compression_level <= 1)
+ total_perms = 16;
+ else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+ total_perms = OPENCL_ENCODE_ETC1S_MAX_PERMS;
- endpoint_cluster_etc_params &prev_etc_params = m_endpoint_cluster_etc_params[cluster_index];
+ basisu::vector<etc_block> output_blocks(total_clusters);
- bool use_new_subblock_params = false;
- if ((!step) || (!prev_etc_params.m_valid))
- use_new_subblock_params = true;
- else
+ if (opencl_encode_etc1s_pixel_clusters(
+ m_params.m_pOpenCL_context,
+ output_blocks.data(),
+ total_clusters,
+ sorted_pixel_clusters.data(),
+ input_pixels.size(),
+ input_pixels.data(),
+ pixel_weights.data(),
+ m_params.m_perceptual, total_perms))
+ {
+ for (uint32_t old_cluster_index = 0; old_cluster_index < m_endpoint_clusters.size(); old_cluster_index++)
+ {
+ const uint32_t new_cluster_index = sorted_cluster_indices_old_to_new[old_cluster_index];
+
+ const etc_block& blk = output_blocks[new_cluster_index];
+
+ endpoint_cluster_etc_params& prev_etc_params = m_endpoint_cluster_etc_params[old_cluster_index];
+
+ prev_etc_params.m_valid = true;
+ etc_block::unpack_color5(prev_etc_params.m_color_unscaled[0], blk.get_base5_color(), false);
+ prev_etc_params.m_inten_table[0] = blk.get_inten_table(0);
+ prev_etc_params.m_color_error[0] = 0; // dummy value - we don't actually use this
+ }
+
+ use_cpu = false;
+ }
+ else
+ {
+ error_printf("basisu_frontend::generate_endpoint_codebook: opencl_encode_etc1s_pixel_clusters() failed! Using CPU.\n");
+ m_params.m_pOpenCL_context = nullptr;
+ m_opencl_failed = true;
+ }
+
+ } // if (opencl_is_available() && m_params.m_use_opencl)
+
+ if (use_cpu)
+ {
+ const uint32_t N = 128;
+ for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)
+ {
+ const uint32_t first_index = cluster_index_iter;
+ const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);
+
+#ifndef __EMSCRIPTEN__
+ m_params.m_pJob_pool->add_job([this, first_index, last_index, step] {
+#endif
+
+ for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
{
- assert(prev_etc_params.m_valid);
+ const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
- uint64_t total_prev_err = 0;
-
+ BASISU_FRONTEND_VERIFY(cluster_indices.size());
+
+ const uint32_t total_pixels = (uint32_t)cluster_indices.size() * 8;
+
+ basisu::vector<color_rgba> cluster_pixels(total_pixels);
+
+ for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
{
- color_rgba block_colors[4];
+ const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+ const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
- etc_block::get_block_colors5(block_colors, prev_etc_params.m_color_unscaled[0], prev_etc_params.m_inten_table[0], false);
+ const bool flipped = true;
- uint64_t total_err = 0;
+ const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
- for (uint32_t i = 0; i < total_pixels; i++)
+ for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)
{
- const color_rgba &c = cluster_pixels[i];
+ const color_rgba& c = pBlock_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];
+ cluster_pixels[cluster_indices_iter * 8 + pixel_index] = c;
+ }
+ }
- uint64_t best_err = UINT64_MAX;
- //uint32_t best_index = 0;
+ endpoint_cluster_etc_params new_subblock_params;
+
+ {
+ etc1_optimizer optimizer;
+ etc1_solution_coordinates solutions[2];
+
+ etc1_optimizer::params cluster_optimizer_params;
+ cluster_optimizer_params.m_num_src_pixels = total_pixels;
+ cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];
+
+ cluster_optimizer_params.m_use_color4 = false;
+ cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
+
+ if (m_params.m_compression_level <= 1)
+ cluster_optimizer_params.m_quality = cETCQualityMedium;
+ else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+ cluster_optimizer_params.m_quality = cETCQualityUber;
+
+ etc1_optimizer::results cluster_optimizer_results;
+
+ basisu::vector<uint8_t> cluster_selectors(total_pixels);
+ cluster_optimizer_results.m_n = total_pixels;
+ cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
+
+ optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
+
+ if (!optimizer.compute())
+ BASISU_FRONTEND_VERIFY(false);
+
+ new_subblock_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
+ new_subblock_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
+ new_subblock_params.m_color_error[0] = cluster_optimizer_results.m_error;
+ }
+
+ endpoint_cluster_etc_params& prev_etc_params = m_endpoint_cluster_etc_params[cluster_index];
+
+ bool use_new_subblock_params = false;
+ if ((!step) || (!prev_etc_params.m_valid))
+ use_new_subblock_params = true;
+ else
+ {
+ assert(prev_etc_params.m_valid);
+
+ uint64_t total_prev_err = 0;
+
+ {
+ color_rgba block_colors[4];
+
+ etc_block::get_block_colors5(block_colors, prev_etc_params.m_color_unscaled[0], prev_etc_params.m_inten_table[0], false);
- for (uint32_t s = 0; s < 4; s++)
+ uint64_t total_err = 0;
+
+ for (uint32_t i = 0; i < total_pixels; i++)
{
- uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);
- if (err < best_err)
+ const color_rgba& c = cluster_pixels[i];
+
+ uint64_t best_err = UINT64_MAX;
+ //uint32_t best_index = 0;
+
+ for (uint32_t s = 0; s < 4; s++)
{
- best_err = err;
- //best_index = s;
+ uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);
+ if (err < best_err)
+ {
+ best_err = err;
+ //best_index = s;
+ }
}
+
+ total_err += best_err;
}
- total_err += best_err;
+ total_prev_err += total_err;
}
- total_prev_err += total_err;
+ // See if we should update this cluster's endpoints (if the error has actually fallen)
+ if (total_prev_err > new_subblock_params.m_color_error[0])
+ {
+ use_new_subblock_params = true;
+ }
}
- // See if we should update this cluster's endpoints (if the error has actually fallen)
- if (total_prev_err > new_subblock_params.m_color_error[0])
+ if (use_new_subblock_params)
{
- use_new_subblock_params = true;
- }
- }
+ new_subblock_params.m_valid = true;
- if (use_new_subblock_params)
- {
- new_subblock_params.m_valid = true;
+ prev_etc_params = new_subblock_params;
+ }
- prev_etc_params = new_subblock_params;
- }
-
- } // cluster_index
+ } // cluster_index
#ifndef __EMSCRIPTEN__
- } );
+ });
#endif
- } // cluster_index_iter
+ } // cluster_index_iter
#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->wait_for_all();
+ m_params.m_pJob_pool->wait_for_all();
#endif
+ }
+
+ debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
}
bool basisu_frontend::check_etc1s_constraints() const
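
The OpenCL branch of generate_endpoint_codebook() above shrinks the per-cluster pixel data it uploads by radix-sorting each cluster's 32-bit RGBA values and collapsing runs of identical colors into (color, weight) pairs, with the weight saturating rather than wrapping. A small sketch of that sort-then-run-length step, using std::sort in place of basisu's radix_sort and made-up colors:

// De-duplicate colors so a kernel only scores unique values, weighted by count.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<uint32_t> colors = { 0xFF0000FFu, 0xFF0000FFu, 0xFF00FF00u,
                                     0xFF0000FFu, 0xFF00FF00u, 0xFFFFFFFFu };

    std::sort(colors.begin(), colors.end()); // the real code uses radix_sort on the 4 RGBA bytes

    std::vector<uint32_t> unique_colors;
    std::vector<uint32_t> weights;

    uint32_t prev_color = 0, cur_weight = 0;
    for (uint32_t cur_color : colors)
    {
        if ((cur_color == prev_color) && cur_weight)
        {
            if (++cur_weight == 0) // saturate instead of wrapping
                cur_weight--;
        }
        else
        {
            if (cur_weight)
            {
                unique_colors.push_back(prev_color);
                weights.push_back(cur_weight);
            }
            prev_color = cur_color;
            cur_weight = 1;
        }
    }
    if (cur_weight)
    {
        unique_colors.push_back(prev_color);
        weights.push_back(cur_weight);
    }

    for (size_t i = 0; i < unique_colors.size(); i++)
        std::printf("color 0x%08X weight %u\n", unique_colors[i], weights[i]);

    return 0;
}
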
@@ -1373,6 +1684,8 @@ namespace basisu
return true;
}
+ // For each block, determine which ETC1S endpoint cluster can encode that block with lowest error.
+ // This reassigns blocks to different endpoint clusters.
uint32_t basisu_frontend::refine_endpoint_clusterization()
{
debug_printf("refine_endpoint_clusterization\n");
@@ -1380,6 +1693,8 @@ namespace basisu
if (m_use_hierarchical_endpoint_codebooks)
compute_endpoint_clusters_within_each_parent_cluster();
+ // Note: It's possible that an endpoint cluster may live in more than one parent cluster after the first refinement step.
+
basisu::vector<vec2U> block_clusters(m_total_blocks);
for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)
@@ -1400,156 +1715,255 @@ namespace basisu
// Create a new endpoint clusterization
+ interval_timer tm;
+ tm.start();
+
uint_vec best_cluster_indices(m_total_blocks);
- const uint32_t N = 1024;
- for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+ bool use_cpu = true;
+ // TODO: Support non-hierarchical endpoint codebooks here
+ if (m_params.m_pOpenCL_context && m_use_hierarchical_endpoint_codebooks)
{
- const uint32_t first_index = block_index_iter;
- const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
+ // For the OpenCL kernel, we order the parent endpoint clusters by smallest to largest for efficiency.
+ // We also prepare an array of block info structs that point into this new parent endpoint cluster array.
+ const uint32_t total_parent_clusters = m_endpoint_clusters_within_each_parent_cluster.size();
-#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->add_job( [this, first_index, last_index, &best_cluster_indices, &block_clusters] {
-#endif
+ basisu::vector<cl_block_info_struct> cl_block_info_structs(m_total_blocks);
+
+ // the size of each parent cluster, in total clusters
+ uint_vec parent_cluster_sizes(total_parent_clusters);
+ for (uint32_t i = 0; i < total_parent_clusters; i++)
+ parent_cluster_sizes[i] = m_endpoint_clusters_within_each_parent_cluster[i].size();
+
+ uint_vec first_parent_cluster_ofs(total_parent_clusters);
+ uint32_t cur_ofs = 0;
+ for (uint32_t i = 0; i < total_parent_clusters; i++)
+ {
+ first_parent_cluster_ofs[i] = cur_ofs;
- for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+ cur_ofs += parent_cluster_sizes[i];
+ }
+
+ // Note: total_actual_endpoint_clusters is not necessarily equal to m_endpoint_clusters.size(), because clusters may live in multiple parent clusters after the first refinement step.
+ BASISU_FRONTEND_VERIFY(cur_ofs >= m_endpoint_clusters.size());
+ const uint32_t total_actual_endpoint_clusters = cur_ofs;
+ basisu::vector<cl_endpoint_cluster_struct> cl_endpoint_cluster_structs(total_actual_endpoint_clusters);
+
+ for (uint32_t i = 0; i < total_parent_clusters; i++)
+ {
+ const uint32_t dst_ofs = first_parent_cluster_ofs[i];
+
+ const uint32_t parent_cluster_size = parent_cluster_sizes[i];
+
+ assert(m_endpoint_clusters_within_each_parent_cluster[i].size() == parent_cluster_size);
+
+ for (uint32_t j = 0; j < parent_cluster_size; j++)
{
- const uint32_t cluster_index = block_clusters[block_index][0];
- BASISU_FRONTEND_VERIFY(cluster_index == block_clusters[block_index][1]);
+ const uint32_t endpoint_cluster_index = m_endpoint_clusters_within_each_parent_cluster[i][j];
- const color_rgba *pSubblock_pixels = get_source_pixel_block(block_index).get_ptr();
- const uint32_t num_subblock_pixels = 16;
+ color_rgba cluster_etc_base_color(m_endpoint_cluster_etc_params[endpoint_cluster_index].m_color_unscaled[0]);
+ uint32_t cluster_etc_inten = m_endpoint_cluster_etc_params[endpoint_cluster_index].m_inten_table[0];
- uint64_t best_cluster_err = INT64_MAX;
- uint32_t best_cluster_index = 0;
+ cl_endpoint_cluster_structs[dst_ofs + j].m_unscaled_color = cluster_etc_base_color;
+ cl_endpoint_cluster_structs[dst_ofs + j].m_etc_inten = (uint8_t)cluster_etc_inten;
+ cl_endpoint_cluster_structs[dst_ofs + j].m_cluster_index = (uint16_t)endpoint_cluster_index;
+ }
+ }
+
+ for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+ {
+ const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster[block_index];
+
+ cl_block_info_structs[block_index].m_num_clusters = (uint16_t)(parent_cluster_sizes[block_parent_endpoint_cluster_index]);
+ cl_block_info_structs[block_index].m_first_cluster_ofs = (uint16_t)(first_parent_cluster_ofs[block_parent_endpoint_cluster_index]);
- const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster.size() ? m_block_parent_endpoint_cluster[block_index] : 0;
- const uint_vec *pCluster_indices = m_endpoint_clusters_within_each_parent_cluster.size() ? &m_endpoint_clusters_within_each_parent_cluster[block_parent_endpoint_cluster_index] : nullptr;
+ const uint32_t block_cluster_index = block_clusters[block_index][0];
+ cl_block_info_structs[block_index].m_cur_cluster_index = (uint16_t)block_cluster_index;
+ cl_block_info_structs[block_index].m_cur_cluster_etc_inten = (uint8_t)m_endpoint_cluster_etc_params[block_cluster_index].m_inten_table[0];
+ }
+
+ uint_vec block_cluster_indices(m_total_blocks);
+ for (uint32_t i = 0; i < m_total_blocks; i++)
+ block_cluster_indices[i] = block_clusters[i][0];
- const uint32_t total_clusters = m_use_hierarchical_endpoint_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_endpoint_clusters.size();
+ uint_vec sorted_block_indices(m_total_blocks);
+ indirect_sort(m_total_blocks, sorted_block_indices.data(), block_cluster_indices.data());
- for (uint32_t i = 0; i < total_clusters; i++)
+ bool status = opencl_refine_endpoint_clusterization(
+ m_params.m_pOpenCL_context,
+ cl_block_info_structs.data(),
+ total_actual_endpoint_clusters,
+ cl_endpoint_cluster_structs.data(),
+ sorted_block_indices.data(),
+ best_cluster_indices.data(),
+ m_params.m_perceptual);
+
+ if (status)
+ {
+ use_cpu = false;
+ }
+ else
+ {
+ error_printf("basisu_frontend::refine_endpoint_clusterization: opencl_refine_endpoint_clusterization() failed! Using CPU.\n");
+ m_params.m_pOpenCL_context = nullptr;
+ m_opencl_failed = true;
+ }
+ }
+
+ if (use_cpu)
+ {
+ const uint32_t N = 1024;
+ for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+ {
+ const uint32_t first_index = block_index_iter;
+ const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
+
+#ifndef __EMSCRIPTEN__
+ m_params.m_pJob_pool->add_job([this, first_index, last_index, &best_cluster_indices, &block_clusters] {
+#endif
+
+ for (uint32_t block_index = first_index; block_index < last_index; block_index++)
{
- const uint32_t cluster_iter = m_use_hierarchical_endpoint_codebooks ? (*pCluster_indices)[i] : i;
+ const uint32_t cluster_index = block_clusters[block_index][0];
+ BASISU_FRONTEND_VERIFY(cluster_index == block_clusters[block_index][1]);
- color_rgba cluster_etc_base_color(m_endpoint_cluster_etc_params[cluster_iter].m_color_unscaled[0]);
- uint32_t cluster_etc_inten = m_endpoint_cluster_etc_params[cluster_iter].m_inten_table[0];
+ const color_rgba* pSubblock_pixels = get_source_pixel_block(block_index).get_ptr();
+ const uint32_t num_subblock_pixels = 16;
- uint64_t total_err = 0;
+ uint64_t best_cluster_err = INT64_MAX;
+ uint32_t best_cluster_index = 0;
- const uint32_t low_selector = 0;//subblock_etc_params_vec[j].m_low_selectors[0];
- const uint32_t high_selector = 3;//subblock_etc_params_vec[j].m_high_selectors[0];
- color_rgba subblock_colors[4];
- // Can't assign it here - may result in too much error when selector quant occurs
- if (cluster_etc_inten > m_endpoint_cluster_etc_params[cluster_index].m_inten_table[0])
- {
- total_err = INT64_MAX;
- goto skip_cluster;
- }
+ const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster.size() ? m_block_parent_endpoint_cluster[block_index] : 0;
+ const uint_vec* pCluster_indices = m_endpoint_clusters_within_each_parent_cluster.size() ? &m_endpoint_clusters_within_each_parent_cluster[block_parent_endpoint_cluster_index] : nullptr;
- etc_block::get_block_colors5(subblock_colors, cluster_etc_base_color, cluster_etc_inten);
-
-#if 0
- for (uint32_t p = 0; p < num_subblock_pixels; p++)
+ const uint32_t total_clusters = m_use_hierarchical_endpoint_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_endpoint_clusters.size();
+
+ for (uint32_t i = 0; i < total_clusters; i++)
{
- uint64_t best_err = UINT64_MAX;
+ const uint32_t cluster_iter = m_use_hierarchical_endpoint_codebooks ? (*pCluster_indices)[i] : i;
- for (uint32_t r = low_selector; r <= high_selector; r++)
+ color_rgba cluster_etc_base_color(m_endpoint_cluster_etc_params[cluster_iter].m_color_unscaled[0]);
+ uint32_t cluster_etc_inten = m_endpoint_cluster_etc_params[cluster_iter].m_inten_table[0];
+
+ uint64_t total_err = 0;
+
+ const uint32_t low_selector = 0;//subblock_etc_params_vec[j].m_low_selectors[0];
+ const uint32_t high_selector = 3;//subblock_etc_params_vec[j].m_high_selectors[0];
+ color_rgba subblock_colors[4];
+ // Can't assign it here - may result in too much error when selector quant occurs
+ if (cluster_etc_inten > m_endpoint_cluster_etc_params[cluster_index].m_inten_table[0])
{
- uint64_t err = color_distance(m_params.m_perceptual, pSubblock_pixels[p], subblock_colors[r], false);
- best_err = minimum(best_err, err);
- if (!best_err)
- break;
+ total_err = INT64_MAX;
+ goto skip_cluster;
}
- total_err += best_err;
- if (total_err > best_cluster_err)
- break;
- } // p
-#else
- if (m_params.m_perceptual)
- {
- if (!g_cpu_supports_sse41)
+ etc_block::get_block_colors5(subblock_colors, cluster_etc_base_color, cluster_etc_inten);
+
+#if 0
+ for (uint32_t p = 0; p < num_subblock_pixels; p++)
{
- for (uint32_t p = 0; p < num_subblock_pixels; p++)
+ uint64_t best_err = UINT64_MAX;
+
+ for (uint32_t r = low_selector; r <= high_selector; r++)
{
- uint64_t best_err = UINT64_MAX;
+ uint64_t err = color_distance(m_params.m_perceptual, pSubblock_pixels[p], subblock_colors[r], false);
+ best_err = minimum(best_err, err);
+ if (!best_err)
+ break;
+ }
- for (uint32_t r = low_selector; r <= high_selector; r++)
+ total_err += best_err;
+ if (total_err > best_cluster_err)
+ break;
+ } // p
+#else
+ if (m_params.m_perceptual)
+ {
+ if (!g_cpu_supports_sse41)
+ {
+ for (uint32_t p = 0; p < num_subblock_pixels; p++)
{
- uint64_t err = color_distance(true, pSubblock_pixels[p], subblock_colors[r], false);
- best_err = minimum(best_err, err);
- if (!best_err)
+ uint64_t best_err = UINT64_MAX;
+
+ for (uint32_t r = low_selector; r <= high_selector; r++)
+ {
+ uint64_t err = color_distance(true, pSubblock_pixels[p], subblock_colors[r], false);
+ best_err = minimum(best_err, err);
+ if (!best_err)
+ break;
+ }
+
+ total_err += best_err;
+ if (total_err > best_cluster_err)
break;
- }
-
- total_err += best_err;
- if (total_err > best_cluster_err)
- break;
- } // p
- }
- else
- {
+ } // p
+ }
+ else
+ {
#if BASISU_SUPPORT_SSE
- find_lowest_error_perceptual_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);
+ find_lowest_error_perceptual_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);
#endif
+ }
}
- }
- else
- {
- if (!g_cpu_supports_sse41)
+ else
{
- for (uint32_t p = 0; p < num_subblock_pixels; p++)
+ if (!g_cpu_supports_sse41)
{
- uint64_t best_err = UINT64_MAX;
-
- for (uint32_t r = low_selector; r <= high_selector; r++)
+ for (uint32_t p = 0; p < num_subblock_pixels; p++)
{
- uint64_t err = color_distance(false, pSubblock_pixels[p], subblock_colors[r], false);
- best_err = minimum(best_err, err);
- if (!best_err)
+ uint64_t best_err = UINT64_MAX;
+
+ for (uint32_t r = low_selector; r <= high_selector; r++)
+ {
+ uint64_t err = color_distance(false, pSubblock_pixels[p], subblock_colors[r], false);
+ best_err = minimum(best_err, err);
+ if (!best_err)
+ break;
+ }
+
+ total_err += best_err;
+ if (total_err > best_cluster_err)
break;
- }
-
- total_err += best_err;
- if (total_err > best_cluster_err)
- break;
- } // p
- }
- else
- {
+ } // p
+ }
+ else
+ {
#if BASISU_SUPPORT_SSE
- find_lowest_error_linear_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);
+ find_lowest_error_linear_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);
#endif
+ }
}
- }
#endif
- skip_cluster:
- if ((total_err < best_cluster_err) ||
- ((cluster_iter == cluster_index) && (total_err == best_cluster_err)))
- {
- best_cluster_err = total_err;
- best_cluster_index = cluster_iter;
-
- if (!best_cluster_err)
- break;
- }
- } // j
-
- best_cluster_indices[block_index] = best_cluster_index;
+ skip_cluster:
+ if ((total_err < best_cluster_err) ||
+ ((cluster_iter == cluster_index) && (total_err == best_cluster_err)))
+ {
+ best_cluster_err = total_err;
+ best_cluster_index = cluster_iter;
- } // block_index
+ if (!best_cluster_err)
+ break;
+ }
+ } // j
+
+ best_cluster_indices[block_index] = best_cluster_index;
+
+ } // block_index
#ifndef __EMSCRIPTEN__
- } );
+ });
#endif
-
- } // block_index_iter
+
+ } // block_index_iter
#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->wait_for_all();
+ m_params.m_pJob_pool->wait_for_all();
#endif
+
+ } // use_cpu
+
+ debug_printf("refine_endpoint_clusterization time: %3.3f secs\n", tm.get_elapsed_secs());
basisu::vector<typename basisu::vector<uint32_t> > optimized_endpoint_clusters(m_endpoint_clusters.size());
uint32_t total_subblocks_reassigned = 0;
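
Before invoking the OpenCL kernel, refine_endpoint_clusterization() above flattens the per-parent endpoint cluster lists into one contiguous array: an exclusive prefix sum over the parent cluster sizes yields a (first offset, count) pair per parent, which is what each cl_block_info_struct carries. A sketch of that flattening on plain integers (the data here is a placeholder, not the real structs):

// Flatten vector-of-vectors into one array plus (offset, count) per parent.
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    // Stand-in for m_endpoint_clusters_within_each_parent_cluster.
    std::vector<std::vector<uint32_t>> parents = { { 5, 9 }, { 2 }, { 7, 3, 1 } };

    const uint32_t total_parents = (uint32_t)parents.size();

    std::vector<uint32_t> first_ofs(total_parents);
    uint32_t cur_ofs = 0;
    for (uint32_t i = 0; i < total_parents; i++)
    {
        first_ofs[i] = cur_ofs;              // exclusive prefix sum of the sizes
        cur_ofs += (uint32_t)parents[i].size();
    }

    std::vector<uint32_t> flat(cur_ofs);     // all clusters, parent by parent
    for (uint32_t i = 0; i < total_parents; i++)
        for (uint32_t j = 0; j < parents[i].size(); j++)
            flat[first_ofs[i] + j] = parents[i][j];

    // A block whose parent cluster is parent 2 only needs (first_ofs, count)
    // to scan its candidate endpoint clusters, with no nested containers.
    const uint32_t parent = 2;
    for (uint32_t j = 0; j < parents[parent].size(); j++)
        std::printf("candidate cluster %u\n", flat[first_ofs[parent] + j]);

    return 0;
}
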
@@ -1647,51 +2061,91 @@ namespace basisu
void basisu_frontend::create_initial_packed_texture()
{
debug_printf("create_initial_packed_texture\n");
+
+ interval_timer tm;
+ tm.start();
- const uint32_t N = 4096;
- for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+ bool use_cpu = true;
+
+ if ((m_params.m_pOpenCL_context) && (opencl_is_available()))
{
- const uint32_t first_index = block_index_iter;
- const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
+ basisu::vector<color_rgba> block_etc5_color_intens(m_total_blocks);
+
+ for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+ {
+ uint32_t cluster0 = m_block_endpoint_clusters_indices[block_index][0];
+
+ const color_rgba& color_unscaled = m_endpoint_cluster_etc_params[cluster0].m_color_unscaled[0];
+ uint32_t inten = m_endpoint_cluster_etc_params[cluster0].m_inten_table[0];
+
+ block_etc5_color_intens[block_index].set(color_unscaled.r, color_unscaled.g, color_unscaled.b, inten);
+ }
+
+ bool status = opencl_determine_selectors(m_params.m_pOpenCL_context, block_etc5_color_intens.data(),
+ m_encoded_blocks.data(),
+ m_params.m_perceptual);
+ if (!status)
+ {
+ error_printf("basisu_frontend::create_initial_packed_texture: opencl_determine_selectors() failed! Using CPU.\n");
+ m_params.m_pOpenCL_context = nullptr;
+ m_opencl_failed = true;
+ }
+ else
+ {
+ use_cpu = false;
+ }
+ }
+
+ if (use_cpu)
+ {
+ const uint32_t N = 4096;
+ for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+ {
+ const uint32_t first_index = block_index_iter;
+ const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->add_job( [this, first_index, last_index] {
+ m_params.m_pJob_pool->add_job([this, first_index, last_index] {
#endif
-
- for (uint32_t block_index = first_index; block_index < last_index; block_index++)
- {
- uint32_t cluster0 = m_block_endpoint_clusters_indices[block_index][0];
- uint32_t cluster1 = m_block_endpoint_clusters_indices[block_index][1];
- BASISU_FRONTEND_VERIFY(cluster0 == cluster1);
- const color_rgba *pSource_pixels = get_source_pixel_block(block_index).get_ptr();
+ for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+ {
+ uint32_t cluster0 = m_block_endpoint_clusters_indices[block_index][0];
+ uint32_t cluster1 = m_block_endpoint_clusters_indices[block_index][1];
+ BASISU_FRONTEND_VERIFY(cluster0 == cluster1);
- etc_block &blk = m_encoded_blocks[block_index];
+ const color_rgba* pSource_pixels = get_source_pixel_block(block_index).get_ptr();
- color_rgba unscaled[2] = { m_endpoint_cluster_etc_params[cluster0].m_color_unscaled[0], m_endpoint_cluster_etc_params[cluster1].m_color_unscaled[0] };
- uint32_t inten[2] = { m_endpoint_cluster_etc_params[cluster0].m_inten_table[0], m_endpoint_cluster_etc_params[cluster1].m_inten_table[0] };
-
- blk.set_block_color5(unscaled[0], unscaled[1]);
- blk.set_flip_bit(true);
+ etc_block& blk = m_encoded_blocks[block_index];
- blk.set_inten_table(0, inten[0]);
- blk.set_inten_table(1, inten[1]);
+ color_rgba unscaled[2] = { m_endpoint_cluster_etc_params[cluster0].m_color_unscaled[0], m_endpoint_cluster_etc_params[cluster1].m_color_unscaled[0] };
+ uint32_t inten[2] = { m_endpoint_cluster_etc_params[cluster0].m_inten_table[0], m_endpoint_cluster_etc_params[cluster1].m_inten_table[0] };
- blk.determine_selectors(pSource_pixels, m_params.m_perceptual);
-
- } // block_index
+ blk.set_block_color5(unscaled[0], unscaled[1]);
+ blk.set_flip_bit(true);
+
+ blk.set_inten_table(0, inten[0]);
+ blk.set_inten_table(1, inten[1]);
+
+ blk.determine_selectors(pSource_pixels, m_params.m_perceptual);
+
+ } // block_index
#ifndef __EMSCRIPTEN__
- } );
+ });
#endif
- } // block_index_iter
+ } // block_index_iter
#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->wait_for_all();
+ m_params.m_pJob_pool->wait_for_all();
#endif
+ } // use_cpu
+
m_orig_encoded_blocks = m_encoded_blocks;
+
+ debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
}
void basisu_frontend::compute_selector_clusters_within_each_parent_cluster()
@@ -1739,8 +2193,7 @@ namespace basisu
void basisu_frontend::generate_selector_clusters()
{
debug_printf("generate_selector_clusters\n");
-
- typedef vec<16, float> vec16F;
+
typedef tree_vector_quant<vec16F> vec16F_clusterizer;
vec16F_clusterizer::array_of_weighted_training_vecs training_vecs(m_total_blocks);
@@ -1800,12 +2253,14 @@ namespace basisu
uint32_t max_threads = 0;
max_threads = m_params.m_multithreaded ? minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;
+ if (m_params.m_pJob_pool)
+ max_threads = minimum<int>((int)m_params.m_pJob_pool->get_total_threads(), max_threads);
bool status = generate_hierarchical_codebook_threaded(selector_clusterizer,
m_params.m_max_selector_clusters, m_use_hierarchical_selector_codebooks ? parent_codebook_size : 0,
m_selector_cluster_block_indices,
m_selector_parent_cluster_block_indices,
- max_threads, m_params.m_pJob_pool);
+ max_threads, m_params.m_pJob_pool, false);
BASISU_FRONTEND_VERIFY(status);
if (m_use_hierarchical_selector_codebooks)
@@ -1864,235 +2319,105 @@ namespace basisu
{
debug_printf("create_optimized_selector_codebook\n");
+ interval_timer tm;
+ tm.start();
+
const uint32_t total_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size();
debug_printf("Total selector clusters (from m_selector_cluster_block_indices.size()): %u\n", (uint32_t)m_selector_cluster_block_indices.size());
m_optimized_cluster_selectors.resize(total_selector_clusters);
+
+ uint32_t total_clusters_processed = 0;
- if ((m_params.m_pGlobal_sel_codebook) && (!m_params.m_use_hybrid_selector_codebooks))
+ // For each selector codebook entry, and for each of the 4x4 selectors, determine which selector minimizes the error across all the blocks that use that quantized selector.
+ const uint32_t N = 256;
+ for (uint32_t cluster_index_iter = 0; cluster_index_iter < total_selector_clusters; cluster_index_iter += N)
{
- uint32_t total_clusters_processed = 0;
-
- m_optimized_cluster_selector_global_cb_ids.resize(total_selector_clusters);
+ const uint32_t first_index = cluster_index_iter;
+ const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);
- const uint32_t N = 256;
- for (uint32_t cluster_index_iter = 0; cluster_index_iter < total_selector_clusters; cluster_index_iter += N)
- {
- const uint32_t first_index = cluster_index_iter;
- const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);
-
-#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->add_job( [this, first_index, last_index, &total_clusters_processed, &total_selector_clusters] {
-#endif
-
- for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
- {
- const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[cluster_index];
-
- if (!cluster_block_indices.size())
- continue;
-
- etc_block_vec etc_blocks;
- pixel_block_vec pixel_blocks;
-
- for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
- {
- const uint32_t block_index = cluster_block_indices[cluster_block_index];
-
- etc_blocks.push_back(m_encoded_blocks[block_index]);
-
- pixel_blocks.push_back(get_source_pixel_block(block_index));
- }
-
- uint32_t palette_index;
- basist::etc1_global_palette_entry_modifier palette_modifier;
-
- #if 0
- m_params.m_pGlobal_sel_codebook->find_best_entry(etc_blocks.size(), pixel_blocks.get_ptr(), etc_blocks.get_ptr(),
- palette_index, palette_modifier,
- m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
- #else
- etc1_global_selector_codebook_find_best_entry(*m_params.m_pGlobal_sel_codebook,
- (uint32_t)etc_blocks.size(), &pixel_blocks[0], &etc_blocks[0],
- palette_index, palette_modifier,
- m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
- #endif
-
- m_optimized_cluster_selector_global_cb_ids[cluster_index].set(palette_index, palette_modifier);
-
- basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(palette_index, palette_modifier));
-
- for (uint32_t y = 0; y < 4; y++)
- for (uint32_t x = 0; x < 4; x++)
- m_optimized_cluster_selectors[cluster_index].set_selector(x, y, pal_entry(x, y));
-
- {
- std::lock_guard<std::mutex> lock(m_lock);
-
- total_clusters_processed++;
- if ((total_clusters_processed % 63) == 0)
- debug_printf("Global selector palette optimization: %3.1f%% complete\n", total_clusters_processed * 100.0f / total_selector_clusters);
- }
-
- } // cluster_index
-
-#ifndef __EMSCRIPTEN__
- } );
+#ifndef __EMSCRIPTEN__
+ m_params.m_pJob_pool->add_job([this, first_index, last_index, &total_clusters_processed, &total_selector_clusters] {
#endif
- } // cluster_index_iter
-
-#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->wait_for_all();
-#endif
- }
- else
- {
- const bool uses_hybrid_sel_codebook = ((m_params.m_pGlobal_sel_codebook) && (m_params.m_use_hybrid_selector_codebooks));
- if (uses_hybrid_sel_codebook)
- {
- m_selector_cluster_uses_global_cb.resize(total_selector_clusters);
- m_optimized_cluster_selector_global_cb_ids.resize(total_selector_clusters);
- }
+ for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+ {
+ const basisu::vector<uint32_t>& cluster_block_indices = m_selector_cluster_block_indices[cluster_index];
- uint32_t total_clusters_processed = 0;
+ if (!cluster_block_indices.size())
+ continue;
- // For each selector codebook entry, and for each of the 4x4 selectors, determine which selector minimizes the error across all the blocks that use that quantized selector.
+ uint64_t overall_best_err = 0;
- const uint32_t N = 256;
- for (uint32_t cluster_index_iter = 0; cluster_index_iter < total_selector_clusters; cluster_index_iter += N)
- {
- const uint32_t first_index = cluster_index_iter;
- const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);
+ uint64_t total_err[4][4][4];
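+ // total_err[y][x][s]: total error over all blocks in this cluster if selector value s is used at pixel (x, y).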
+ clear_obj(total_err);
-#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->add_job( [this, first_index, last_index, &uses_hybrid_sel_codebook, &total_clusters_processed, &total_selector_clusters] {
-#endif
-
- for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+ for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
{
- const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[cluster_index];
+ const uint32_t block_index = cluster_block_indices[cluster_block_index];
- if (!cluster_block_indices.size())
- continue;
+ const etc_block& blk = m_encoded_blocks[block_index];
- uint64_t overall_best_err = 0;
+ color_rgba blk_colors[4];
+ blk.get_block_colors(blk_colors, 0);
for (uint32_t y = 0; y < 4; y++)
{
for (uint32_t x = 0; x < 4; x++)
{
- uint64_t best_err = UINT64_MAX;
- uint32_t best_s = 0;
+ const color_rgba& orig_color = get_source_pixel_block(block_index)(x, y);
- for (uint32_t s = 0; s < 4; s++)
+ if (m_params.m_perceptual)
{
- uint32_t total_err = 0;
-
- for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
- {
- const uint32_t block_index = cluster_block_indices[cluster_block_index];
-
- const etc_block &blk = m_encoded_blocks[block_index];
-
- const color_rgba &orig_color = get_source_pixel_block(block_index)(x, y);
-
- color_rgba block_color;
- blk.get_block_color(block_color, blk.get_subblock_index(x, y), s);
- total_err += color_distance(m_params.m_perceptual, block_color, orig_color, false);
-
- if (total_err > best_err)
- break;
-
- } // block_index
-
- if (total_err < best_err)
- {
- best_err = total_err;
- best_s = s;
- if (!best_err)
- break;
- }
-
- } // s
-
- m_optimized_cluster_selectors[cluster_index].set_selector(x, y, best_s);
-
- overall_best_err += best_err;
-
+ for (uint32_t s = 0; s < 4; s++)
+ total_err[y][x][s] += color_distance(true, blk_colors[s], orig_color, false);
+ }
+ else
+ {
+ for (uint32_t s = 0; s < 4; s++)
+ total_err[y][x][s] += color_distance(false, blk_colors[s], orig_color, false);
+ }
} // x
} // y
- if (uses_hybrid_sel_codebook)
- {
- etc_block_vec etc_blocks;
- pixel_block_vec pixel_blocks;
+ } // cluster_block_index
- for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
- {
- const uint32_t block_index = cluster_block_indices[cluster_block_index];
-
- etc_blocks.push_back(m_encoded_blocks[block_index]);
-
- pixel_blocks.push_back(get_source_pixel_block(block_index));
- }
-
- uint32_t palette_index;
- basist::etc1_global_palette_entry_modifier palette_modifier;
-
- #if 0
- uint64_t best_global_cb_err = m_params.m_pGlobal_sel_codebook->find_best_entry(etc_blocks.size(), pixel_blocks.get_ptr(), etc_blocks.get_ptr(),
- palette_index, palette_modifier,
- m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
- #else
- uint64_t best_global_cb_err = etc1_global_selector_codebook_find_best_entry(*m_params.m_pGlobal_sel_codebook, (uint32_t)etc_blocks.size(), &pixel_blocks[0], &etc_blocks[0],
- palette_index, palette_modifier,
- m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
- #endif
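+ // For each of the 16 pixel positions, pick the selector value with the lowest accumulated error.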
+ for (uint32_t y = 0; y < 4; y++)
+ {
+ for (uint32_t x = 0; x < 4; x++)
+ {
+ uint64_t best_err = total_err[y][x][0];
+ uint8_t best_sel = 0;
- if (best_global_cb_err <= overall_best_err * m_params.m_hybrid_codebook_quality_thresh)
+ for (uint32_t s = 1; s < 4; s++)
{
- m_selector_cluster_uses_global_cb[cluster_index] = true;
-
- m_optimized_cluster_selector_global_cb_ids[cluster_index].set(palette_index, palette_modifier);
-
- basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(palette_index, palette_modifier));
-
- for (uint32_t y = 0; y < 4; y++)
- for (uint32_t x = 0; x < 4; x++)
- m_optimized_cluster_selectors[cluster_index].set_selector(x, y, pal_entry(x, y));
+ if (total_err[y][x][s] < best_err)
+ {
+ best_err = total_err[y][x][s];
+ best_sel = (uint8_t)s;
+ }
}
- else
- {
- m_optimized_cluster_selector_global_cb_ids[cluster_index].set(0, basist::etc1_global_palette_entry_modifier(0));
- m_selector_cluster_uses_global_cb[cluster_index] = false;
- }
- }
+ m_optimized_cluster_selectors[cluster_index].set_selector(x, y, best_sel);
- if (uses_hybrid_sel_codebook)
- {
- std::lock_guard<std::mutex> lock(m_lock);
-
- total_clusters_processed++;
- if ((total_clusters_processed % 63) == 0)
- debug_printf("Global selector palette optimization: %3.1f%% complete\n", total_clusters_processed * 100.0f / total_selector_clusters);
- }
+ overall_best_err += best_err;
+ } // x
+ } // y
- } // cluster_index
+ } // cluster_index
#ifndef __EMSCRIPTEN__
- } );
+ });
#endif
- } // cluster_index_iter
+ } // cluster_index_iter
#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->wait_for_all();
+ m_params.m_pJob_pool->wait_for_all();
#endif
- } // if (m_params.m_pGlobal_sel_codebook)
+ debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
if (m_params.m_debug_images)
{
@@ -2133,17 +2458,25 @@ namespace basisu
}
}
+ // For each block: Determine which quantized selector cluster best encodes that block, given its quantized endpoints.
+ // Note that this method may leave some empty clusters (i.e. arrays with no block indices), including at the end.
void basisu_frontend::find_optimal_selector_clusters_for_each_block()
{
debug_printf("find_optimal_selector_clusters_for_each_block\n");
- // Sanity checks
- BASISU_FRONTEND_VERIFY(m_selector_cluster_block_indices.size() == m_optimized_cluster_selectors.size());
- for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++)
+ interval_timer tm;
+ tm.start();
+
+ if (m_params.m_validate)
{
- for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[i].size(); j++)
+ // Sanity checks
+ BASISU_FRONTEND_VERIFY(m_selector_cluster_block_indices.size() == m_optimized_cluster_selectors.size());
+ for (uint32_t i = 0; i < m_selector_clusters_within_each_parent_cluster.size(); i++)
{
- BASISU_FRONTEND_VERIFY(m_selector_clusters_within_each_parent_cluster[i][j] < m_optimized_cluster_selectors.size());
+ for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[i].size(); j++)
+ {
+ BASISU_FRONTEND_VERIFY(m_selector_clusters_within_each_parent_cluster[i][j] < m_optimized_cluster_selectors.size());
+ }
}
}
@@ -2151,20 +2484,120 @@ namespace basisu
if (m_params.m_compression_level == 0)
{
- // Don't do anything, just leave the blocks in their original selector clusters.
- for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
+ // Just leave the blocks in their original selector clusters.
+ for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_block_indices.size(); selector_cluster_index++)
{
- for (uint32_t j = 0; j < m_selector_cluster_block_indices[i].size(); j++)
- m_block_selector_cluster_index[m_selector_cluster_block_indices[i][j]] = i;
+ for (uint32_t j = 0; j < m_selector_cluster_block_indices[selector_cluster_index].size(); j++)
+ {
+ const uint32_t block_index = m_selector_cluster_block_indices[selector_cluster_index][j];
+
+ m_block_selector_cluster_index[block_index] = selector_cluster_index;
+
+ etc_block& blk = m_encoded_blocks[block_index];
+ blk.set_raw_selector_bits(m_optimized_cluster_selectors[selector_cluster_index].get_raw_selector_bits());
+ }
}
+
+ debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
+
+ return;
}
- else
+
+ bool use_cpu = true;
+
+ if ((m_params.m_pOpenCL_context) && m_use_hierarchical_selector_codebooks)
{
- // Note that this method may leave some empty clusters (i.e. arrays with no block indices), including at the end.
- basisu::vector< basisu::vector<uint32_t> > new_cluster_indices(m_optimized_cluster_selectors.size());
+ const uint32_t num_parent_clusters = m_selector_clusters_within_each_parent_cluster.size();
+
+ basisu::vector<fosc_selector_struct> selector_structs;
+ selector_structs.reserve(m_optimized_cluster_selectors.size());
+
+ uint_vec parent_selector_cluster_offsets(num_parent_clusters);
+
+ uint_vec selector_cluster_indices;
+ selector_cluster_indices.reserve(m_optimized_cluster_selectors.size());
+
+ uint32_t cur_ofs = 0;
+ for (uint32_t parent_index = 0; parent_index < num_parent_clusters; parent_index++)
+ {
+ parent_selector_cluster_offsets[parent_index] = cur_ofs;
+
+ for (uint32_t j = 0; j < m_selector_clusters_within_each_parent_cluster[parent_index].size(); j++)
+ {
+ const uint32_t selector_cluster_index = m_selector_clusters_within_each_parent_cluster[parent_index][j];
+
+ uint32_t sel_bits = 0;
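+ // Pack the cluster's 16 2-bit selectors into one 32-bit word; pixel p occupies bits [2*p, 2*p+1].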
+ for (uint32_t p = 0; p < 16; p++)
+ sel_bits |= (m_optimized_cluster_selectors[selector_cluster_index].get_selector(p & 3, p >> 2) << (p * 2));
+
+ selector_structs.enlarge(1)->m_packed_selectors = sel_bits;
+
+ selector_cluster_indices.push_back(selector_cluster_index);
+ }
+
+ cur_ofs += m_selector_clusters_within_each_parent_cluster[parent_index].size();
+ }
+
+ const uint32_t total_input_selectors = cur_ofs;
- // For each block: Determine which quantized selectors best encode that block, given its quantized endpoints.
+ basisu::vector<fosc_block_struct> block_structs(m_total_blocks);
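+ // For each block, record its ETC1S base color/intensity table and the contiguous range of candidate selector clusters taken from its parent selector cluster.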
+ for (uint32_t i = 0; i < m_total_blocks; i++)
+ {
+ const uint32_t parent_selector_cluster = m_block_parent_selector_cluster[i];
+
+ const etc_block& blk = m_encoded_blocks[i];
+ blk.unpack_color5(block_structs[i].m_etc_color5_inten, blk.get_base5_color(), false);
+
+ block_structs[i].m_etc_color5_inten.a = (uint8_t)blk.get_inten_table(0);
+ block_structs[i].m_first_selector = parent_selector_cluster_offsets[parent_selector_cluster];
+ block_structs[i].m_num_selectors = m_selector_clusters_within_each_parent_cluster[parent_selector_cluster].size();
+ }
+
+ uint_vec output_selector_cluster_indices(m_total_blocks);
+
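+ // Hand the per-block candidate ranges and the packed selectors to the OpenCL kernel, which returns the best selector cluster index for each block.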
+ bool status = opencl_find_optimal_selector_clusters_for_each_block(
+ m_params.m_pOpenCL_context,
+ block_structs.data(),
+ total_input_selectors,
+ selector_structs.data(),
+ selector_cluster_indices.data(),
+ output_selector_cluster_indices.data(),
+ m_params.m_perceptual);
+
+ if (!status)
+ {
+ error_printf("basisu_frontend::find_optimal_selector_clusters_for_each_block: opencl_find_optimal_selector_clusters_for_each_block() failed! Using CPU.\n");
+ m_params.m_pOpenCL_context = nullptr;
+ m_opencl_failed = true;
+ }
+ else
+ {
+ for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
+ {
+ m_selector_cluster_block_indices[i].resize(0);
+ m_selector_cluster_block_indices[i].reserve(128);
+ }
+
+ for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+ {
+ etc_block& blk = m_encoded_blocks[block_index];
+
+ uint32_t best_cluster_index = output_selector_cluster_indices[block_index];
+
+ blk.set_raw_selector_bits(m_optimized_cluster_selectors[best_cluster_index].get_raw_selector_bits());
+ m_block_selector_cluster_index[block_index] = best_cluster_index;
+
+ vector_ensure_element_is_valid(m_selector_cluster_block_indices, best_cluster_index);
+ m_selector_cluster_block_indices[best_cluster_index].push_back(block_index);
+ }
+
+ use_cpu = false;
+ }
+ }
+
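+ // CPU path: used by default, and as the fallback when OpenCL is unavailable or failed above.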
+ if (use_cpu)
+ {
basisu::vector<uint8_t> unpacked_optimized_cluster_selectors(16 * m_optimized_cluster_selectors.size());
for (uint32_t cluster_index = 0; cluster_index < m_optimized_cluster_selectors.size(); cluster_index++)
{
@@ -2176,36 +2609,74 @@ namespace basisu
}
}
}
-
- const uint32_t N = 1024;
+
+ const uint32_t N = 2048;
for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
{
const uint32_t first_index = block_index_iter;
const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
-#ifndef __EMSCRIPTEN__
- m_params.m_pJob_pool->add_job( [this, first_index, last_index, &new_cluster_indices, &unpacked_optimized_cluster_selectors] {
-#endif
+ #ifndef __EMSCRIPTEN__
+ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &unpacked_optimized_cluster_selectors] {
+ #endif
+
+ int prev_best_cluster_index = 0;
for (uint32_t block_index = first_index; block_index < last_index; block_index++)
{
- const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
-
+ const pixel_block& block = get_source_pixel_block(block_index);
+
etc_block& blk = m_encoded_blocks[block_index];
-
+
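+ // If this block's source pixels exactly match the previous block's, reuse the previous block's best cluster instead of searching again.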
+ if ((block_index > first_index) && (block == get_source_pixel_block(block_index - 1)))
+ {
+ blk.set_raw_selector_bits(m_optimized_cluster_selectors[prev_best_cluster_index].get_raw_selector_bits());
+
+ m_block_selector_cluster_index[block_index] = prev_best_cluster_index;
+
+ continue;
+ }
+
+ const color_rgba* pBlock_pixels = block.get_ptr();
+
color_rgba trial_block_colors[4];
- blk.get_block_colors(trial_block_colors, 0);
+ blk.get_block_colors_etc1s(trial_block_colors);
// precompute errors for the i-th block pixel and selector sel: [sel][i]
uint32_t trial_errors[4][16];
-
- for (int sel = 0; sel < 4; ++sel)
+
+ if (m_params.m_perceptual)
{
- for (int i = 0; i < 16; ++i)
- {
- trial_errors[sel][i] = color_distance(m_params.m_perceptual, pBlock_pixels[i], trial_block_colors[sel], false);
- }
+ for (uint32_t sel = 0; sel < 4; ++sel)
+ for (uint32_t i = 0; i < 16; ++i)
+ trial_errors[sel][i] = color_distance(true, pBlock_pixels[i], trial_block_colors[sel], false);
}
+ else
+ {
+ for (uint32_t sel = 0; sel < 4; ++sel)
+ for (uint32_t i = 0; i < 16; ++i)
+ trial_errors[sel][i] = color_distance(false, pBlock_pixels[i], trial_block_colors[sel], false);
+ }
+
+ // Compute the minimum possible errors (given any selectors) for pixels 0-15
+ uint64_t min_possible_error_0_15 = 0;
+ for (uint32_t i = 0; i < 16; i++)
+ min_possible_error_0_15 += basisu::minimum(trial_errors[0][i], trial_errors[1][i], trial_errors[2][i], trial_errors[3][i]);
+
+ // Compute the minimum possible errors (given any selectors) for pixels 4-15
+ uint64_t min_possible_error_4_15 = 0;
+ for (uint32_t i = 4; i < 16; i++)
+ min_possible_error_4_15 += basisu::minimum(trial_errors[0][i], trial_errors[1][i], trial_errors[2][i], trial_errors[3][i]);
+
+ // Compute the minimum possible errors (given any selectors) for pixels 8-15
+ uint64_t min_possible_error_8_15 = 0;
+ for (uint32_t i = 8; i < 16; i++)
+ min_possible_error_8_15 += basisu::minimum(trial_errors[0][i], trial_errors[1][i], trial_errors[2][i], trial_errors[3][i]);
+
+ // Compute the minimum possible errors (given any selectors) for pixels 12-15
+ uint64_t min_possible_error_12_15 = 0;
+ for (uint32_t i = 12; i < 16; i++)
+ min_possible_error_12_15 += basisu::minimum(trial_errors[0][i], trial_errors[1][i], trial_errors[2][i], trial_errors[3][i]);
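+ // These lower bounds let the cluster search below reject a candidate as soon as its partial error, plus the best achievable error for the remaining pixels, already reaches the current best.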
uint64_t best_cluster_err = INT64_MAX;
uint32_t best_cluster_index = 0;
@@ -2215,7 +2686,7 @@ namespace basisu
const uint32_t total_clusters = m_use_hierarchical_selector_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_selector_cluster_block_indices.size();
-#if 0
+ #if 0
for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
{
const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
@@ -2246,99 +2717,73 @@ namespace basisu
early_out:
;
}
-#else
- if (m_params.m_perceptual)
- {
- for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
- {
- const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
- //const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index];
-
- uint64_t trial_err = 0;
-
- for (int i = 0; i < 16; i++)
- {
- const uint32_t sel = unpacked_optimized_cluster_selectors[cluster_index * 16 + i];
-
- trial_err += trial_errors[sel][i];
- if (trial_err > best_cluster_err)
- goto early_out;
- }
-
- if (trial_err < best_cluster_err)
- {
- best_cluster_err = trial_err;
- best_cluster_index = cluster_index;
- if (!best_cluster_err)
- break;
- }
-
- early_out:
- ;
-
- } // cluster_iter
- }
- else
+ #else
+ for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
{
- for (uint32_t cluster_iter = 0; cluster_iter < total_clusters; cluster_iter++)
- {
- const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
- //const etc_block& cluster_blk = m_optimized_cluster_selectors[cluster_index];
+ const uint32_t cluster_index = m_use_hierarchical_selector_codebooks ? (*pCluster_indices)[cluster_iter] : cluster_iter;
+
+ const uint8_t* pSels = &unpacked_optimized_cluster_selectors[cluster_index * 16];
- uint64_t trial_err = 0;
+ uint64_t trial_err = (uint64_t)trial_errors[pSels[0]][0] + trial_errors[pSels[1]][1] + trial_errors[pSels[2]][2] + trial_errors[pSels[3]][3];
+ if ((trial_err + min_possible_error_4_15) >= best_cluster_err)
+ continue;
- for (int i = 0; i < 16; i++)
- {
- const uint32_t sel = unpacked_optimized_cluster_selectors[cluster_index * 16 + i];
+ trial_err += (uint64_t)trial_errors[pSels[4]][4] + trial_errors[pSels[5]][5] + trial_errors[pSels[6]][6] + trial_errors[pSels[7]][7];
+ if ((trial_err + min_possible_error_8_15) >= best_cluster_err)
+ continue;
- trial_err += trial_errors[sel][i];
- if (trial_err > best_cluster_err)
- goto early_out2;
- }
+ trial_err += (uint64_t)trial_errors[pSels[8]][8] + trial_errors[pSels[9]][9] + trial_errors[pSels[10]][10] + trial_errors[pSels[11]][11];
+ if ((trial_err + min_possible_error_12_15) >= best_cluster_err)
+ continue;
- if (trial_err < best_cluster_err)
- {
- best_cluster_err = trial_err;
- best_cluster_index = cluster_index;
- if (!best_cluster_err)
- break;
- }
+ trial_err += (uint64_t)trial_errors[pSels[12]][12] + trial_errors[pSels[13]][13] + trial_errors[pSels[14]][14] + trial_errors[pSels[15]][15];
- early_out2:
- ;
+ if (trial_err < best_cluster_err)
+ {
+ best_cluster_err = trial_err;
+ best_cluster_index = cluster_index;
+ if (best_cluster_err == min_possible_error_0_15)
+ break;
+ }
- } // cluster_iter
- }
-#endif
+ } // cluster_iter
+ #endif
blk.set_raw_selector_bits(m_optimized_cluster_selectors[best_cluster_index].get_raw_selector_bits());
m_block_selector_cluster_index[block_index] = best_cluster_index;
-
- {
- std::lock_guard<std::mutex> lock(m_lock);
- vector_ensure_element_is_valid(new_cluster_indices, best_cluster_index);
- new_cluster_indices[best_cluster_index].push_back(block_index);
- }
+ prev_best_cluster_index = best_cluster_index;
} // block_index
-#ifndef __EMSCRIPTEN__
+ #ifndef __EMSCRIPTEN__
} );
-#endif
+ #endif
} // block_index_iter
-#ifndef __EMSCRIPTEN__
+ #ifndef __EMSCRIPTEN__
m_params.m_pJob_pool->wait_for_all();
-#endif
+ #endif
+
+ for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
+ {
+ m_selector_cluster_block_indices[i].resize(0);
+ m_selector_cluster_block_indices[i].reserve(128);
+ }
- m_selector_cluster_block_indices.swap(new_cluster_indices);
- }
+ for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+ {
+ const uint32_t best_cluster_index = m_block_selector_cluster_index[block_index];
- for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
- vector_sort(m_selector_cluster_block_indices[i]);
+ vector_ensure_element_is_valid(m_selector_cluster_block_indices, best_cluster_index);
+ m_selector_cluster_block_indices[best_cluster_index].push_back(block_index);
+ }
+
+ } // if (use_cpu)
+
+ debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
}
// TODO: Remove old ETC1 specific stuff, and thread this.
@@ -2837,7 +3282,81 @@ namespace basisu
//debug_printf("validate_output: %u\n", validate_output());
}
-
+
+ // Endpoint clusterization hierarchy integrity checker.
+ // Note this doesn't check for empty clusters.
+ bool basisu_frontend::validate_endpoint_cluster_hierarchy(bool ensure_clusters_have_same_parents) const
+ {
+ if (!m_endpoint_parent_clusters.size())
+ return true;
+
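+ // Endpoint clusterization operates on subblocks (two per ETC1S block), so track m_total_blocks * 2 entries.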
+ int_vec subblock_parent_indices(m_total_blocks * 2);
+ subblock_parent_indices.set_all(-1);
+
+ int_vec subblock_cluster_indices(m_total_blocks * 2);
+ subblock_cluster_indices.set_all(-1);
+
+ for (uint32_t parent_index = 0; parent_index < m_endpoint_parent_clusters.size(); parent_index++)
+ {
+ for (uint32_t i = 0; i < m_endpoint_parent_clusters[parent_index].size(); i++)
+ {
+ uint32_t subblock_index = m_endpoint_parent_clusters[parent_index][i];
+ if (subblock_index >= m_total_blocks * 2)
+ return false;
+
+ // If this subblock already belongs to another parent cluster, the hierarchy is invalid.
+ if (subblock_parent_indices[subblock_index] != -1)
+ return false;
+
+ subblock_parent_indices[subblock_index] = parent_index;
+ }
+ }
+
+ // Make sure every subblock was assigned to a parent cluster.
+ for (uint32_t i = 0; i < subblock_parent_indices.size(); i++)
+ {
+ if (subblock_parent_indices[i] == -1)
+ return false;
+ }
+
+ for (uint32_t cluster_index = 0; cluster_index < m_endpoint_clusters.size(); cluster_index++)
+ {
+ int parent_index = 0;
+
+ for (uint32_t i = 0; i < m_endpoint_clusters[cluster_index].size(); i++)
+ {
+ uint32_t subblock_index = m_endpoint_clusters[cluster_index][i];
+ if (subblock_index >= m_total_blocks * 2)
+ return false;
+
+ if (subblock_cluster_indices[subblock_index] != -1)
+ return false;
+
+ subblock_cluster_indices[subblock_index] = cluster_index;
+
+ // There are transformations on the endpoint clusters that can break the strict tree requirement
+ if (ensure_clusters_have_same_parents)
+ {
+ // Make sure all the subblocks are in the same parent cluster
+ if (!i)
+ parent_index = subblock_parent_indices[subblock_index];
+ else if (subblock_parent_indices[subblock_index] != parent_index)
+ return false;
+ }
+ }
+ }
+
+ // Make sure every subblock was assigned to an endpoint cluster.
+ for (uint32_t i = 0; i < subblock_cluster_indices.size(); i++)
+ {
+ if (subblock_cluster_indices[i] == -1)
+ return false;
+ }
+
+ return true;
+ }
+
+ // This is very slow and only intended for debugging/development. It's enabled using the "-validate_etc1s" command line option.
bool basisu_frontend::validate_output() const
{
debug_printf("validate_output\n");
@@ -2889,29 +3408,7 @@ namespace basisu
CHECK(rdo_output_block.get_base5_color() == blk.get_base5_color());
CHECK(rdo_output_block.get_delta3_color() == blk.get_delta3_color());
CHECK(rdo_output_block.get_raw_selector_bits() == blk.get_raw_selector_bits());
-
- if (m_params.m_pGlobal_sel_codebook)
- {
- bool used_global_cb = true;
- if (m_params.m_use_hybrid_selector_codebooks)
- used_global_cb = m_selector_cluster_uses_global_cb[selector_cluster_index];
-
- if (used_global_cb)
- {
- basist::etc1_global_selector_codebook_entry_id pal_id(get_selector_cluster_global_selector_entry_ids()[selector_cluster_index]);
-
- basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(pal_id));
-
- for (uint32_t y = 0; y < 4; y++)
- {
- for (uint32_t x = 0; x < 4; x++)
- {
- CHECK(pal_entry(x, y) == blk.get_selector(x, y));
- }
- }
- }
- }
-
+
#undef CHECK
}