// SPDX-License-Identifier: Apache-2.0 // ---------------------------------------------------------------------------- // Copyright 2011-2023 Arm Limited // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy // of the License at: // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations // under the License. // ---------------------------------------------------------------------------- /** * @brief Functions for the library entrypoint. */ #include #include #include #include "astcenc.h" #include "astcenc_internal_entry.h" #include "astcenc_diagnostic_trace.h" /** * @brief Record of the quality tuning parameter values. * * See the @c astcenc_config structure for detailed parameter documentation. * * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit. * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios * for the more through search presets because the underlying db_limit is so much higher. */ struct astcenc_preset_config { float quality; unsigned int tune_partition_count_limit; unsigned int tune_2partition_index_limit; unsigned int tune_3partition_index_limit; unsigned int tune_4partition_index_limit; unsigned int tune_block_mode_limit; unsigned int tune_refinement_limit; unsigned int tune_candidate_limit; unsigned int tune_2partitioning_candidate_limit; unsigned int tune_3partitioning_candidate_limit; unsigned int tune_4partitioning_candidate_limit; float tune_db_limit_a_base; float tune_db_limit_b_base; float tune_mse_overshoot; float tune_2_partition_early_out_limit_factor; float tune_3_partition_early_out_limit_factor; float tune_2_plane_early_out_limit_correlation; }; /** * @brief The static presets for high bandwidth encodings (x < 25 texels per block). */ static const std::array preset_configs_high {{ { ASTCENC_PRE_FASTEST, 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_FAST, 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f }, { ASTCENC_PRE_MEDIUM, 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f }, { ASTCENC_PRE_THOROUGH, 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f }, { ASTCENC_PRE_VERYTHOROUGH, 4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, 4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; /** * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block). */ static const std::array preset_configs_mid {{ { ASTCENC_PRE_FASTEST, 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f }, { ASTCENC_PRE_FAST, 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_MEDIUM, 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f }, { ASTCENC_PRE_THOROUGH, 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f }, { ASTCENC_PRE_VERYTHOROUGH, 4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; /** * @brief The static presets for low bandwidth encodings (64 <= x texels per block). */ static const std::array preset_configs_low {{ { ASTCENC_PRE_FASTEST, 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f }, { ASTCENC_PRE_FAST, 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f }, { ASTCENC_PRE_MEDIUM, 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f }, { ASTCENC_PRE_THOROUGH, 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f }, { ASTCENC_PRE_VERYTHOROUGH, 4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f }, { ASTCENC_PRE_EXHAUSTIVE, 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f } }}; /** * @brief Validate CPU floating point meets assumptions made in the codec. * * The codec is written with the assumption that a float threaded through the @c if32 union will be * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the * case in an IEEE-754 compliant system, however not every system or compilation mode is actually * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_cpu_float() { if32 p; volatile float xprec_testval = 2.51f; p.f = xprec_testval + 12582912.0f; float q = p.f - 12582912.0f; if (q != 3.0f) { return ASTCENC_ERR_BAD_CPU_FLOAT; } return ASTCENC_SUCCESS; } /** * @brief Validate CPU ISA support meets the requirements of this build of the library. * * Each library build is statically compiled for a particular set of CPU ISA features, such as the * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU * actually supports everything this build needs. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_cpu_isa() { #if ASTCENC_SSE >= 41 if (!cpu_supports_sse41()) { return ASTCENC_ERR_BAD_CPU_ISA; } #endif #if ASTCENC_POPCNT >= 1 if (!cpu_supports_popcnt()) { return ASTCENC_ERR_BAD_CPU_ISA; } #endif #if ASTCENC_F16C >= 1 if (!cpu_supports_f16c()) { return ASTCENC_ERR_BAD_CPU_ISA; } #endif #if ASTCENC_AVX >= 2 if (!cpu_supports_avx2()) { return ASTCENC_ERR_BAD_CPU_ISA; } #endif return ASTCENC_SUCCESS; } /** * @brief Validate config profile. * * @param profile The profile to check. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_profile( astcenc_profile profile ) { // Values in this enum are from an external user, so not guaranteed to be // bounded to the enum values switch (static_cast(profile)) { case ASTCENC_PRF_LDR_SRGB: case ASTCENC_PRF_LDR: case ASTCENC_PRF_HDR_RGB_LDR_A: case ASTCENC_PRF_HDR: return ASTCENC_SUCCESS; default: return ASTCENC_ERR_BAD_PROFILE; } } /** * @brief Validate block size. * * @param block_x The block x dimensions. * @param block_y The block y dimensions. * @param block_z The block z dimensions. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_block_size( unsigned int block_x, unsigned int block_y, unsigned int block_z ) { // Test if this is a legal block size at all bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) || ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z))); if (!is_legal) { return ASTCENC_ERR_BAD_BLOCK_SIZE; } // Test if this build has sufficient capacity for this block size bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS; if (!have_capacity) { return ASTCENC_ERR_NOT_IMPLEMENTED; } return ASTCENC_SUCCESS; } /** * @brief Validate flags. * * @param flags The flags to check. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_flags( unsigned int flags ) { // Flags field must not contain any unknown flag bits unsigned int exMask = ~ASTCENC_ALL_FLAGS; if (popcount(flags & exMask) != 0) { return ASTCENC_ERR_BAD_FLAGS; } // Flags field must only contain at most a single map type exMask = ASTCENC_FLG_MAP_NORMAL | ASTCENC_FLG_MAP_RGBM; if (popcount(flags & exMask) > 1) { return ASTCENC_ERR_BAD_FLAGS; } return ASTCENC_SUCCESS; } #if !defined(ASTCENC_DECOMPRESS_ONLY) /** * @brief Validate single channel compression swizzle. * * @param swizzle The swizzle to check. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_compression_swz( astcenc_swz swizzle ) { // Not all enum values are handled; SWZ_Z is invalid for compression switch (static_cast(swizzle)) { case ASTCENC_SWZ_R: case ASTCENC_SWZ_G: case ASTCENC_SWZ_B: case ASTCENC_SWZ_A: case ASTCENC_SWZ_0: case ASTCENC_SWZ_1: return ASTCENC_SUCCESS; default: return ASTCENC_ERR_BAD_SWIZZLE; } } /** * @brief Validate overall compression swizzle. * * @param swizzle The swizzle to check. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_compression_swizzle( const astcenc_swizzle& swizzle ) { if (validate_compression_swz(swizzle.r) || validate_compression_swz(swizzle.g) || validate_compression_swz(swizzle.b) || validate_compression_swz(swizzle.a)) { return ASTCENC_ERR_BAD_SWIZZLE; } return ASTCENC_SUCCESS; } #endif /** * @brief Validate single channel decompression swizzle. * * @param swizzle The swizzle to check. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_decompression_swz( astcenc_swz swizzle ) { // Values in this enum are from an external user, so not guaranteed to be // bounded to the enum values switch (static_cast(swizzle)) { case ASTCENC_SWZ_R: case ASTCENC_SWZ_G: case ASTCENC_SWZ_B: case ASTCENC_SWZ_A: case ASTCENC_SWZ_0: case ASTCENC_SWZ_1: case ASTCENC_SWZ_Z: return ASTCENC_SUCCESS; default: return ASTCENC_ERR_BAD_SWIZZLE; } } /** * @brief Validate overall decompression swizzle. * * @param swizzle The swizzle to check. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_decompression_swizzle( const astcenc_swizzle& swizzle ) { if (validate_decompression_swz(swizzle.r) || validate_decompression_swz(swizzle.g) || validate_decompression_swz(swizzle.b) || validate_decompression_swz(swizzle.a)) { return ASTCENC_ERR_BAD_SWIZZLE; } return ASTCENC_SUCCESS; } /** * Validate that an incoming configuration is in-spec. * * This function can respond in two ways: * * * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown * for out-of-range inputs in this case. * * Numerical inputs and logic inputs are are logically invalid and which make no sense * algorithmically will return an error. * * @param[in,out] config The input compressor configuration. * * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. */ static astcenc_error validate_config( astcenc_config &config ) { astcenc_error status; status = validate_profile(config.profile); if (status != ASTCENC_SUCCESS) { return status; } status = validate_flags(config.flags); if (status != ASTCENC_SUCCESS) { return status; } status = validate_block_size(config.block_x, config.block_y, config.block_z); if (status != ASTCENC_SUCCESS) { return status; } #if defined(ASTCENC_DECOMPRESS_ONLY) // Decompress-only builds only support decompress-only contexts if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)) { return ASTCENC_ERR_BAD_PARAM; } #endif config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f); config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u); config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u); config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u); config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES); config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f); config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f); config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f); config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f); config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f); // Specifying a zero weight color component is not allowed; force to small value float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight), astc::max(config.cw_b_weight, config.cw_a_weight)); if (max_weight > 0.0f) { max_weight /= 1000.0f; config.cw_r_weight = astc::max(config.cw_r_weight, max_weight); config.cw_g_weight = astc::max(config.cw_g_weight, max_weight); config.cw_b_weight = astc::max(config.cw_b_weight, max_weight); config.cw_a_weight = astc::max(config.cw_a_weight, max_weight); } // If all color components error weights are zero then return an error else { return ASTCENC_ERR_BAD_PARAM; } return ASTCENC_SUCCESS; } /* See header for documentation. */ astcenc_error astcenc_config_init( astcenc_profile profile, unsigned int block_x, unsigned int block_y, unsigned int block_z, float quality, unsigned int flags, astcenc_config* configp ) { astcenc_error status; // Check basic library compatibility options here so they are checked early. Note, these checks // are repeated in context_alloc for cases where callers use a manually defined config struct status = validate_cpu_isa(); if (status != ASTCENC_SUCCESS) { return status; } status = validate_cpu_float(); if (status != ASTCENC_SUCCESS) { return status; } // Zero init all config fields; although most of will be over written astcenc_config& config = *configp; std::memset(&config, 0, sizeof(config)); // Process the block size block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1 status = validate_block_size(block_x, block_y, block_z); if (status != ASTCENC_SUCCESS) { return status; } config.block_x = block_x; config.block_y = block_y; config.block_z = block_z; float texels = static_cast(block_x * block_y * block_z); float ltexels = logf(texels) / logf(10.0f); // Process the performance quality level or preset; note that this must be done before we // process any additional settings, such as color profile and flags, which may replace some of // these settings with more use case tuned values if (quality < ASTCENC_PRE_FASTEST || quality > ASTCENC_PRE_EXHAUSTIVE) { return ASTCENC_ERR_BAD_QUALITY; } static const std::array* preset_configs; int texels_int = block_x * block_y * block_z; if (texels_int < 25) { preset_configs = &preset_configs_high; } else if (texels_int < 64) { preset_configs = &preset_configs_mid; } else { preset_configs = &preset_configs_low; } // Determine which preset to use, or which pair to interpolate size_t start; size_t end; for (end = 0; end < preset_configs->size(); end++) { if ((*preset_configs)[end].quality >= quality) { break; } } start = end == 0 ? 0 : end - 1; // Start and end node are the same - so just transfer the values. if (start == end) { config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit; config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit; config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit; config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit; config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit; config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit; config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES); config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels, (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels); config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot; config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor; config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor; config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation; } // Start and end node are not the same - so interpolate between them else { auto& node_a = (*preset_configs)[start]; auto& node_b = (*preset_configs)[end]; float wt_range = node_b.quality - node_a.quality; assert(wt_range > 0); // Compute interpolation factors float wt_node_a = (node_b.quality - quality) / wt_range; float wt_node_b = (quality - node_a.quality) / wt_range; #define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b)) #define LERPI(param) astc::flt2int_rtn(\ (static_cast(node_a.param) * wt_node_a) + \ (static_cast(node_b.param) * wt_node_b)) #define LERPUI(param) static_cast(LERPI(param)) config.tune_partition_count_limit = LERPI(tune_partition_count_limit); config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit); config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit); config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit); config.tune_block_mode_limit = LERPI(tune_block_mode_limit); config.tune_refinement_limit = LERPI(tune_refinement_limit); config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit), TUNE_MAX_TRIAL_CANDIDATES); config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit), BLOCK_MAX_PARTITIONINGS); config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit), BLOCK_MAX_PARTITIONINGS); config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit), BLOCK_MAX_PARTITIONINGS); config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels, LERP(tune_db_limit_b_base) - 19 * ltexels); config.tune_mse_overshoot = LERP(tune_mse_overshoot); config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor); config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor); config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation); #undef LERP #undef LERPI #undef LERPUI } // Set heuristics to the defaults for each color profile config.cw_r_weight = 1.0f; config.cw_g_weight = 1.0f; config.cw_b_weight = 1.0f; config.cw_a_weight = 1.0f; config.a_scale_radius = 0; config.rgbm_m_scale = 0.0f; config.profile = profile; // Values in this enum are from an external user, so not guaranteed to be // bounded to the enum values switch (static_cast(profile)) { case ASTCENC_PRF_LDR: case ASTCENC_PRF_LDR_SRGB: break; case ASTCENC_PRF_HDR_RGB_LDR_A: case ASTCENC_PRF_HDR: config.tune_db_limit = 999.0f; break; default: return ASTCENC_ERR_BAD_PROFILE; } // Flags field must not contain any unknown flag bits status = validate_flags(flags); if (status != ASTCENC_SUCCESS) { return status; } if (flags & ASTCENC_FLG_MAP_NORMAL) { // Normal map encoding uses L+A blocks, so allow one more partitioning // than normal. We need need fewer bits for endpoints, so more likely // to be able to use more partitions than an RGB/RGBA block config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u); config.cw_g_weight = 0.0f; config.cw_b_weight = 0.0f; config.tune_2_partition_early_out_limit_factor *= 1.5f; config.tune_3_partition_early_out_limit_factor *= 1.5f; config.tune_2_plane_early_out_limit_correlation = 0.99f; // Normals are prone to blocking artifacts on smooth curves // so force compressor to try harder here ... config.tune_db_limit *= 1.03f; } else if (flags & ASTCENC_FLG_MAP_RGBM) { config.rgbm_m_scale = 5.0f; config.cw_a_weight = 2.0f * config.rgbm_m_scale; } else // (This is color data) { // This is a very basic perceptual metric for RGB color data, which weights error // significance by the perceptual luminance contribution of each color channel. For // luminance the usual weights to compute luminance from a linear RGB value are as // follows: // // l = r * 0.3 + g * 0.59 + b * 0.11 // // ... but we scale these up to keep a better balance between color and alpha. Note // that if the content is using alpha we'd recommend using the -a option to weight // the color contribution by the alpha transparency. if (flags & ASTCENC_FLG_USE_PERCEPTUAL) { config.cw_r_weight = 0.30f * 2.25f; config.cw_g_weight = 0.59f * 2.25f; config.cw_b_weight = 0.11f * 2.25f; } } config.flags = flags; return ASTCENC_SUCCESS; } /* See header for documentation. */ astcenc_error astcenc_context_alloc( const astcenc_config* configp, unsigned int thread_count, astcenc_context** context ) { astcenc_error status; const astcenc_config& config = *configp; status = validate_cpu_isa(); if (status != ASTCENC_SUCCESS) { return status; } status = validate_cpu_float(); if (status != ASTCENC_SUCCESS) { return status; } if (thread_count == 0) { return ASTCENC_ERR_BAD_PARAM; } #if defined(ASTCENC_DIAGNOSTICS) // Force single threaded compressor use in diagnostic mode. if (thread_count != 1) { return ASTCENC_ERR_BAD_PARAM; } #endif astcenc_context* ctxo = new astcenc_context; astcenc_contexti* ctx = &ctxo->context; ctx->thread_count = thread_count; ctx->config = config; ctx->working_buffers = nullptr; // These are allocated per-compress, as they depend on image size ctx->input_alpha_averages = nullptr; // Copy the config first and validate the copy (we may modify it) status = validate_config(ctx->config); if (status != ASTCENC_SUCCESS) { delete ctxo; return status; } ctx->bsd = aligned_malloc(sizeof(block_size_descriptor), ASTCENC_VECALIGN); bool can_omit_modes = static_cast(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY); init_block_size_descriptor(config.block_x, config.block_y, config.block_z, can_omit_modes, config.tune_partition_count_limit, static_cast(config.tune_block_mode_limit) / 100.0f, *ctx->bsd); #if !defined(ASTCENC_DECOMPRESS_ONLY) // Do setup only needed by compression if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY)) { // Turn a dB limit into a per-texel error for faster use later if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB)) { ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f; } else { ctx->config.tune_db_limit = 0.0f; } size_t worksize = sizeof(compression_working_buffers) * thread_count; ctx->working_buffers = aligned_malloc(worksize, ASTCENC_VECALIGN); static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0, "compression_working_buffers size must be multiple of vector alignment"); if (!ctx->working_buffers) { aligned_free(ctx->bsd); delete ctxo; *context = nullptr; return ASTCENC_ERR_OUT_OF_MEM; } } #endif #if defined(ASTCENC_DIAGNOSTICS) ctx->trace_log = new TraceLog(ctx->config.trace_file_path); if (!ctx->trace_log->m_file) { return ASTCENC_ERR_DTRACE_FAILURE; } trace_add_data("block_x", config.block_x); trace_add_data("block_y", config.block_y); trace_add_data("block_z", config.block_z); #endif *context = ctxo; #if !defined(ASTCENC_DECOMPRESS_ONLY) prepare_angular_tables(); #endif return ASTCENC_SUCCESS; } /* See header dor documentation. */ void astcenc_context_free( astcenc_context* ctxo ) { if (ctxo) { astcenc_contexti* ctx = &ctxo->context; aligned_free(ctx->working_buffers); aligned_free(ctx->bsd); #if defined(ASTCENC_DIAGNOSTICS) delete ctx->trace_log; #endif delete ctxo; } } #if !defined(ASTCENC_DECOMPRESS_ONLY) /** * @brief Compress an image, after any preflight has completed. * * @param[out] ctxo The compressor context. * @param thread_index The thread index. * @param image The intput image. * @param swizzle The input swizzle. * @param[out] buffer The output array for the compressed data. */ static void compress_image( astcenc_context& ctxo, unsigned int thread_index, const astcenc_image& image, const astcenc_swizzle& swizzle, uint8_t* buffer ) { astcenc_contexti& ctx = ctxo.context; const block_size_descriptor& bsd = *ctx.bsd; astcenc_profile decode_mode = ctx.config.profile; image_block blk; int block_x = bsd.xdim; int block_y = bsd.ydim; int block_z = bsd.zdim; blk.texel_count = static_cast(block_x * block_y * block_z); int dim_x = image.dim_x; int dim_y = image.dim_y; int dim_z = image.dim_z; int xblocks = (dim_x + block_x - 1) / block_x; int yblocks = (dim_y + block_y - 1) / block_y; int zblocks = (dim_z + block_z - 1) / block_z; int block_count = zblocks * yblocks * xblocks; int row_blocks = xblocks; int plane_blocks = xblocks * yblocks; // Populate the block channel weights blk.channel_weight = vfloat4(ctx.config.cw_r_weight, ctx.config.cw_g_weight, ctx.config.cw_b_weight, ctx.config.cw_a_weight); // Use preallocated scratch buffer auto& temp_buffers = ctx.working_buffers[thread_index]; // Only the first thread actually runs the initializer ctxo.manage_compress.init(block_count); // Determine if we can use an optimized load function bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A); bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) || (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A); bool use_fast_load = !needs_swz && !needs_hdr && block_z == 1 && image.data_type == ASTCENC_TYPE_U8; auto load_func = load_image_block; if (use_fast_load) { load_func = load_image_block_fast_ldr; } // All threads run this processing loop until there is no work remaining while (true) { unsigned int count; unsigned int base = ctxo.manage_compress.get_task_assignment(16, count); if (!count) { break; } for (unsigned int i = base; i < base + count; i++) { // Decode i into x, y, z block indices int z = i / plane_blocks; unsigned int rem = i - (z * plane_blocks); int y = rem / row_blocks; int x = rem - (y * row_blocks); // Test if we can apply some basic alpha-scale RDO bool use_full_block = true; if (ctx.config.a_scale_radius != 0 && block_z == 1) { int start_x = x * block_x; int end_x = astc::min(dim_x, start_x + block_x); int start_y = y * block_y; int end_y = astc::min(dim_y, start_y + block_y); // SATs accumulate error, so don't test exactly zero. Test for // less than 1 alpha in the expanded block footprint that // includes the alpha radius. int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1); int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1); float footprint = static_cast(x_footprint * y_footprint); float threshold = 0.9f / (255.0f * footprint); // Do we have any alpha values? use_full_block = false; for (int ay = start_y; ay < end_y; ay++) { for (int ax = start_x; ax < end_x; ax++) { float a_avg = ctx.input_alpha_averages[ay * dim_x + ax]; if (a_avg > threshold) { use_full_block = true; ax = end_x; ay = end_y; } } } } // Fetch the full block for compression if (use_full_block) { load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle); // Scale RGB error contribution by the maximum alpha in the block // This encourages preserving alpha accuracy in regions with high // transparency, and can buy up to 0.5 dB PSNR. if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT) { float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f); blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale, ctx.config.cw_g_weight * alpha_scale, ctx.config.cw_b_weight * alpha_scale, ctx.config.cw_a_weight); } } // Apply alpha scale RDO - substitute constant color block else { blk.origin_texel = vfloat4::zero(); blk.data_min = vfloat4::zero(); blk.data_mean = vfloat4::zero(); blk.data_max = vfloat4::zero(); blk.grayscale = true; } int offset = ((z * yblocks + y) * xblocks + x) * 16; uint8_t *bp = buffer + offset; physical_compressed_block* pcb = reinterpret_cast(bp); compress_block(ctx, blk, *pcb, temp_buffers); } ctxo.manage_compress.complete_task_assignment(count); } } /** * @brief Compute regional averages in an image. * * This function can be called by multiple threads, but only after a single * thread calls the setup function @c init_compute_averages(). * * Results are written back into @c img->input_alpha_averages. * * @param[out] ctx The context. * @param ag The average and variance arguments created during setup. */ static void compute_averages( astcenc_context& ctx, const avg_args &ag ) { pixel_region_args arg = ag.arg; arg.work_memory = new vfloat4[ag.work_memory_size]; int size_x = ag.img_size_x; int size_y = ag.img_size_y; int size_z = ag.img_size_z; int step_xy = ag.blk_size_xy; int step_z = ag.blk_size_z; int y_tasks = (size_y + step_xy - 1) / step_xy; // All threads run this processing loop until there is no work remaining while (true) { unsigned int count; unsigned int base = ctx.manage_avg.get_task_assignment(16, count); if (!count) { break; } for (unsigned int i = base; i < base + count; i++) { int z = (i / (y_tasks)) * step_z; int y = (i - (z * y_tasks)) * step_xy; arg.size_z = astc::min(step_z, size_z - z); arg.offset_z = z; arg.size_y = astc::min(step_xy, size_y - y); arg.offset_y = y; for (int x = 0; x < size_x; x += step_xy) { arg.size_x = astc::min(step_xy, size_x - x); arg.offset_x = x; compute_pixel_region_variance(ctx.context, arg); } } ctx.manage_avg.complete_task_assignment(count); } delete[] arg.work_memory; } #endif /* See header for documentation. */ astcenc_error astcenc_compress_image( astcenc_context* ctxo, astcenc_image* imagep, const astcenc_swizzle* swizzle, uint8_t* data_out, size_t data_len, unsigned int thread_index ) { #if defined(ASTCENC_DECOMPRESS_ONLY) (void)ctxo; (void)imagep; (void)swizzle; (void)data_out; (void)data_len; (void)thread_index; return ASTCENC_ERR_BAD_CONTEXT; #else astcenc_contexti* ctx = &ctxo->context; astcenc_error status; astcenc_image& image = *imagep; if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY) { return ASTCENC_ERR_BAD_CONTEXT; } status = validate_compression_swizzle(*swizzle); if (status != ASTCENC_SUCCESS) { return status; } if (thread_index >= ctx->thread_count) { return ASTCENC_ERR_BAD_PARAM; } unsigned int block_x = ctx->config.block_x; unsigned int block_y = ctx->config.block_y; unsigned int block_z = ctx->config.block_z; unsigned int xblocks = (image.dim_x + block_x - 1) / block_x; unsigned int yblocks = (image.dim_y + block_y - 1) / block_y; unsigned int zblocks = (image.dim_z + block_z - 1) / block_z; // Check we have enough output space (16 bytes per block) size_t size_needed = xblocks * yblocks * zblocks * 16; if (data_len < size_needed) { return ASTCENC_ERR_OUT_OF_MEM; } // If context thread count is one then implicitly reset if (ctx->thread_count == 1) { astcenc_compress_reset(ctxo); } if (ctx->config.a_scale_radius != 0) { // First thread to enter will do setup, other threads will subsequently // enter the critical section but simply skip over the initialization auto init_avg = [ctx, &image, swizzle]() { // Perform memory allocations for the destination buffers size_t texel_count = image.dim_x * image.dim_y * image.dim_z; ctx->input_alpha_averages = new float[texel_count]; return init_compute_averages( image, ctx->config.a_scale_radius, *swizzle, ctx->avg_preprocess_args); }; // Only the first thread actually runs the initializer ctxo->manage_avg.init(init_avg); // All threads will enter this function and dynamically grab work compute_averages(*ctxo, ctx->avg_preprocess_args); } // Wait for compute_averages to complete before compressing ctxo->manage_avg.wait(); compress_image(*ctxo, thread_index, image, *swizzle, data_out); // Wait for compress to complete before freeing memory ctxo->manage_compress.wait(); auto term_compress = [ctx]() { delete[] ctx->input_alpha_averages; ctx->input_alpha_averages = nullptr; }; // Only the first thread to arrive actually runs the term ctxo->manage_compress.term(term_compress); return ASTCENC_SUCCESS; #endif } /* See header for documentation. */ astcenc_error astcenc_compress_reset( astcenc_context* ctxo ) { #if defined(ASTCENC_DECOMPRESS_ONLY) (void)ctxo; return ASTCENC_ERR_BAD_CONTEXT; #else astcenc_contexti* ctx = &ctxo->context; if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY) { return ASTCENC_ERR_BAD_CONTEXT; } ctxo->manage_avg.reset(); ctxo->manage_compress.reset(); return ASTCENC_SUCCESS; #endif } /* See header for documentation. */ astcenc_error astcenc_decompress_image( astcenc_context* ctxo, const uint8_t* data, size_t data_len, astcenc_image* image_outp, const astcenc_swizzle* swizzle, unsigned int thread_index ) { astcenc_error status; astcenc_image& image_out = *image_outp; astcenc_contexti* ctx = &ctxo->context; // Today this doesn't matter (working set on stack) but might in future ... if (thread_index >= ctx->thread_count) { return ASTCENC_ERR_BAD_PARAM; } status = validate_decompression_swizzle(*swizzle); if (status != ASTCENC_SUCCESS) { return status; } unsigned int block_x = ctx->config.block_x; unsigned int block_y = ctx->config.block_y; unsigned int block_z = ctx->config.block_z; unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x; unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y; unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z; int row_blocks = xblocks; int plane_blocks = xblocks * yblocks; // Check we have enough output space (16 bytes per block) size_t size_needed = xblocks * yblocks * zblocks * 16; if (data_len < size_needed) { return ASTCENC_ERR_OUT_OF_MEM; } image_block blk; blk.texel_count = static_cast(block_x * block_y * block_z); // If context thread count is one then implicitly reset if (ctx->thread_count == 1) { astcenc_decompress_reset(ctxo); } // Only the first thread actually runs the initializer ctxo->manage_decompress.init(zblocks * yblocks * xblocks); // All threads run this processing loop until there is no work remaining while (true) { unsigned int count; unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count); if (!count) { break; } for (unsigned int i = base; i < base + count; i++) { // Decode i into x, y, z block indices int z = i / plane_blocks; unsigned int rem = i - (z * plane_blocks); int y = rem / row_blocks; int x = rem - (y * row_blocks); unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16; const uint8_t* bp = data + offset; const physical_compressed_block& pcb = *reinterpret_cast(bp); symbolic_compressed_block scb; physical_to_symbolic(*ctx->bsd, pcb, scb); decompress_symbolic_block(ctx->config.profile, *ctx->bsd, x * block_x, y * block_y, z * block_z, scb, blk); store_image_block(image_out, blk, *ctx->bsd, x * block_x, y * block_y, z * block_z, *swizzle); } ctxo->manage_decompress.complete_task_assignment(count); } return ASTCENC_SUCCESS; } /* See header for documentation. */ astcenc_error astcenc_decompress_reset( astcenc_context* ctxo ) { ctxo->manage_decompress.reset(); return ASTCENC_SUCCESS; } /* See header for documentation. */ astcenc_error astcenc_get_block_info( astcenc_context* ctxo, const uint8_t data[16], astcenc_block_info* info ) { #if defined(ASTCENC_DECOMPRESS_ONLY) (void)ctxo; (void)data; (void)info; return ASTCENC_ERR_BAD_CONTEXT; #else astcenc_contexti* ctx = &ctxo->context; // Decode the compressed data into a symbolic form const physical_compressed_block&pcb = *reinterpret_cast(data); symbolic_compressed_block scb; physical_to_symbolic(*ctx->bsd, pcb, scb); // Fetch the appropriate partition and decimation tables block_size_descriptor& bsd = *ctx->bsd; // Start from a clean slate memset(info, 0, sizeof(*info)); // Basic info we can always populate info->profile = ctx->config.profile; info->block_x = ctx->config.block_x; info->block_y = ctx->config.block_y; info->block_z = ctx->config.block_z; info->texel_count = bsd.texel_count; // Check for error blocks first info->is_error_block = scb.block_type == SYM_BTYPE_ERROR; if (info->is_error_block) { return ASTCENC_SUCCESS; } // Check for constant color blocks second info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 || scb.block_type == SYM_BTYPE_CONST_U16; if (info->is_constant_block) { return ASTCENC_SUCCESS; } // Otherwise handle a full block ; known to be valid after conditions above have been checked int partition_count = scb.partition_count; const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); const block_mode& bm = bsd.get_block_mode(scb.block_mode); const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); info->weight_x = di.weight_x; info->weight_y = di.weight_y; info->weight_z = di.weight_z; info->is_dual_plane_block = bm.is_dual_plane != 0; info->partition_count = scb.partition_count; info->partition_index = scb.partition_index; info->dual_plane_component = scb.plane2_component; info->color_level_count = get_quant_level(scb.get_color_quant_mode()); info->weight_level_count = get_quant_level(bm.get_weight_quant_mode()); // Unpack color endpoints for each active partition for (unsigned int i = 0; i < scb.partition_count; i++) { bool rgb_hdr; bool a_hdr; vint4 endpnt[2]; unpack_color_endpoints(ctx->config.profile, scb.color_formats[i], scb.color_values[i], rgb_hdr, a_hdr, endpnt[0], endpnt[1]); // Store the color endpoint mode info info->color_endpoint_modes[i] = scb.color_formats[i]; info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr; // Store the unpacked and decoded color endpoint vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr); for (int j = 0; j < 2; j++) { vint4 color_lns = lns_to_sf16(endpnt[j]); vint4 color_unorm = unorm16_to_sf16(endpnt[j]); vint4 datai = select(color_unorm, color_lns, hdr_mask); store(float16_to_float(datai), info->color_endpoints[i][j]); } } // Unpack weights for each texel int weight_plane1[BLOCK_MAX_TEXELS]; int weight_plane2[BLOCK_MAX_TEXELS]; unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2); for (unsigned int i = 0; i < bsd.texel_count; i++) { info->weight_values_plane1[i] = static_cast(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM); if (info->is_dual_plane_block) { info->weight_values_plane2[i] = static_cast(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM); } } // Unpack partition assignments for each texel for (unsigned int i = 0; i < bsd.texel_count; i++) { info->partition_assignment[i] = pi.partition_of_texel[i]; } return ASTCENC_SUCCESS; #endif } /* See header for documentation. */ const char* astcenc_get_error_string( astcenc_error status ) { // Values in this enum are from an external user, so not guaranteed to be // bounded to the enum values switch (static_cast(status)) { case ASTCENC_SUCCESS: return "ASTCENC_SUCCESS"; case ASTCENC_ERR_OUT_OF_MEM: return "ASTCENC_ERR_OUT_OF_MEM"; case ASTCENC_ERR_BAD_CPU_FLOAT: return "ASTCENC_ERR_BAD_CPU_FLOAT"; case ASTCENC_ERR_BAD_CPU_ISA: return "ASTCENC_ERR_BAD_CPU_ISA"; case ASTCENC_ERR_BAD_PARAM: return "ASTCENC_ERR_BAD_PARAM"; case ASTCENC_ERR_BAD_BLOCK_SIZE: return "ASTCENC_ERR_BAD_BLOCK_SIZE"; case ASTCENC_ERR_BAD_PROFILE: return "ASTCENC_ERR_BAD_PROFILE"; case ASTCENC_ERR_BAD_QUALITY: return "ASTCENC_ERR_BAD_QUALITY"; case ASTCENC_ERR_BAD_FLAGS: return "ASTCENC_ERR_BAD_FLAGS"; case ASTCENC_ERR_BAD_SWIZZLE: return "ASTCENC_ERR_BAD_SWIZZLE"; case ASTCENC_ERR_BAD_CONTEXT: return "ASTCENC_ERR_BAD_CONTEXT"; case ASTCENC_ERR_NOT_IMPLEMENTED: return "ASTCENC_ERR_NOT_IMPLEMENTED"; #if defined(ASTCENC_DIAGNOSTICS) case ASTCENC_ERR_DTRACE_FAILURE: return "ASTCENC_ERR_DTRACE_FAILURE"; #endif default: return nullptr; } }