diff options
Diffstat (limited to 'thirdparty/astcenc/astcenc_decompress_symbolic.cpp')
-rw-r--r-- | thirdparty/astcenc/astcenc_decompress_symbolic.cpp | 623 |
1 files changed, 623 insertions, 0 deletions
diff --git a/thirdparty/astcenc/astcenc_decompress_symbolic.cpp b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp new file mode 100644 index 0000000000..39e5525c3b --- /dev/null +++ b/thirdparty/astcenc/astcenc_decompress_symbolic.cpp @@ -0,0 +1,623 @@ +// SPDX-License-Identifier: Apache-2.0 +// ---------------------------------------------------------------------------- +// Copyright 2011-2023 Arm Limited +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. +// ---------------------------------------------------------------------------- + +/** + * @brief Functions to decompress a symbolic block. + */ + +#include "astcenc_internal.h" + +#include <stdio.h> +#include <assert.h> + +/** + * @brief Compute the integer linear interpolation of two color endpoints. + * + * @param decode_mode The ASTC profile (linear or sRGB) + * @param color0 The endpoint0 color. + * @param color1 The endpoint1 color. + * @param weights The interpolation weight (between 0 and 64). + * + * @return The interpolated color. + */ +static vint4 lerp_color_int( + astcenc_profile decode_mode, + vint4 color0, + vint4 color1, + vint4 weights +) { + vint4 weight1 = weights; + vint4 weight0 = vint4(64) - weight1; + + if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + color0 = asr<8>(color0); + color1 = asr<8>(color1); + } + + vint4 color = (color0 * weight0) + (color1 * weight1) + vint4(32); + color = asr<6>(color); + + if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + color = color * vint4(257); + } + + return color; +} + + +/** + * @brief Convert integer color value into a float value for the decoder. + * + * @param data The integer color value post-interpolation. + * @param lns_mask If set treat lane as HDR (LNS) else LDR (unorm16). + * + * @return The float color value. + */ +static inline vfloat4 decode_texel( + vint4 data, + vmask4 lns_mask +) { + vint4 color_lns = vint4::zero(); + vint4 color_unorm = vint4::zero(); + + if (any(lns_mask)) + { + color_lns = lns_to_sf16(data); + } + + if (!all(lns_mask)) + { + color_unorm = unorm16_to_sf16(data); + } + + // Pick components and then convert to FP16 + vint4 datai = select(color_unorm, color_lns, lns_mask); + return float16_to_float(datai); +} + +/* See header for documentation. */ +void unpack_weights( + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + const decimation_info& di, + bool is_dual_plane, + int weights_plane1[BLOCK_MAX_TEXELS], + int weights_plane2[BLOCK_MAX_TEXELS] +) { + // Safe to overshoot as all arrays are allocated to full size + if (!is_dual_plane) + { + // Build full 64-entry weight lookup table + vint4 tab0(reinterpret_cast<const int*>(scb.weights + 0)); + vint4 tab1(reinterpret_cast<const int*>(scb.weights + 16)); + vint4 tab2(reinterpret_cast<const int*>(scb.weights + 32)); + vint4 tab3(reinterpret_cast<const int*>(scb.weights + 48)); + + vint tab0p, tab1p, tab2p, tab3p; + vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p); + + for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint summed_value(8); + vint weight_count(di.texel_weight_count + i); + int max_weight_count = hmax(weight_count).lane<0>(); + + promise(max_weight_count > 0); + for (int j = 0; j < max_weight_count; j++) + { + vint texel_weights(di.texel_weights_tr[j] + i); + vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); + + summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int; + } + + store(lsr<4>(summed_value), weights_plane1 + i); + } + } + else + { + // Build a 32-entry weight lookup table per plane + // Plane 1 + vint4 tab0_plane1(reinterpret_cast<const int*>(scb.weights + 0)); + vint4 tab1_plane1(reinterpret_cast<const int*>(scb.weights + 16)); + vint tab0_plane1p, tab1_plane1p; + vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p); + + // Plane 2 + vint4 tab0_plane2(reinterpret_cast<const int*>(scb.weights + 32)); + vint4 tab1_plane2(reinterpret_cast<const int*>(scb.weights + 48)); + vint tab0_plane2p, tab1_plane2p; + vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p); + + for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH) + { + vint sum_plane1(8); + vint sum_plane2(8); + + vint weight_count(di.texel_weight_count + i); + int max_weight_count = hmax(weight_count).lane<0>(); + + promise(max_weight_count > 0); + for (int j = 0; j < max_weight_count; j++) + { + vint texel_weights(di.texel_weights_tr[j] + i); + vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i); + + sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int; + sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int; + } + + store(lsr<4>(sum_plane1), weights_plane1 + i); + store(lsr<4>(sum_plane2), weights_plane2 + i); + } + } +} + +/** + * @brief Return an FP32 NaN value for use in error colors. + * + * This NaN encoding will turn into 0xFFFF when converted to an FP16 NaN. + * + * @return The float color value. + */ +static float error_color_nan() +{ + if32 v; + v.u = 0xFFFFE000U; + return v.f; +} + +/* See header for documentation. */ +void decompress_symbolic_block( + astcenc_profile decode_mode, + const block_size_descriptor& bsd, + int xpos, + int ypos, + int zpos, + const symbolic_compressed_block& scb, + image_block& blk +) { + blk.xpos = xpos; + blk.ypos = ypos; + blk.zpos = zpos; + + blk.data_min = vfloat4::zero(); + blk.data_mean = vfloat4::zero(); + blk.data_max = vfloat4::zero(); + blk.grayscale = false; + + // If we detected an error-block, blow up immediately. + if (scb.block_type == SYM_BTYPE_ERROR) + { + for (unsigned int i = 0; i < bsd.texel_count; i++) + { + blk.data_r[i] = error_color_nan(); + blk.data_g[i] = error_color_nan(); + blk.data_b[i] = error_color_nan(); + blk.data_a[i] = error_color_nan(); + blk.rgb_lns[i] = 0; + blk.alpha_lns[i] = 0; + } + + return; + } + + if ((scb.block_type == SYM_BTYPE_CONST_F16) || + (scb.block_type == SYM_BTYPE_CONST_U16)) + { + vfloat4 color; + uint8_t use_lns = 0; + + // UNORM16 constant color block + if (scb.block_type == SYM_BTYPE_CONST_U16) + { + vint4 colori(scb.constant_color); + + // For sRGB decoding a real decoder would just use the top 8 bits for color conversion. + // We don't color convert, so rescale the top 8 bits into the full 16 bit dynamic range. + if (decode_mode == ASTCENC_PRF_LDR_SRGB) + { + colori = asr<8>(colori) * 257; + } + + vint4 colorf16 = unorm16_to_sf16(colori); + color = float16_to_float(colorf16); + } + // FLOAT16 constant color block + else + { + switch (decode_mode) + { + case ASTCENC_PRF_LDR_SRGB: + case ASTCENC_PRF_LDR: + color = vfloat4(error_color_nan()); + break; + case ASTCENC_PRF_HDR_RGB_LDR_A: + case ASTCENC_PRF_HDR: + // Constant-color block; unpack from FP16 to FP32. + color = float16_to_float(vint4(scb.constant_color)); + use_lns = 1; + break; + } + } + + for (unsigned int i = 0; i < bsd.texel_count; i++) + { + blk.data_r[i] = color.lane<0>(); + blk.data_g[i] = color.lane<1>(); + blk.data_b[i] = color.lane<2>(); + blk.data_a[i] = color.lane<3>(); + blk.rgb_lns[i] = use_lns; + blk.alpha_lns[i] = use_lns; + } + + return; + } + + // Get the appropriate partition-table entry + int partition_count = scb.partition_count; + const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); + + // Get the appropriate block descriptors + const auto& bm = bsd.get_block_mode(scb.block_mode); + const auto& di = bsd.get_decimation_info(bm.decimation_mode); + + bool is_dual_plane = static_cast<bool>(bm.is_dual_plane); + + // Unquantize and undecimate the weights + int plane1_weights[BLOCK_MAX_TEXELS]; + int plane2_weights[BLOCK_MAX_TEXELS]; + unpack_weights(bsd, scb, di, is_dual_plane, plane1_weights, plane2_weights); + + // Now that we have endpoint colors and weights, we can unpack texel colors + int plane2_component = scb.plane2_component; + vmask4 plane2_mask = vint4::lane_id() == vint4(plane2_component); + + for (int i = 0; i < partition_count; i++) + { + // Decode the color endpoints for this partition + vint4 ep0; + vint4 ep1; + bool rgb_lns; + bool a_lns; + + unpack_color_endpoints(decode_mode, + scb.color_formats[i], + scb.color_values[i], + rgb_lns, a_lns, + ep0, ep1); + + vmask4 lns_mask(rgb_lns, rgb_lns, rgb_lns, a_lns); + + int texel_count = pi.partition_texel_count[i]; + for (int j = 0; j < texel_count; j++) + { + int tix = pi.texels_of_partition[i][j]; + vint4 weight = select(vint4(plane1_weights[tix]), vint4(plane2_weights[tix]), plane2_mask); + vint4 color = lerp_color_int(decode_mode, ep0, ep1, weight); + vfloat4 colorf = decode_texel(color, lns_mask); + + blk.data_r[tix] = colorf.lane<0>(); + blk.data_g[tix] = colorf.lane<1>(); + blk.data_b[tix] = colorf.lane<2>(); + blk.data_a[tix] = colorf.lane<3>(); + } + } +} + +#if !defined(ASTCENC_DECOMPRESS_ONLY) + +/* See header for documentation. */ +float compute_symbolic_block_difference_2plane( + const astcenc_config& config, + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + const image_block& blk +) { + // If we detected an error-block, blow up immediately. + if (scb.block_type == SYM_BTYPE_ERROR) + { + return ERROR_CALC_DEFAULT; + } + + assert(scb.block_mode >= 0); + assert(scb.partition_count == 1); + assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 1); + + // Get the appropriate block descriptor + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); + + // Unquantize and undecimate the weights + int plane1_weights[BLOCK_MAX_TEXELS]; + int plane2_weights[BLOCK_MAX_TEXELS]; + unpack_weights(bsd, scb, di, true, plane1_weights, plane2_weights); + + vmask4 plane2_mask = vint4::lane_id() == vint4(scb.plane2_component); + + vfloat4 summa = vfloat4::zero(); + + // Decode the color endpoints for this partition + vint4 ep0; + vint4 ep1; + bool rgb_lns; + bool a_lns; + + unpack_color_endpoints(config.profile, + scb.color_formats[0], + scb.color_values[0], + rgb_lns, a_lns, + ep0, ep1); + + // Unpack and compute error for each texel in the partition + unsigned int texel_count = bsd.texel_count; + for (unsigned int i = 0; i < texel_count; i++) + { + vint4 weight = select(vint4(plane1_weights[i]), vint4(plane2_weights[i]), plane2_mask); + vint4 colori = lerp_color_int(config.profile, ep0, ep1, weight); + + vfloat4 color = int_to_float(colori); + vfloat4 oldColor = blk.texel(i); + + // Compare error using a perceptual decode metric for RGBM textures + if (config.flags & ASTCENC_FLG_MAP_RGBM) + { + // Fail encodings that result in zero weight M pixels. Note that this can cause + // "interesting" artifacts if we reject all useful encodings - we typically get max + // brightness encodings instead which look just as bad. We recommend users apply a + // bias to their stored M value, limiting the lower value to 16 or 32 to avoid + // getting small M values post-quantization, but we can't prove it would never + // happen, especially at low bit rates ... + if (color.lane<3>() == 0.0f) + { + return -ERROR_CALC_DEFAULT; + } + + // Compute error based on decoded RGBM color + color = vfloat4( + color.lane<0>() * color.lane<3>() * config.rgbm_m_scale, + color.lane<1>() * color.lane<3>() * config.rgbm_m_scale, + color.lane<2>() * color.lane<3>() * config.rgbm_m_scale, + 1.0f + ); + + oldColor = vfloat4( + oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale, + oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale, + oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale, + 1.0f + ); + } + + vfloat4 error = oldColor - color; + error = min(abs(error), 1e15f); + error = error * error; + + summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT); + } + + return summa.lane<0>(); +} + +/* See header for documentation. */ +float compute_symbolic_block_difference_1plane( + const astcenc_config& config, + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + const image_block& blk +) { + assert(bsd.get_block_mode(scb.block_mode).is_dual_plane == 0); + + // If we detected an error-block, blow up immediately. + if (scb.block_type == SYM_BTYPE_ERROR) + { + return ERROR_CALC_DEFAULT; + } + + assert(scb.block_mode >= 0); + + // Get the appropriate partition-table entry + unsigned int partition_count = scb.partition_count; + const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); + + // Get the appropriate block descriptor + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); + + // Unquantize and undecimate the weights + int plane1_weights[BLOCK_MAX_TEXELS]; + unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); + + vfloat4 summa = vfloat4::zero(); + for (unsigned int i = 0; i < partition_count; i++) + { + // Decode the color endpoints for this partition + vint4 ep0; + vint4 ep1; + bool rgb_lns; + bool a_lns; + + unpack_color_endpoints(config.profile, + scb.color_formats[i], + scb.color_values[i], + rgb_lns, a_lns, + ep0, ep1); + + // Unpack and compute error for each texel in the partition + unsigned int texel_count = pi.partition_texel_count[i]; + for (unsigned int j = 0; j < texel_count; j++) + { + unsigned int tix = pi.texels_of_partition[i][j]; + vint4 colori = lerp_color_int(config.profile, ep0, ep1, + vint4(plane1_weights[tix])); + + vfloat4 color = int_to_float(colori); + vfloat4 oldColor = blk.texel(tix); + + // Compare error using a perceptual decode metric for RGBM textures + if (config.flags & ASTCENC_FLG_MAP_RGBM) + { + // Fail encodings that result in zero weight M pixels. Note that this can cause + // "interesting" artifacts if we reject all useful encodings - we typically get max + // brightness encodings instead which look just as bad. We recommend users apply a + // bias to their stored M value, limiting the lower value to 16 or 32 to avoid + // getting small M values post-quantization, but we can't prove it would never + // happen, especially at low bit rates ... + if (color.lane<3>() == 0.0f) + { + return -ERROR_CALC_DEFAULT; + } + + // Compute error based on decoded RGBM color + color = vfloat4( + color.lane<0>() * color.lane<3>() * config.rgbm_m_scale, + color.lane<1>() * color.lane<3>() * config.rgbm_m_scale, + color.lane<2>() * color.lane<3>() * config.rgbm_m_scale, + 1.0f + ); + + oldColor = vfloat4( + oldColor.lane<0>() * oldColor.lane<3>() * config.rgbm_m_scale, + oldColor.lane<1>() * oldColor.lane<3>() * config.rgbm_m_scale, + oldColor.lane<2>() * oldColor.lane<3>() * config.rgbm_m_scale, + 1.0f + ); + } + + vfloat4 error = oldColor - color; + error = min(abs(error), 1e15f); + error = error * error; + + summa += min(dot(error, blk.channel_weight), ERROR_CALC_DEFAULT); + } + } + + return summa.lane<0>(); +} + +/* See header for documentation. */ +float compute_symbolic_block_difference_1plane_1partition( + const astcenc_config& config, + const block_size_descriptor& bsd, + const symbolic_compressed_block& scb, + const image_block& blk +) { + // If we detected an error-block, blow up immediately. + if (scb.block_type == SYM_BTYPE_ERROR) + { + return ERROR_CALC_DEFAULT; + } + + assert(scb.block_mode >= 0); + assert(bsd.get_partition_info(scb.partition_count, scb.partition_index).partition_count == 1); + + // Get the appropriate block descriptor + const block_mode& bm = bsd.get_block_mode(scb.block_mode); + const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); + + // Unquantize and undecimate the weights + alignas(ASTCENC_VECALIGN) int plane1_weights[BLOCK_MAX_TEXELS]; + unpack_weights(bsd, scb, di, false, plane1_weights, nullptr); + + // Decode the color endpoints for this partition + vint4 ep0; + vint4 ep1; + bool rgb_lns; + bool a_lns; + + unpack_color_endpoints(config.profile, + scb.color_formats[0], + scb.color_values[0], + rgb_lns, a_lns, + ep0, ep1); + + + // Pre-shift sRGB so things round correctly + if (config.profile == ASTCENC_PRF_LDR_SRGB) + { + ep0 = asr<8>(ep0); + ep1 = asr<8>(ep1); + } + + // Unpack and compute error for each texel in the partition + vfloatacc summav = vfloatacc::zero(); + + vint lane_id = vint::lane_id(); + vint srgb_scale(config.profile == ASTCENC_PRF_LDR_SRGB ? 257 : 1); + + unsigned int texel_count = bsd.texel_count; + for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) + { + // Compute EP1 contribution + vint weight1 = vint::loada(plane1_weights + i); + vint ep1_r = vint(ep1.lane<0>()) * weight1; + vint ep1_g = vint(ep1.lane<1>()) * weight1; + vint ep1_b = vint(ep1.lane<2>()) * weight1; + vint ep1_a = vint(ep1.lane<3>()) * weight1; + + // Compute EP0 contribution + vint weight0 = vint(64) - weight1; + vint ep0_r = vint(ep0.lane<0>()) * weight0; + vint ep0_g = vint(ep0.lane<1>()) * weight0; + vint ep0_b = vint(ep0.lane<2>()) * weight0; + vint ep0_a = vint(ep0.lane<3>()) * weight0; + + // Shift so things round correctly + vint colori_r = asr<6>(ep0_r + ep1_r + vint(32)) * srgb_scale; + vint colori_g = asr<6>(ep0_g + ep1_g + vint(32)) * srgb_scale; + vint colori_b = asr<6>(ep0_b + ep1_b + vint(32)) * srgb_scale; + vint colori_a = asr<6>(ep0_a + ep1_a + vint(32)) * srgb_scale; + + // Compute color diff + vfloat color_r = int_to_float(colori_r); + vfloat color_g = int_to_float(colori_g); + vfloat color_b = int_to_float(colori_b); + vfloat color_a = int_to_float(colori_a); + + vfloat color_orig_r = loada(blk.data_r + i); + vfloat color_orig_g = loada(blk.data_g + i); + vfloat color_orig_b = loada(blk.data_b + i); + vfloat color_orig_a = loada(blk.data_a + i); + + vfloat color_error_r = min(abs(color_orig_r - color_r), vfloat(1e15f)); + vfloat color_error_g = min(abs(color_orig_g - color_g), vfloat(1e15f)); + vfloat color_error_b = min(abs(color_orig_b - color_b), vfloat(1e15f)); + vfloat color_error_a = min(abs(color_orig_a - color_a), vfloat(1e15f)); + + // Compute squared error metric + color_error_r = color_error_r * color_error_r; + color_error_g = color_error_g * color_error_g; + color_error_b = color_error_b * color_error_b; + color_error_a = color_error_a * color_error_a; + + vfloat metric = color_error_r * blk.channel_weight.lane<0>() + + color_error_g * blk.channel_weight.lane<1>() + + color_error_b * blk.channel_weight.lane<2>() + + color_error_a * blk.channel_weight.lane<3>(); + + // Mask off bad lanes + vmask mask = lane_id < vint(texel_count); + lane_id += vint(ASTCENC_SIMD_WIDTH); + haccumulate(summav, metric, mask); + } + + return hadd_s(summav); +} + +#endif |