Diffstat (limited to 'thirdparty/astcenc/astcenc_compute_variance.cpp')
-rw-r--r-- | thirdparty/astcenc/astcenc_compute_variance.cpp | 472 |
1 file changed, 472 insertions, 0 deletions
diff --git a/thirdparty/astcenc/astcenc_compute_variance.cpp b/thirdparty/astcenc/astcenc_compute_variance.cpp
new file mode 100644
index 0000000000..48a4af8cef
--- /dev/null
+++ b/thirdparty/astcenc/astcenc_compute_variance.cpp
@@ -0,0 +1,472 @@
+// SPDX-License-Identifier: Apache-2.0
+// ----------------------------------------------------------------------------
+// Copyright 2011-2022 Arm Limited
+//
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy
+// of the License at:
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations
+// under the License.
+// ----------------------------------------------------------------------------
+
+#if !defined(ASTCENC_DECOMPRESS_ONLY)
+
+/**
+ * @brief Functions to calculate variance per component in an NxN footprint.
+ *
+ * We need N to be parametric, so the routine below uses summed area tables in order to execute in
+ * O(1) time independent of how big N is.
+ *
+ * The addition uses a Brent-Kung-based parallel prefix adder. This uses the prefix tree to first
+ * perform a binary reduction, and then distributes the results. This method means that there is no
+ * serial dependency between a given element and the next one, and also significantly improves
+ * numerical stability allowing us to use floats rather than doubles.
+ */
+
+#include "astcenc_internal.h"
+
+#include <cassert>
+
+/**
+ * @brief Generate a prefix-sum array using the Brent-Kung algorithm.
+ *
+ * This will take an input array of the form:
+ *     v0, v1, v2, ...
+ * ... and modify in-place to turn it into a prefix-sum array of the form:
+ *     v0, v0+v1, v0+v1+v2, ...
+ *
+ * @param d        The array to prefix-sum.
+ * @param items    The number of items in the array.
+ * @param stride   The item spacing in the array; i.e. dense arrays should use 1.
+ */
+static void brent_kung_prefix_sum(
+	vfloat4* d,
+	size_t items,
+	int stride
+) {
+	if (items < 2)
+		return;
+
+	size_t lc_stride = 2;
+	size_t log2_stride = 1;
+
+	// The reduction-tree loop
+	do {
+		size_t step = lc_stride >> 1;
+		size_t start = lc_stride - 1;
+		size_t iters = items >> log2_stride;
+
+		vfloat4 *da = d + (start * stride);
+		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
+		size_t ofs_stride = stride << log2_stride;
+
+		while (iters)
+		{
+			*da = *da + da[ofs];
+			da += ofs_stride;
+			iters--;
+		}
+
+		log2_stride += 1;
+		lc_stride <<= 1;
+	} while (lc_stride <= items);
+
+	// The expansion-tree loop
+	do {
+		log2_stride -= 1;
+		lc_stride >>= 1;
+
+		size_t step = lc_stride >> 1;
+		size_t start = step + lc_stride - 1;
+		size_t iters = (items - step) >> log2_stride;
+
+		vfloat4 *da = d + (start * stride);
+		ptrdiff_t ofs = -static_cast<ptrdiff_t>(step * stride);
+		size_t ofs_stride = stride << log2_stride;
+
+		while (iters)
+		{
+			*da = *da + da[ofs];
+			da += ofs_stride;
+			iters--;
+		}
+	} while (lc_stride > 2);
+}
+
+/* See header for documentation. */
+void compute_pixel_region_variance(
+	astcenc_contexti& ctx,
+	const pixel_region_args& arg
+) {
+	// Unpack the memory structure into local variables
+	const astcenc_image* img = arg.img;
+	astcenc_swizzle swz = arg.swz;
+	bool have_z = arg.have_z;
+
+	int size_x = arg.size_x;
+	int size_y = arg.size_y;
+	int size_z = arg.size_z;
+
+	int offset_x = arg.offset_x;
+	int offset_y = arg.offset_y;
+	int offset_z = arg.offset_z;
+
+	int alpha_kernel_radius = arg.alpha_kernel_radius;
+
+	float* input_alpha_averages = ctx.input_alpha_averages;
+	vfloat4* work_memory = arg.work_memory;
+
+	// Compute memory sizes and dimensions that we need
+	int kernel_radius = alpha_kernel_radius;
+	int kerneldim = 2 * kernel_radius + 1;
+	int kernel_radius_xy = kernel_radius;
+	int kernel_radius_z = have_z ? kernel_radius : 0;
+
+	int padsize_x = size_x + kerneldim;
+	int padsize_y = size_y + kerneldim;
+	int padsize_z = size_z + (have_z ? kerneldim : 0);
+	int sizeprod = padsize_x * padsize_y * padsize_z;
+
+	int zd_start = have_z ? 1 : 0;
+
+	vfloat4 *varbuf1 = work_memory;
+	vfloat4 *varbuf2 = work_memory + sizeprod;
+
+	// Scaling factors to apply to Y and Z for accesses into the work buffers
+	int yst = padsize_x;
+	int zst = padsize_x * padsize_y;
+
+	// Scaling factors to apply to Y and Z for accesses into result buffers
+	int ydt = img->dim_x;
+	int zdt = img->dim_x * img->dim_y;
+
+	// Macros to act as accessor functions for the work-memory
+	#define VARBUF1(z, y, x) varbuf1[z * zst + y * yst + x]
+	#define VARBUF2(z, y, x) varbuf2[z * zst + y * yst + x]
+
+	// Load N and N^2 values into the work buffers
+	if (img->data_type == ASTCENC_TYPE_U8)
+	{
+		// Swizzle data structure 4 = ZERO, 5 = ONE
+		uint8_t data[6];
+		data[ASTCENC_SWZ_0] = 0;
+		data[ASTCENC_SWZ_1] = 255;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			uint8_t* data8 = static_cast<uint8_t*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data8[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data8[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					uint8_t r = data[swz.r];
+					uint8_t g = data[swz.g];
+					uint8_t b = data[swz.b];
+					uint8_t a = data[swz.a];
+
+					vfloat4 d = vfloat4 (r * (1.0f / 255.0f),
+					                     g * (1.0f / 255.0f),
+					                     b * (1.0f / 255.0f),
+					                     a * (1.0f / 255.0f));
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+	else if (img->data_type == ASTCENC_TYPE_F16)
+	{
+		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP16)
+		uint16_t data[6];
+		data[ASTCENC_SWZ_0] = 0;
+		data[ASTCENC_SWZ_1] = 0x3C00;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			uint16_t* data16 = static_cast<uint16_t*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data16[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data16[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					vint4 di(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
+					vfloat4 d = float16_to_float(di);
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+	else // if (img->data_type == ASTCENC_TYPE_F32)
+	{
+		assert(img->data_type == ASTCENC_TYPE_F32);
+
+		// Swizzle data structure 4 = ZERO, 5 = ONE (in FP32)
+		float data[6];
+		data[ASTCENC_SWZ_0] = 0.0f;
+		data[ASTCENC_SWZ_1] = 1.0f;
+
+		for (int z = zd_start; z < padsize_z; z++)
+		{
+			int z_src = (z - zd_start) + offset_z - kernel_radius_z;
+			z_src = astc::clamp(z_src, 0, static_cast<int>(img->dim_z - 1));
+			float* data32 = static_cast<float*>(img->data[z_src]);
+
+			for (int y = 1; y < padsize_y; y++)
+			{
+				int y_src = (y - 1) + offset_y - kernel_radius_xy;
+				y_src = astc::clamp(y_src, 0, static_cast<int>(img->dim_y - 1));
+
+				for (int x = 1; x < padsize_x; x++)
+				{
+					int x_src = (x - 1) + offset_x - kernel_radius_xy;
+					x_src = astc::clamp(x_src, 0, static_cast<int>(img->dim_x - 1));
+
+					data[0] = data32[(4 * img->dim_x * y_src) + (4 * x_src    )];
+					data[1] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 1)];
+					data[2] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 2)];
+					data[3] = data32[(4 * img->dim_x * y_src) + (4 * x_src + 3)];
+
+					float r = data[swz.r];
+					float g = data[swz.g];
+					float b = data[swz.b];
+					float a = data[swz.a];
+
+					vfloat4 d(r, g, b, a);
+
+					VARBUF1(z, y, x) = d;
+					VARBUF2(z, y, x) = d * d;
+				}
+			}
+		}
+	}
+
+	// Pad with an extra layer of 0s; this forms the edge of the SAT tables
+	vfloat4 vbz = vfloat4::zero();
+	for (int z = 0; z < padsize_z; z++)
+	{
+		for (int y = 0; y < padsize_y; y++)
+		{
+			VARBUF1(z, y, 0) = vbz;
+			VARBUF2(z, y, 0) = vbz;
+		}
+
+		for (int x = 0; x < padsize_x; x++)
+		{
+			VARBUF1(z, 0, x) = vbz;
+			VARBUF2(z, 0, x) = vbz;
+		}
+	}
+
+	if (have_z)
+	{
+		for (int y = 0; y < padsize_y; y++)
+		{
+			for (int x = 0; x < padsize_x; x++)
+			{
+				VARBUF1(0, y, x) = vbz;
+				VARBUF2(0, y, x) = vbz;
+			}
+		}
+	}
+
+	// Generate summed-area tables for N and N^2; this is done in-place, using
+	// a Brent-Kung parallel-prefix based algorithm to minimize precision loss
+	for (int z = zd_start; z < padsize_z; z++)
+	{
+		for (int y = 1; y < padsize_y; y++)
+		{
+			brent_kung_prefix_sum(&(VARBUF1(z, y, 1)), padsize_x - 1, 1);
+			brent_kung_prefix_sum(&(VARBUF2(z, y, 1)), padsize_x - 1, 1);
+		}
+	}
+
+	for (int z = zd_start; z < padsize_z; z++)
+	{
+		for (int x = 1; x < padsize_x; x++)
+		{
+			brent_kung_prefix_sum(&(VARBUF1(z, 1, x)), padsize_y - 1, yst);
+			brent_kung_prefix_sum(&(VARBUF2(z, 1, x)), padsize_y - 1, yst);
+		}
+	}
+
+	if (have_z)
+	{
+		for (int y = 1; y < padsize_y; y++)
+		{
+			for (int x = 1; x < padsize_x; x++)
+			{
+				brent_kung_prefix_sum(&(VARBUF1(1, y, x)), padsize_z - 1, zst);
+				brent_kung_prefix_sum(&(VARBUF2(1, y, x)), padsize_z - 1, zst);
+			}
+		}
+	}
+
+	// Compute a few constants used in the variance-calculation.
+	float alpha_kdim = static_cast<float>(2 * alpha_kernel_radius + 1);
+	float alpha_rsamples;
+
+	if (have_z)
+	{
+		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim * alpha_kdim);
+	}
+	else
+	{
+		alpha_rsamples = 1.0f / (alpha_kdim * alpha_kdim);
+	}
+
+	// Use the summed-area tables to compute variance for each neighborhood
+	if (have_z)
+	{
+		for (int z = 0; z < size_z; z++)
+		{
+			int z_src = z + kernel_radius_z;
+			int z_dst = z + offset_z;
+			int z_low  = z_src - alpha_kernel_radius;
+			int z_high = z_src + alpha_kernel_radius + 1;
+
+			for (int y = 0; y < size_y; y++)
+			{
+				int y_src = y + kernel_radius_xy;
+				int y_dst = y + offset_y;
+				int y_low  = y_src - alpha_kernel_radius;
+				int y_high = y_src + alpha_kernel_radius + 1;
+
+				for (int x = 0; x < size_x; x++)
+				{
+					int x_src = x + kernel_radius_xy;
+					int x_dst = x + offset_x;
+					int x_low  = x_src - alpha_kernel_radius;
+					int x_high = x_src + alpha_kernel_radius + 1;
+
+					// Summed-area table lookups for alpha average
+					float vasum = (  VARBUF1(z_high, y_low,  x_low).lane<3>()
+					               - VARBUF1(z_high, y_low,  x_high).lane<3>()
+					               - VARBUF1(z_high, y_high, x_low).lane<3>()
+					               + VARBUF1(z_high, y_high, x_high).lane<3>()) -
+					              (  VARBUF1(z_low,  y_low,  x_low).lane<3>()
+					               - VARBUF1(z_low,  y_low,  x_high).lane<3>()
+					               - VARBUF1(z_low,  y_high, x_low).lane<3>()
+					               + VARBUF1(z_low,  y_high, x_high).lane<3>());
+
+					int out_index = z_dst * zdt + y_dst * ydt + x_dst;
+					input_alpha_averages[out_index] = (vasum * alpha_rsamples);
+				}
+			}
+		}
+	}
+	else
+	{
+		for (int y = 0; y < size_y; y++)
+		{
+			int y_src = y + kernel_radius_xy;
+			int y_dst = y + offset_y;
+			int y_low  = y_src - alpha_kernel_radius;
+			int y_high = y_src + alpha_kernel_radius + 1;
+
+			for (int x = 0; x < size_x; x++)
+			{
+				int x_src = x + kernel_radius_xy;
+				int x_dst = x + offset_x;
+				int x_low  = x_src - alpha_kernel_radius;
+				int x_high = x_src + alpha_kernel_radius + 1;
+
+				// Summed-area table lookups for alpha average
+				float vasum = VARBUF1(0, y_low,  x_low).lane<3>()
+				            - VARBUF1(0, y_low,  x_high).lane<3>()
+				            - VARBUF1(0, y_high, x_low).lane<3>()
+				            + VARBUF1(0, y_high, x_high).lane<3>();
+
+				int out_index = y_dst * ydt + x_dst;
+				input_alpha_averages[out_index] = (vasum * alpha_rsamples);
+			}
+		}
+	}
+}
+
+/* See header for documentation. */
+unsigned int init_compute_averages(
+	const astcenc_image& img,
+	unsigned int alpha_kernel_radius,
+	const astcenc_swizzle& swz,
+	avg_args& ag
+) {
+	unsigned int size_x = img.dim_x;
+	unsigned int size_y = img.dim_y;
+	unsigned int size_z = img.dim_z;
+
+	// Compute maximum block size and from that the working memory buffer size
+	unsigned int kernel_radius = alpha_kernel_radius;
+	unsigned int kerneldim = 2 * kernel_radius + 1;
+
+	bool have_z = (size_z > 1);
+	unsigned int max_blk_size_xy = have_z ? 16 : 32;
+	unsigned int max_blk_size_z = astc::min(size_z, have_z ? 16u : 1u);
+
+	unsigned int max_padsize_xy = max_blk_size_xy + kerneldim;
+	unsigned int max_padsize_z = max_blk_size_z + (have_z ? kerneldim : 0);
+
+	// Perform block-wise average calculations across the image
+	// Initialize fields which are not populated until later
+	ag.arg.size_x = 0;
+	ag.arg.size_y = 0;
+	ag.arg.size_z = 0;
+	ag.arg.offset_x = 0;
+	ag.arg.offset_y = 0;
+	ag.arg.offset_z = 0;
+	ag.arg.work_memory = nullptr;
+
+	ag.arg.img = &img;
+	ag.arg.swz = swz;
+	ag.arg.have_z = have_z;
+	ag.arg.alpha_kernel_radius = alpha_kernel_radius;
+
+	ag.img_size_x = size_x;
+	ag.img_size_y = size_y;
+	ag.img_size_z = size_z;
+	ag.blk_size_xy = max_blk_size_xy;
+	ag.blk_size_z = max_blk_size_z;
+	ag.work_memory_size = 2 * max_padsize_xy * max_padsize_xy * max_padsize_z;
+
+	// The parallel task count
+	unsigned int z_tasks = (size_z + max_blk_size_z - 1) / max_blk_size_z;
+	unsigned int y_tasks = (size_y + max_blk_size_xy - 1) / max_blk_size_xy;
+	return z_tasks * y_tasks;
+}
+
+#endif
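Editor's note, not part of the diff: the core trick above is easier to see on plain scalars. The sketch below is an illustration only; build_sat, box_average and the 4x4 test image are hypothetical names invented for this note, it uses a simple serial prefix sum instead of the Brent-Kung adder, stores one float per texel rather than a vfloat4 per component, and builds the table with the direct 2D inclusion-exclusion recurrence rather than the separable row/column passes used in the file. What it demonstrates is the same property the vasum lookups rely on: once a zero-padded summed-area table exists, the average over any window costs four corner reads plus one multiply, independent of the window size.

// Standalone illustration of a zero-padded 2D summed-area table and the
// four-corner box-average lookup. Not part of the astcenc sources.
#include <cstdio>
#include <vector>

// SAT is (h+1) x (w+1); row 0 and column 0 stay zero, mirroring the
// explicit padding layer written before the prefix sums in the file above.
static std::vector<float> build_sat(const std::vector<float>& img, int w, int h)
{
	std::vector<float> sat(static_cast<size_t>(w + 1) * (h + 1), 0.0f);
	for (int y = 1; y <= h; y++)
	{
		for (int x = 1; x <= w; x++)
		{
			sat[y * (w + 1) + x] = img[(y - 1) * w + (x - 1)]
			                     + sat[(y - 1) * (w + 1) + x]
			                     + sat[y * (w + 1) + (x - 1)]
			                     - sat[(y - 1) * (w + 1) + (x - 1)];
		}
	}
	return sat;
}

// Average over the half-open box [x0, x1) x [y0, y1) via the classic
// inclusion-exclusion identity; this is the 2D analogue of the vasum lookup.
static float box_average(const std::vector<float>& sat, int w,
                         int x0, int y0, int x1, int y1)
{
	float sum = sat[y1 * (w + 1) + x1] - sat[y0 * (w + 1) + x1]
	          - sat[y1 * (w + 1) + x0] + sat[y0 * (w + 1) + x0];
	return sum / static_cast<float>((x1 - x0) * (y1 - y0));
}

int main()
{
	const int w = 4, h = 4;
	std::vector<float> img = {
		 1,  2,  3,  4,
		 5,  6,  7,  8,
		 9, 10, 11, 12,
		13, 14, 15, 16
	};

	std::vector<float> sat = build_sat(img, w, h);

	// Average of the top-left 3x3 block: box [0, 3) x [0, 3)
	std::printf("3x3 average = %f\n", box_average(sat, w, 0, 0, 3, 3));
	return 0;
}

Compiling and running this prints 6.0 for the top-left 3x3 block, which matches the direct average of those nine pixels; the astcenc code performs the same kind of lookup per output texel (extended to 3D when have_z is set) to fill input_alpha_averages.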