summaryrefslogtreecommitdiff
path: root/thirdparty/basis_universal/encoder/basisu_enc.h
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/basis_universal/encoder/basisu_enc.h')
-rw-r--r--thirdparty/basis_universal/encoder/basisu_enc.h409
1 files changed, 365 insertions, 44 deletions
diff --git a/thirdparty/basis_universal/encoder/basisu_enc.h b/thirdparty/basis_universal/encoder/basisu_enc.h
index 0ce011452d..0efeaa461f 100644
--- a/thirdparty/basis_universal/encoder/basisu_enc.h
+++ b/thirdparty/basis_universal/encoder/basisu_enc.h
@@ -33,14 +33,23 @@
// If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time.
#define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0)
+#if BASISU_SUPPORT_SSE
+// Declared in basisu_kernels_imp.h, but we can't include that here otherwise it would lead to circular type errors.
+extern void update_covar_matrix_16x16_sse41(uint32_t num_vecs, const void* pWeighted_vecs, const void* pOrigin, const uint32_t *pVec_indices, void* pMatrix16x16);
+#endif
+
namespace basisu
{
extern uint8_t g_hamming_dist[256];
extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8];
+ // true if basisu_encoder_init() has been called and returned.
+ extern bool g_library_initialized;
+
// Encoder library initialization.
// This function MUST be called before encoding anything!
- void basisu_encoder_init();
+ void basisu_encoder_init(bool use_opencl = false, bool opencl_force_serialization = false);
+ void basisu_encoder_deinit();
// basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1
extern void detect_sse41();
@@ -51,8 +60,9 @@ namespace basisu
const bool g_cpu_supports_sse41 = false;
#endif
+ void error_vprintf(const char* pFmt, va_list args);
void error_printf(const char *pFmt, ...);
-
+
// Helpers
inline uint8_t clamp255(int32_t i)
@@ -170,18 +180,24 @@ namespace basisu
class running_stat
{
public:
- running_stat() :
- m_n(0),
- m_old_m(0), m_new_m(0), m_old_s(0), m_new_s(0)
- {
- }
+ running_stat() { clear(); }
+
void clear()
{
m_n = 0;
+ m_total = 0;
+ m_old_m = 0;
+ m_new_m = 0;
+ m_old_s = 0;
+ m_new_s = 0;
+ m_min = 0;
+ m_max = 0;
}
+
void push(double x)
{
m_n++;
+ m_total += x;
if (m_n == 1)
{
m_old_m = m_new_m = x;
@@ -191,6 +207,7 @@ namespace basisu
}
else
{
+ // See Knuth TAOCP vol 2, 3rd edition, page 232
m_new_m = m_old_m + (x - m_old_m) / m_n;
m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m);
m_old_m = m_new_m;
@@ -199,15 +216,23 @@ namespace basisu
m_max = basisu::maximum(x, m_max);
}
}
+
uint32_t get_num() const
{
return m_n;
}
+
+ double get_total() const
+ {
+ return m_total;
+ }
+
double get_mean() const
{
return (m_n > 0) ? m_new_m : 0.0;
}
+ // Returns sample variance
double get_variance() const
{
return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0);
@@ -230,7 +255,7 @@ namespace basisu
private:
uint32_t m_n;
- double m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max;
+ double m_total, m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max;
};
// Linear algebra
@@ -401,6 +426,8 @@ namespace basisu
typedef vec<3, float> vec3F;
typedef vec<2, float> vec2F;
typedef vec<1, float> vec1F;
+
+ typedef vec<16, float> vec16F;
template <uint32_t Rows, uint32_t Cols, typename T>
class matrix
@@ -504,6 +531,164 @@ namespace basisu
[pKeys](uint32_t a, uint32_t b) { return pKeys[a] < pKeys[b]; }
);
}
+
+ // 1-4 byte direct Radix sort.
+ template <typename T>
+ T* radix_sort(uint32_t num_vals, T* pBuf0, T* pBuf1, uint32_t key_ofs, uint32_t key_size)
+ {
+ assert(key_ofs < sizeof(T));
+ assert((key_size >= 1) && (key_size <= 4));
+
+ uint32_t hist[256 * 4];
+
+ memset(hist, 0, sizeof(hist[0]) * 256 * key_size);
+
+#define BASISU_GET_KEY(p) (*(uint32_t *)((uint8_t *)(p) + key_ofs))
+
+ if (key_size == 4)
+ {
+ T* p = pBuf0;
+ T* q = pBuf0 + num_vals;
+ for (; p != q; p++)
+ {
+ const uint32_t key = BASISU_GET_KEY(p);
+
+ hist[key & 0xFF]++;
+ hist[256 + ((key >> 8) & 0xFF)]++;
+ hist[512 + ((key >> 16) & 0xFF)]++;
+ hist[768 + ((key >> 24) & 0xFF)]++;
+ }
+ }
+ else if (key_size == 3)
+ {
+ T* p = pBuf0;
+ T* q = pBuf0 + num_vals;
+ for (; p != q; p++)
+ {
+ const uint32_t key = BASISU_GET_KEY(p);
+
+ hist[key & 0xFF]++;
+ hist[256 + ((key >> 8) & 0xFF)]++;
+ hist[512 + ((key >> 16) & 0xFF)]++;
+ }
+ }
+ else if (key_size == 2)
+ {
+ T* p = pBuf0;
+ T* q = pBuf0 + (num_vals >> 1) * 2;
+
+ for (; p != q; p += 2)
+ {
+ const uint32_t key0 = BASISU_GET_KEY(p);
+ const uint32_t key1 = BASISU_GET_KEY(p + 1);
+
+ hist[key0 & 0xFF]++;
+ hist[256 + ((key0 >> 8) & 0xFF)]++;
+
+ hist[key1 & 0xFF]++;
+ hist[256 + ((key1 >> 8) & 0xFF)]++;
+ }
+
+ if (num_vals & 1)
+ {
+ const uint32_t key = BASISU_GET_KEY(p);
+
+ hist[key & 0xFF]++;
+ hist[256 + ((key >> 8) & 0xFF)]++;
+ }
+ }
+ else
+ {
+ assert(key_size == 1);
+ if (key_size != 1)
+ return NULL;
+
+ T* p = pBuf0;
+ T* q = pBuf0 + (num_vals >> 1) * 2;
+
+ for (; p != q; p += 2)
+ {
+ const uint32_t key0 = BASISU_GET_KEY(p);
+ const uint32_t key1 = BASISU_GET_KEY(p + 1);
+
+ hist[key0 & 0xFF]++;
+ hist[key1 & 0xFF]++;
+ }
+
+ if (num_vals & 1)
+ {
+ const uint32_t key = BASISU_GET_KEY(p);
+ hist[key & 0xFF]++;
+ }
+ }
+
+ T* pCur = pBuf0;
+ T* pNew = pBuf1;
+
+ for (uint32_t pass = 0; pass < key_size; pass++)
+ {
+ const uint32_t* pHist = &hist[pass << 8];
+
+ uint32_t offsets[256];
+
+ uint32_t cur_ofs = 0;
+ for (uint32_t i = 0; i < 256; i += 2)
+ {
+ offsets[i] = cur_ofs;
+ cur_ofs += pHist[i];
+
+ offsets[i + 1] = cur_ofs;
+ cur_ofs += pHist[i + 1];
+ }
+
+ const uint32_t pass_shift = pass << 3;
+
+ T* p = pCur;
+ T* q = pCur + (num_vals >> 1) * 2;
+
+ for (; p != q; p += 2)
+ {
+ uint32_t c0 = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;
+ uint32_t c1 = (BASISU_GET_KEY(p + 1) >> pass_shift) & 0xFF;
+
+ if (c0 == c1)
+ {
+ uint32_t dst_offset0 = offsets[c0];
+
+ offsets[c0] = dst_offset0 + 2;
+
+ pNew[dst_offset0] = p[0];
+ pNew[dst_offset0 + 1] = p[1];
+ }
+ else
+ {
+ uint32_t dst_offset0 = offsets[c0]++;
+ uint32_t dst_offset1 = offsets[c1]++;
+
+ pNew[dst_offset0] = p[0];
+ pNew[dst_offset1] = p[1];
+ }
+ }
+
+ if (num_vals & 1)
+ {
+ uint32_t c = (BASISU_GET_KEY(p) >> pass_shift) & 0xFF;
+
+ uint32_t dst_offset = offsets[c];
+ offsets[c] = dst_offset + 1;
+
+ pNew[dst_offset] = *p;
+ }
+
+ T* t = pCur;
+ pCur = pNew;
+ pNew = t;
+ }
+
+ return pCur;
+ }
+
+#undef BASISU_GET_KEY
// Very simple job pool with no dependencies.
class job_pool
@@ -805,17 +990,28 @@ namespace basisu
int dg = e1.g - e2.g;
int db = e1.b - e2.b;
+#if 0
int delta_l = dr * 27 + dg * 92 + db * 9;
int delta_cr = dr * 128 - delta_l;
int delta_cb = db * 128 - delta_l;
-
+
uint32_t id = ((uint32_t)(delta_l * delta_l) >> 7U) +
((((uint32_t)(delta_cr * delta_cr) >> 7U) * 26U) >> 7U) +
((((uint32_t)(delta_cb * delta_cb) >> 7U) * 3U) >> 7U);
+#else
+ int64_t delta_l = dr * 27 + dg * 92 + db * 9;
+ int64_t delta_cr = dr * 128 - delta_l;
+ int64_t delta_cb = db * 128 - delta_l;
+
+ uint32_t id = ((uint32_t)((delta_l * delta_l) >> 7U)) +
+ ((((uint32_t)((delta_cr * delta_cr) >> 7U)) * 26U) >> 7U) +
+ ((((uint32_t)((delta_cb * delta_cb) >> 7U)) * 3U) >> 7U);
+#endif
if (alpha)
{
int da = (e1.a - e2.a) << 7;
+ // This shouldn't overflow if da is 255 or -255: 29.99 bits after squaring.
id += ((uint32_t)(da * da) >> 7U);
}
@@ -1258,7 +1454,7 @@ namespace basisu
{
codebook.resize(codebook.size() + 1);
codebook.back() = cur.m_training_vecs;
-
+
if (node_stack.empty())
break;
@@ -1295,6 +1491,9 @@ namespace basisu
uint32_t total_leaf_nodes = 1;
+ //interval_timer tm;
+ //tm.start();
+
while ((var_heap.size()) && (total_leaf_nodes < max_size))
{
const uint32_t node_index = var_heap.get_top_index();
@@ -1315,6 +1514,8 @@ namespace basisu
}
}
+ //debug_printf("tree_vector_quant::generate %u: %3.3f secs\n", TrainingVectorType::num_elements, tm.get_elapsed_secs());
+
return true;
}
@@ -1443,17 +1644,32 @@ namespace basisu
{
const uint32_t N = TrainingVectorType::num_elements;
- matrix<N, N, float> cmatrix(cZero);
+ matrix<N, N, float> cmatrix;
- // Compute covariance matrix from weighted input vectors
- for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+ if ((N != 16) || (!g_cpu_supports_sse41))
{
- const TrainingVectorType v(m_training_vecs[node.m_training_vecs[i]].first - node.m_origin);
- const TrainingVectorType w(static_cast<float>(m_training_vecs[node.m_training_vecs[i]].second) * v);
+ cmatrix.set_zero();
+
+ // Compute covariance matrix from weighted input vectors
+ for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+ {
+ const TrainingVectorType v(m_training_vecs[node.m_training_vecs[i]].first - node.m_origin);
+ const TrainingVectorType w(static_cast<float>(m_training_vecs[node.m_training_vecs[i]].second) * v);
- for (uint32_t x = 0; x < N; x++)
- for (uint32_t y = x; y < N; y++)
- cmatrix[x][y] = cmatrix[x][y] + v[x] * w[y];
+ for (uint32_t x = 0; x < N; x++)
+ for (uint32_t y = x; y < N; y++)
+ cmatrix[x][y] = cmatrix[x][y] + v[x] * w[y];
+ }
+ }
+ else
+ {
+#if BASISU_SUPPORT_SSE
+ // Specialize the case with 16x16 matrices, which are quite expensive without SIMD.
+ // This SSE function takes pointers to void types, so do some sanity checks.
+ assert(sizeof(TrainingVectorType) == sizeof(float) * 16);
+ assert(sizeof(training_vec_with_weight) == sizeof(std::pair<vec16F, uint64_t>));
+ update_covar_matrix_16x16_sse41(node.m_training_vecs.size(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix);
+#endif
}
const float renorm_scale = 1.0f / node.m_weight;
@@ -1632,16 +1848,19 @@ namespace basisu
}
}
+ // Node is unsplittable using the above algorithm - try something else to split it up.
if ((!l_weight) || (!r_weight))
{
l_children.resize(0);
new_l_child.set(0.0f);
l_ttsum = 0.0f;
l_weight = 0;
+
r_children.resize(0);
new_r_child.set(0.0f);
r_ttsum = 0.0f;
r_weight = 0;
+
TrainingVectorType firstVec;
for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
{
@@ -1847,31 +2066,67 @@ namespace basisu
uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
basisu::vector<uint_vec>& codebook,
basisu::vector<uint_vec>& parent_codebook,
- uint32_t max_threads, job_pool *pJob_pool)
+ uint32_t max_threads, job_pool *pJob_pool,
+ bool even_odd_input_pairs_equal)
{
typedef bit_hasher<typename Quantizer::training_vec_type> training_vec_bit_hasher;
+
typedef std::unordered_map < typename Quantizer::training_vec_type, weighted_block_group,
training_vec_bit_hasher> group_hash;
+ //interval_timer tm;
+ //tm.start();
+
group_hash unique_vecs;
- weighted_block_group g;
- g.m_indices.resize(1);
+ unique_vecs.reserve(20000);
- for (uint32_t i = 0; i < q.get_training_vecs().size(); i++)
+ weighted_block_group g;
+
+ if (even_odd_input_pairs_equal)
{
- g.m_total_weight = q.get_training_vecs()[i].second;
- g.m_indices[0] = i;
+ g.m_indices.resize(2);
- auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));
+ assert(q.get_training_vecs().size() >= 2 && (q.get_training_vecs().size() & 1) == 0);
- if (!ins_res.second)
+ for (uint32_t i = 0; i < q.get_training_vecs().size(); i += 2)
{
- (ins_res.first)->second.m_total_weight += g.m_total_weight;
- (ins_res.first)->second.m_indices.push_back(i);
+ assert(q.get_training_vecs()[i].first == q.get_training_vecs()[i + 1].first);
+
+ g.m_total_weight = q.get_training_vecs()[i].second + q.get_training_vecs()[i + 1].second;
+ g.m_indices[0] = i;
+ g.m_indices[1] = i + 1;
+
+ auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));
+
+ if (!ins_res.second)
+ {
+ (ins_res.first)->second.m_total_weight += g.m_total_weight;
+ (ins_res.first)->second.m_indices.push_back(i);
+ (ins_res.first)->second.m_indices.push_back(i + 1);
+ }
+ }
+ }
+ else
+ {
+ g.m_indices.resize(1);
+
+ for (uint32_t i = 0; i < q.get_training_vecs().size(); i++)
+ {
+ g.m_total_weight = q.get_training_vecs()[i].second;
+ g.m_indices[0] = i;
+
+ auto ins_res = unique_vecs.insert(std::make_pair(q.get_training_vecs()[i].first, g));
+
+ if (!ins_res.second)
+ {
+ (ins_res.first)->second.m_total_weight += g.m_total_weight;
+ (ins_res.first)->second.m_indices.push_back(i);
+ }
}
}
+ //debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors, %3.3f secs\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size(), tm.get_elapsed_secs());
debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size());
Quantizer group_quant;
@@ -2491,7 +2746,27 @@ namespace basisu
return *this;
}
- image &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba &background = g_black_color)
+ // pPixels MUST have been allocated using malloc() (basisu::vector will eventually use free() on the pointer).
+ image& grant_ownership(color_rgba* pPixels, uint32_t w, uint32_t h, uint32_t p = UINT32_MAX)
+ {
+ if (p == UINT32_MAX)
+ p = w;
+
+ clear();
+
+ if ((!p) || (!w) || (!h))
+ return *this;
+
+ m_pixels.grant_ownership(pPixels, p * h, p * h);
+
+ m_width = w;
+ m_height = h;
+ m_pitch = p;
+
+ return *this;
+ }
+
+ image &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba &background = g_black_color, bool init_image = true)
{
if (p == UINT32_MAX)
p = w;
@@ -2509,15 +2784,25 @@ namespace basisu
cur_state.swap(m_pixels);
m_pixels.resize(p * h);
-
- for (uint32_t y = 0; y < h; y++)
+
+ if (init_image)
{
- for (uint32_t x = 0; x < w; x++)
+ if (m_width || m_height)
{
- if ((x < m_width) && (y < m_height))
- m_pixels[x + y * p] = cur_state[x + y * m_pitch];
- else
- m_pixels[x + y * p] = background;
+ for (uint32_t y = 0; y < h; y++)
+ {
+ for (uint32_t x = 0; x < w; x++)
+ {
+ if ((x < m_width) && (y < m_height))
+ m_pixels[x + y * p] = cur_state[x + y * m_pitch];
+ else
+ m_pixels[x + y * p] = background;
+ }
+ }
+ }
+ else
+ {
+ m_pixels.set_all(background);
}
}
@@ -2590,9 +2875,25 @@ namespace basisu
const image &extract_block_clamped(color_rgba *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const
{
- for (uint32_t y = 0; y < h; y++)
- for (uint32_t x = 0; x < w; x++)
- *pDst++ = get_clamped(src_x + x, src_y + y);
+ if (((src_x + w) > m_width) || ((src_y + h) > m_height))
+ {
+ // Slower clamping case
+ for (uint32_t y = 0; y < h; y++)
+ for (uint32_t x = 0; x < w; x++)
+ *pDst++ = get_clamped(src_x + x, src_y + y);
+ }
+ else
+ {
+ const color_rgba* pSrc = &m_pixels[src_x + src_y * m_pitch];
+
+ for (uint32_t y = 0; y < h; y++)
+ {
+ memcpy(pDst, pSrc, w * sizeof(color_rgba));
+ pSrc += m_pitch;
+ pDst += w;
+ }
+ }
+
return *this;
}
@@ -2947,21 +3248,18 @@ namespace basisu
};
// Image saving/loading/resampling
-
+
bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr);
bool load_png(const char* pFilename, image& img);
inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); }
- bool load_bmp(const char* pFilename, image& img);
- inline bool load_bmp(const std::string &filename, image &img) { return load_bmp(filename.c_str(), img); }
-
bool load_tga(const char* pFilename, image& img);
inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); }
bool load_jpg(const char *pFilename, image& img);
inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }
- // Currently loads .BMP, .PNG, or .TGA.
+ // Currently loads .PNG, .TGA, or .JPG
bool load_image(const char* pFilename, image& img);
inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }
@@ -3129,6 +3427,29 @@ namespace basisu
}
void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1);
+
+ const uint32_t cPixelBlockWidth = 4;
+ const uint32_t cPixelBlockHeight = 4;
+ const uint32_t cPixelBlockTotalPixels = cPixelBlockWidth * cPixelBlockHeight;
+
+ struct pixel_block
+ {
+ color_rgba m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]
+
+ inline const color_rgba& operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
+ inline color_rgba& operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); return m_pixels[y][x]; }
+
+ inline const color_rgba* get_ptr() const { return &m_pixels[0][0]; }
+ inline color_rgba* get_ptr() { return &m_pixels[0][0]; }
+
+ inline void clear() { clear_obj(*this); }
+
+ inline bool operator== (const pixel_block& rhs) const
+ {
+ return memcmp(m_pixels, rhs.m_pixels, sizeof(m_pixels)) == 0;
+ }
+ };
+ typedef basisu::vector<pixel_block> pixel_block_vec;
} // namespace basisu