45 files changed, 48372 insertions, 0 deletions
diff --git a/thirdparty/basis_universal/encoder/apg_bmp.c b/thirdparty/basis_universal/encoder/apg_bmp.c
new file mode 100644
index 0000000000..ef3d015e40
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/apg_bmp.c
@@ -0,0 +1,541 @@
+/*
+BMP File Reader/Writer Implementation
+Anton Gerdelan
+Version: 3
+Licence: see apg_bmp.h
+C99
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_WARNINGS 1
+#endif
+
+#include "apg_bmp.h"
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Maximum pixel dimensions of width or height of an image. Should accommodate max used in graphics APIs.
+   NOTE: 65536*65536 (2^32) is one more than the biggest number storable in 32 bits.
+   This needs to be multiplied by n_channels so actual memory indices are not uint32 but size_t to avoid overflow.
+   Note this will crash stb_image_write et al at maximum size which use 32bits, so reduce max size to accom. */
+#define _BMP_MAX_DIMS 65536
+#define _BMP_FILE_HDR_SZ 14
+#define _BMP_MIN_DIB_HDR_SZ 40
+#define _BMP_MIN_HDR_SZ ( _BMP_FILE_HDR_SZ + _BMP_MIN_DIB_HDR_SZ )
+#define _BMP_MAX_IMAGE_FILE_SIZE (1024ULL*1024ULL*1024ULL)
+
+#pragma pack( push, 1 ) // supported on GCC in addition to individual packing attribs
+/* All BMP files, regardless of type, start with this file header. The reader overlays this struct directly on the file bytes. */
+typedef struct _bmp_file_header_t {
+  char file_type[2];          // magic. must be 'B' then 'M' - checked by _validate_file_hdr()
+  uint32_t file_sz;           // file size as recorded in the header. the reader uses the size of the file on disk instead
+  uint16_t reserved1;         // written as 0 by apg_bmp_write()
+  uint16_t reserved2;         // written as 0 by apg_bmp_write()
+  uint32_t image_data_offset; // offset in bytes from the start of the file to the pixel data or palette indices
+} _bmp_file_header_t;
+
+/* Following the file header is the BMP type header. this is the most commonly used format */
+typedef struct _bmp_dib_BITMAPINFOHEADER_t {
+  uint32_t this_header_sz;
+  int32_t w;                      // in older headers w & h these are shorts and may be unsigned
+  int32_t h;                      // w & h may be negative to indicate a flipped image. the reader uses the magnitudes only
+  uint16_t n_planes;              // must be 1
+  uint16_t bpp;                   // bits per pixel. 1,4,8,16,24,32.
+  uint32_t compression_method;    // 16 and 32-bit images must have a value of 3 here
+  uint32_t image_uncompressed_sz; // not consistently used in the wild, so ignored here.
+  int32_t horiz_pixels_per_meter; // not used.
+  int32_t vert_pixels_per_meter;  // not used.
+  uint32_t n_colours_in_palette;  // not reliably set. any non-zero value makes the reader treat the image as paletted
+  uint32_t n_important_colours;   // not used.
+  /* NOTE(Anton) a DIB header may end here at 40-bytes. be careful using sizeof() */
+  /* if 'compression' value, above, is set to 3 ie the image is 16 or 32-bit, then these colour channel masks follow the headers.
+  these are big-endian order bit masks to assign bits of each pixel to different colours. bits used must be contiguous and not overlap. */
+  uint32_t bitmask_r;
+  uint32_t bitmask_g;
+  uint32_t bitmask_b;
+} _bmp_dib_BITMAPINFOHEADER_t;
+#pragma pack( pop )
+
+typedef enum _bmp_compression_t { // values compared against compression_method in the DIB header
+  BI_RGB            = 0,  // accepted by _validate_dib_hdr(). written by apg_bmp_write() for 3-channel images
+  BI_RLE8           = 1,  // rejected by _validate_dib_hdr()
+  BI_RLE4           = 2,  // rejected by _validate_dib_hdr()
+  BI_BITFIELDS      = 3,  // accepted. colour bitmasks follow the header. written by apg_bmp_write() for 4-channel images
+  BI_JPEG           = 4,  // rejected by _validate_dib_hdr()
+  BI_PNG            = 5,  // rejected by _validate_dib_hdr()
+  BI_ALPHABITFIELDS = 6,  // accepted. handled the same way as BI_BITFIELDS by the reader
+  BI_CMYK           = 11, // rejected by _validate_dib_hdr()
+  BI_CMYKRLE8       = 12, // rejected by _validate_dib_hdr()
+  BI_CMYRLE4        = 13  // rejected by _validate_dib_hdr()
+} _bmp_compression_t;
+
+/* convenience struct and file->memory function. filled in by _read_entire_file() */
+typedef struct _entire_file_t {
+  void* data; // allocated copy of the file contents. the caller of _read_entire_file() must free() it
+  size_t sz;  // size of the file, and of data, in bytes
+} _entire_file_t;
+
+/* Reads the whole of the file called filename into a freshly allocated buffer.
+RETURNS
+- true on success. record->data is allocated memory and must be freed by the caller.
+- false on any error (file cannot be opened or measured, is empty or oversized, allocation fails, short read).
+  Any allocated memory is freed and record is reset to { NULL, 0 } if false is returned */
+static bool _read_entire_file( const char* filename, _entire_file_t* record ) {
+  record->data = NULL; record->sz = 0;
+  FILE* fp = fopen( filename, "rb" );
+  if ( !fp ) { return false; }
+
+  // ftell() reports failure as -1L, so inspect the signed value before converting it to size_t.
+  long file_sz = -1L;
+  if ( 0 == fseek( fp, 0L, SEEK_END ) ) { file_sz = ftell( fp ); }
+  // Immediately bail on errors, empty files, and anything larger than _BMP_MAX_IMAGE_FILE_SIZE.
+  if ( file_sz <= 0 || (unsigned long long)file_sz > _BMP_MAX_IMAGE_FILE_SIZE ) {
+    fclose( fp );
+    return false;
+  }
+
+  void* data = malloc( (size_t)file_sz );
+  size_t nr  = 0;
+  if ( data && 0 == fseek( fp, 0L, SEEK_SET ) ) { nr = fread( data, (size_t)file_sz, 1, fp ); }
+  fclose( fp );
+  if ( 1 != nr ) { free( data ); return false; } // release the buffer on a short read so that nothing leaks
+  record->data = data; record->sz = (size_t)file_sz;
+  return true;
+}
+
+/* Returns true if the header pointer is non-NULL, starts with the 'B','M' magic, and its pixel data offset is not past the end of the file. */
+static bool _validate_file_hdr( _bmp_file_header_t* file_hdr_ptr, size_t file_sz ) {
+  if ( NULL == file_hdr_ptr ) { return false; }
+  const bool magic_ok = ( 'B' == file_hdr_ptr->file_type[0] ) && ( 'M' == file_hdr_ptr->file_type[1] );
+  return magic_ok && file_hdr_ptr->image_data_offset <= file_sz;
+}
+
+/* Returns true if the DIB header is usable: its size fits in the file, the compression method is handled, and w & h are non-zero and within +/-_BMP_MAX_DIMS. */
+static bool _validate_dib_hdr( _bmp_dib_BITMAPINFOHEADER_t* dib_hdr_ptr, size_t file_sz ) {
+  if ( !dib_hdr_ptr ) { return false; }
+  // widen before adding: in 32-bit arithmetic a huge this_header_sz wraps around and slips past this check
+  if ( (uint64_t)_BMP_FILE_HDR_SZ + dib_hdr_ptr->this_header_sz > file_sz ) { return false; }
+  const uint32_t method = dib_hdr_ptr->compression_method;
+  const bool masked     = ( BI_BITFIELDS == method || BI_ALPHABITFIELDS == method );
+  if ( ( 32 == dib_hdr_ptr->bpp || 16 == dib_hdr_ptr->bpp ) && !masked ) { return false; } // 16 and 32-bit images need bitmasks
+  if ( BI_RGB != method && !masked ) { return false; }                                       // every other method is unsupported
+  // Compare against both bounds instead of calling labs(): where long is 32 bits wide labs( INT32_MIN ) overflows.
+  const int32_t w = dib_hdr_ptr->w, h = dib_hdr_ptr->h;
+  if ( 0 == w || 0 == h || w > _BMP_MAX_DIMS || w < -_BMP_MAX_DIMS || h > _BMP_MAX_DIMS || h < -_BMP_MAX_DIMS ) { return false; }
+  /* NOTE(Anton) if images reliably used n_colours_in_palette we could have done a palette/file size integrity check here.
+  because some always set 0 then we have to check every palette indexing as we read them */
+  return true;
+}
+
+/* Returns the index of the lowest set bit of dword (0 is the least significant bit), or (uint32_t)-1 when no bit is set.
+   NOTE(Anton) this could have ifdef branches on different compilers for the intrinsics versions for perf */
+static uint32_t _bitscan( uint32_t dword ) {
+  if ( 0 == dword ) { return (uint32_t)-1; }
+  uint32_t bit_idx = 0;
+  while ( 0 == ( dword & 1u ) ) { dword >>= 1; bit_idx++; }
+  return bit_idx;
+}
+
+/* Loads a BMP file and converts it to tightly-packed pixels with one byte per channel.
+PARAMS
+  * filename - path of the file to read. w, h, n_chans - receive the dimensions in pixels and the channels per output pixel.
+RETURNS
+  * Allocated pixel memory - RGBA for 32bpp sources, RGB for 24, 8, 4, and 1bpp sources. Release it with apg_bmp_free().
+    If decoding of a paletted image has to stop early because a palette entry (or, for 1bpp, a source byte) lies outside
+    the file, the pixels decoded so far are returned and the remainder of the buffer is zero.
+  * NULL if any argument is NULL, the file cannot be read, or its headers are invalid or describe an unsupported format. */
+unsigned char* apg_bmp_read( const char* filename, int* w, int* h, unsigned int* n_chans ) {
+  if ( !filename || !w || !h || !n_chans ) { return NULL; }
+
+  // read in the whole file into memory first - much faster than parsing on-the-fly
+  _entire_file_t record;
+  if ( !_read_entire_file( filename, &record ) ) { return NULL; }
+  if ( record.sz < _BMP_MIN_HDR_SZ ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // grab and validate the first, file, header
+  _bmp_file_header_t* file_hdr_ptr = (_bmp_file_header_t*)record.data;
+  if ( !_validate_file_hdr( file_hdr_ptr, record.sz ) ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // grab and validate the second, DIB, header
+  _bmp_dib_BITMAPINFOHEADER_t* dib_hdr_ptr = (_bmp_dib_BITMAPINFOHEADER_t*)( (uint8_t*)record.data + _BMP_FILE_HDR_SZ );
+  if ( !_validate_dib_hdr( dib_hdr_ptr, record.sz ) ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // bitmaps can have negative dims to indicate the image should be flipped
+  uint32_t width = *w = abs( dib_hdr_ptr->w );
+  uint32_t height = *h = abs( dib_hdr_ptr->h );
+
+  // TODO(Anton) flip image memory at the end if this is true. because doing it per row was making me write bugs.
+  // bool vertically_flip = dib_hdr_ptr->h > 0 ? false : true;
+
+  // channel count and palette are not well defined in the header so we make a good guess here
+  uint32_t n_dst_chans = 3, n_src_chans = 3;
+  bool has_palette = false;
+  switch ( dib_hdr_ptr->bpp ) {
+  case 32: n_dst_chans = n_src_chans = 4; break; // technically can be RGB but not supported
+  case 24: n_dst_chans = n_src_chans = 3; break; // technically can be RGBA but not supported
+  case 8:                                        // seems to always use a BGR0 palette, even for greyscale
+  case 4:                                        // always has a palette - needed for a MS-saved BMP
+  case 1:                                        // 1-bpp means the palette has 3 colour channels with 2 colours i.e. monochrome but not always black & white
+    n_dst_chans = 3;
+    has_palette = true;
+    n_src_chans = 1;
+    break;
+  default: // this includes 2bpp and 16bpp
+    free( record.data );
+    return NULL;
+  } // endswitch
+  *n_chans = n_dst_chans;
+  // NOTE(Anton) some image formats are not allowed a palette - could check for a bad header spec here also
+  if ( dib_hdr_ptr->n_colours_in_palette > 0 ) { has_palette = true; }
+
+#ifdef APG_BMP_DEBUG_OUTPUT
+  printf( "apg_bmp_debug: reading image\n|-filename `%s`\n|-dims %ux%u pixels\n|-bpp %u\n|-n_src_chans %u\n|-n_dst_chans %u\n", filename, *w, *h,
+    dib_hdr_ptr->bpp, n_src_chans, n_dst_chans );
+#endif
+
+  uint32_t palette_offset = _BMP_FILE_HDR_SZ + dib_hdr_ptr->this_header_sz;
+  bool has_bitmasks       = false;
+  if ( BI_BITFIELDS == dib_hdr_ptr->compression_method || BI_ALPHABITFIELDS == dib_hdr_ptr->compression_method ) {
+    has_bitmasks = true;
+    palette_offset += 12;
+  }
+  if ( palette_offset > record.sz || ( has_bitmasks && record.sz < _BMP_FILE_HDR_SZ + sizeof( _bmp_dib_BITMAPINFOHEADER_t ) ) ) { // the masks read below must lie inside the file
+    free( record.data );
+    return NULL;
+  }
+
+  // work out if any padding how much to skip at end of each row
+  uint32_t unpadded_row_sz = width * n_src_chans;
+  // bit-encoded palette indices have different padding properties
+  if ( 4 == dib_hdr_ptr->bpp ) {
+    unpadded_row_sz = width % 2 > 0 ? width / 2 + 1 : width / 2; // find how many whole bytes required for this bit width
+  }
+  if ( 1 == dib_hdr_ptr->bpp ) {
+    unpadded_row_sz = width % 8 > 0 ? width / 8 + 1 : width / 8; // find how many whole bytes required for this bit width
+  }
+  uint32_t row_padding_sz = 0 == unpadded_row_sz % 4 ? 0 : 4 - ( unpadded_row_sz % 4 ); // NOTE(Anton) didn't expect operator precedence of - over %
+
+  // another file size integrity check: partially validate source image data size
+  // 'image_data_offset' is by row padded to 4 bytes and is either colour data or palette indices. 64-bit maths so large dims cannot wrap.
+  if ( (uint64_t)file_hdr_ptr->image_data_offset + (uint64_t)( unpadded_row_sz + row_padding_sz ) * height > record.sz ) {
+    free( record.data );
+    return NULL;
+  }
+
+  // find which bit number each colour channel starts at. an empty mask keeps shift 0 - _bitscan() gives (uint32_t)-1 for it, which is not a valid shift count
+  uint32_t bitshift_rgba[4] = {0, 0, 0, 0}; // NOTE(Anton) noticed this was int and not uint32_t so changed it. 17 Mar 2020
+  uint32_t bitmask_a        = 0;
+  if ( has_bitmasks ) {
+    bitmask_a        = ~( dib_hdr_ptr->bitmask_r | dib_hdr_ptr->bitmask_g | dib_hdr_ptr->bitmask_b );
+    bitshift_rgba[0] = dib_hdr_ptr->bitmask_r ? _bitscan( dib_hdr_ptr->bitmask_r ) : 0;
+    bitshift_rgba[1] = dib_hdr_ptr->bitmask_g ? _bitscan( dib_hdr_ptr->bitmask_g ) : 0;
+    bitshift_rgba[2] = dib_hdr_ptr->bitmask_b ? _bitscan( dib_hdr_ptr->bitmask_b ) : 0;
+    bitshift_rgba[3] = bitmask_a ? _bitscan( bitmask_a ) : 0;
+  }
+
+  // allocate zeroed memory for the output pixels block, so that an early return of partial data never exposes uninitialised heap contents
+  unsigned char* dst_img_ptr = calloc( (size_t)width * (size_t)height, (size_t)n_dst_chans );
+  if ( !dst_img_ptr ) {
+    free( record.data );
+    return NULL;
+  }
+
+  uint8_t* palette_data_ptr = (uint8_t*)record.data + palette_offset;
+  uint8_t* src_img_ptr      = (uint8_t*)record.data + file_hdr_ptr->image_data_offset;
+  size_t dst_stride_sz      = width * n_dst_chans;
+
+  //   == 32-bpp -> 32-bit RGBA. == 32-bit and 16-bit require bitmasks
+  if ( 32 == dib_hdr_ptr->bpp ) {
+    // check source image has enough data in it to read from
+    if ( (size_t)file_hdr_ptr->image_data_offset + (size_t)height * (size_t)width * (size_t)n_src_chans > record.sz ) {
+      free( record.data );
+      free( dst_img_ptr );
+      return NULL;
+    }
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      size_t dst_pixels_idx = r * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        uint32_t pixel;
+        memcpy( &pixel, &src_img_ptr[src_byte_idx], 4 );
+        // NOTE(Anton) the below assumes 32-bits is always RGBA 1 byte per channel. 10,10,10 RGB exists though and isn't handled.
+        dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_r ) >> bitshift_rgba[0] );
+        dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_g ) >> bitshift_rgba[1] );
+        dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & dib_hdr_ptr->bitmask_b ) >> bitshift_rgba[2] );
+        dst_img_ptr[dst_pixels_idx++] = ( uint8_t )( ( pixel & bitmask_a ) >> bitshift_rgba[3] );
+        src_byte_idx += 4;
+      }
+      src_byte_idx += row_padding_sz;
+    }
+
+    // == 8-bpp -> 24-bit RGB ==
+  } else if ( 8 == dib_hdr_ptr->bpp && has_palette ) {
+    // validate indices (body of image data) fits in file
+    if ( (uint64_t)file_hdr_ptr->image_data_offset + (uint64_t)height * width > record.sz ) {
+      free( record.data );
+      free( dst_img_ptr );
+      return NULL;
+    }
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        // "most palettes are 4 bytes in RGB0 order but 3 for..." - it was actually BRG0 in old images -- Anton
+        uint8_t index = src_img_ptr[src_byte_idx]; // 8-bit index value per pixel
+
+        if ( palette_offset + index * 4 + 2 >= record.sz ) {
+          free( record.data );
+          return dst_img_ptr;
+        }
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 2];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 1];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[index * 4 + 0];
+        src_byte_idx++;
+      }
+      src_byte_idx += row_padding_sz;
+    }
+
+    // == 4-bpp (16-colour) -> 24-bit RGB ==
+  } else if ( 4 == dib_hdr_ptr->bpp && has_palette ) {
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        if ( file_hdr_ptr->image_data_offset + src_byte_idx >= record.sz ) { // >= because the byte at this index is read next
+          free( record.data );
+          free( dst_img_ptr );
+          return NULL;
+        }
+        // handle 2 pixels at a time
+        uint8_t pixel_duo = src_img_ptr[src_byte_idx];
+        uint8_t a_index   = ( 0xFF & pixel_duo ) >> 4;
+        uint8_t b_index   = 0xF & pixel_duo;
+
+        if ( palette_offset + a_index * 4 + 2 >= record.sz ) { // invalid src image
+          free( record.data );
+          return dst_img_ptr;
+        }
+        if ( dst_pixels_idx + 3 > (size_t)width * height * n_dst_chans ) { // done
+          free( record.data );
+          return dst_img_ptr;
+        }
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 2];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 1];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[a_index * 4 + 0];
+        if ( ++c >= width ) { // advance a column
+          c = 0;
+          r++;
+          if ( r >= height ) { // done. no need to get second pixel. eg a 1x1 pixel image.
+            free( record.data );
+            return dst_img_ptr;
+          }
+          dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+        }
+
+        if ( palette_offset + b_index * 4 + 2 >= record.sz ) { // invalid src image
+          free( record.data );
+          return dst_img_ptr;
+        }
+        if ( dst_pixels_idx + 3 > (size_t)width * height * n_dst_chans ) { // done. probably redundant check since checking r >= height.
+          free( record.data );
+          return dst_img_ptr;
+        }
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 2];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 1];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[b_index * 4 + 0];
+        src_byte_idx++;
+      }
+      src_byte_idx += row_padding_sz;
+    }
+
+    // == 1-bpp -> 24-bit RGB ==
+  } else if ( 1 == dib_hdr_ptr->bpp && has_palette ) {
+    /* encoding method for monochrome is not well documented.
+    a 2x2 pixel image is stored as 4 1-bit palette indexes
+    the palette is stored as any 2 RGB0 colours (not necessarily B&W)
+    so for an image with indexes like so:
+    1 1
+    0 1
+    it is bit-encoded as follows, starting at MSB:
+    01000000 00000000 00000000 00000000 (first byte val  64)
+    11000000 00000000 00000000 00000000 (first byte val 192)
+    data is still split by row and each row padded to 4 byte multiples
+     */
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      uint8_t bit_idx       = 0; // used in monochrome
+      size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        if ( 8 == bit_idx ) { // start reading from the next byte
+          src_byte_idx++;
+          bit_idx = 0;
+        }
+        if ( file_hdr_ptr->image_data_offset + src_byte_idx >= record.sz ) { // >= because the byte at this index is read next
+          free( record.data );
+          return dst_img_ptr;
+        }
+        uint8_t pixel_oct   = src_img_ptr[src_byte_idx];
+        uint8_t bit         = 128 >> bit_idx;
+        uint8_t masked      = pixel_oct & bit;
+        uint8_t palette_idx = masked > 0 ? 1 : 0;
+
+        if ( palette_offset + palette_idx * 4 + 2 >= record.sz ) {
+          free( record.data );
+          return dst_img_ptr;
+        }
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 2];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 1];
+        dst_img_ptr[dst_pixels_idx++] = palette_data_ptr[palette_idx * 4 + 0];
+        bit_idx++;
+      }
+      src_byte_idx += ( row_padding_sz + 1 ); // 1bpp is special here
+    }
+
+    // == 24-bpp -> 24-bit RGB == (but also should handle some other n_chans cases)
+  } else {
+    // NOTE(Anton) this only supports 1 byte per channel
+    if ( (uint64_t)file_hdr_ptr->image_data_offset + (uint64_t)height * width * n_dst_chans > record.sz ) {
+      free( record.data );
+      free( dst_img_ptr );
+      return NULL;
+    }
+    size_t src_byte_idx = 0;
+    for ( uint32_t r = 0; r < height; r++ ) {
+      size_t dst_pixels_idx = ( height - 1 - r ) * dst_stride_sz;
+      for ( uint32_t c = 0; c < width; c++ ) {
+        // re-orders from BGR to RGB
+        if ( n_dst_chans > 3 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 3]; }
+        if ( n_dst_chans > 2 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 2]; }
+        if ( n_dst_chans > 1 ) { dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx + 1]; }
+        dst_img_ptr[dst_pixels_idx++] = src_img_ptr[src_byte_idx];
+        src_byte_idx += n_src_chans;
+      }
+      src_byte_idx += row_padding_sz;
+    }
+  } // endif bpp
+
+  free( record.data );
+  return dst_img_ptr;
+}
+
+/* Releases pixel memory returned by apg_bmp_read(). Passing NULL is allowed and does nothing. */
+void apg_bmp_free( unsigned char* pixels_ptr ) {
+  free( pixels_ptr ); // free( NULL ) is defined to be a no-op, so no separate NULL guard is needed
+}
+
+/* Writes pixels as an uncompressed 24bpp ( n_chans 3 ) or 32bpp BI_BITFIELDS ( n_chans 4 ) BMP file.
+   Source rows are written out last-to-first and each output row is zero-padded to a multiple of 4 bytes.
+PARAMS
+  * pixels_ptr - tightly-packed RGB or RGBA bytes, abs(w)*abs(h)*n_chans of them. w and h are stored in the header as given.
+RETURNS
+  * 1 on success.
+  * 0 on a NULL argument, unsupported dimensions or channel count, image data too big for the 32-bit header size field,
+    failed allocation, or any I/O error - including one that is first reported when the file is closed. */
+unsigned int apg_bmp_write( const char* filename, unsigned char* pixels_ptr, int w, int h, unsigned int n_chans ) {
+  if ( !filename || !pixels_ptr ) { return 0; }
+  if ( 0 == w || 0 == h ) { return 0; }
+  if ( labs( w ) > _BMP_MAX_DIMS || labs( h ) > _BMP_MAX_DIMS ) { return 0; }
+  if ( n_chans != 3 && n_chans != 4 ) { return 0; }
+
+  uint32_t height = (uint32_t)labs( h );
+  uint32_t width  = (uint32_t)labs( w );
+  // work out if any padding how much to skip at end of each row
+  const size_t unpadded_row_sz = width * n_chans;
+  const size_t row_padding_sz  = 0 == unpadded_row_sz % 4 ? 0 : 4 - unpadded_row_sz % 4;
+  const size_t row_sz          = unpadded_row_sz + row_padding_sz;
+  const size_t dib_hdr_sz      = sizeof( _bmp_dib_BITMAPINFOHEADER_t );
+  const uint32_t hdrs_sz       = _BMP_FILE_HDR_SZ + (uint32_t)dib_hdr_sz;
+  // file_sz in the file header is 32 bits wide: refuse images that do not fit rather than record a truncated size.
+  // doing the product in 64 bits also stops the allocation size from wrapping where size_t is 32 bits wide.
+  if ( (uint64_t)row_sz * height > (uint64_t)UINT32_MAX - hdrs_sz ) { return 0; }
+  const size_t dst_pixels_padded_sz = row_sz * height;
+
+  _bmp_file_header_t file_hdr;
+  {
+    file_hdr.file_type[0]      = 'B';
+    file_hdr.file_type[1]      = 'M';
+    file_hdr.file_sz           = hdrs_sz + (uint32_t)dst_pixels_padded_sz;
+    file_hdr.reserved1         = 0;
+    file_hdr.reserved2         = 0;
+    file_hdr.image_data_offset = hdrs_sz;
+  }
+  _bmp_dib_BITMAPINFOHEADER_t dib_hdr;
+  {
+    dib_hdr.this_header_sz         = _BMP_MIN_DIB_HDR_SZ; // NOTE: must be 40 and not include the bitmask memory in size here
+    dib_hdr.w                      = w;
+    dib_hdr.h                      = h;
+    dib_hdr.n_planes               = 1;
+    dib_hdr.bpp                    = 3 == n_chans ? 24 : 32;
+    dib_hdr.compression_method     = 3 == n_chans ? BI_RGB : BI_BITFIELDS;
+    dib_hdr.image_uncompressed_sz  = 0;
+    dib_hdr.horiz_pixels_per_meter = 0;
+    dib_hdr.vert_pixels_per_meter  = 0;
+    dib_hdr.n_colours_in_palette   = 0;
+    dib_hdr.n_important_colours    = 0;
+    // big-endian masks. only used in BI_BITFIELDS and BI_ALPHABITFIELDS ( 16 and 32-bit images )
+    // important note: GIMP stores BMP data in this array order for 32-bit: [A][B][G][R]
+    dib_hdr.bitmask_r = 0xFF000000;
+    dib_hdr.bitmask_g = 0x00FF0000;
+    dib_hdr.bitmask_b = 0x0000FF00;
+  }
+
+  uint8_t* dst_pixels_ptr = malloc( dst_pixels_padded_sz );
+  if ( !dst_pixels_ptr ) { return 0; }
+  {
+    size_t dst_byte_idx = 0;
+    uint8_t padding[4]  = {0, 0, 0, 0};
+    uint8_t rgba[4]     = {0, 0, 0, 0};
+    uint8_t bgra[4]     = {0, 0, 0, 0};
+
+    // the source rows are consumed last-to-first. each pixel has its channels reordered before it is copied out
+    for ( uint32_t row = 0; row < height; row++ ) {
+      size_t src_byte_idx = (size_t)( height - 1 - row ) * n_chans * width;
+      for ( uint32_t col = 0; col < width; col++ ) {
+        for ( uint32_t chan = 0; chan < n_chans; chan++ ) { rgba[chan] = pixels_ptr[src_byte_idx++]; }
+        if ( 3 == n_chans ) {
+          bgra[0] = rgba[2];
+          bgra[1] = rgba[1];
+          bgra[2] = rgba[0];
+        } else {
+          /* NOTE(Anton) RGBA with alpha channel would be better supported with an extended DIB header */
+          bgra[0] = rgba[3]; // alpha. first byte of the pixel - the bits not covered by any of the three colour masks
+          bgra[1] = rgba[2]; // blue
+          bgra[2] = rgba[1]; // green
+          bgra[3] = rgba[0]; // red
+        }
+        memcpy( &dst_pixels_ptr[dst_byte_idx], bgra, n_chans );
+        dst_byte_idx += (size_t)n_chans;
+      } // endfor col
+      if ( row_padding_sz > 0 ) {
+        memcpy( &dst_pixels_ptr[dst_byte_idx], padding, row_padding_sz );
+        dst_byte_idx += row_padding_sz;
+      }
+    } // endfor row
+  }
+
+  // write both headers and then the pixel block. the && chain stops at the first write that fails
+  bool ok  = false;
+  FILE* fp = fopen( filename, "wb" );
+  if ( fp ) {
+    ok = 1 == fwrite( &file_hdr, _BMP_FILE_HDR_SZ, 1, fp ) && 1 == fwrite( &dib_hdr, dib_hdr_sz, 1, fp ) &&
+         1 == fwrite( dst_pixels_ptr, dst_pixels_padded_sz, 1, fp );
+    // buffered data is flushed by fclose(), so a failure there ( e.g. a full disk ) means the file is incomplete
+    if ( 0 != fclose( fp ) ) { ok = false; }
+  }
+  free( dst_pixels_ptr );
+
+  return ok ? 1 : 0;
+}
diff --git a/thirdparty/basis_universal/encoder/apg_bmp.h b/thirdparty/basis_universal/encoder/apg_bmp.h
new file mode 100644
index 0000000000..8cd73b62e0
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/apg_bmp.h
@@ -0,0 +1,123 @@
+/*
+BMP File Reader/Writer Implementation
+Anton Gerdelan
+Version: 3.1 18 March 2020.
+Licence: see bottom of file.
+C89 ( Implementation is C99 )
+
+Contributors:
+- Anton Gerdelan - Initial code.
+- Saija Sorsa    - Fuzz testing.
+
+Instructions:
+- Just drop this header, and the matching .c file into your project.
+- To get debug printouts during parsing define APG_BMP_DEBUG_OUTPUT.
+
+Advantages:
+- The implementation is fast, simple, and supports more formats than most BMP reader libraries.
+- The reader function is fuzzed with AFL https://lcamtuf.coredump.cx/afl/.
+- The reader is robust to large files and malformed files, and will return any valid partial data in an image.
+- Reader supports 32bpp (with alpha channel), 24bpp, 8bpp, 4bpp, and 1bpp monochrome BMP images.
+- Reader handles indexed BMP images using a colour palette.
+- Writer supports 32bpp RGBA and 24bpp uncompressed RGB images.
+
+Current Limitations:
+- 16-bit images not supported (don't have any samples to test on).
+- No support for interleaved channel bit layouts eg RGB101010 RGB555 RGB565.
+- No support for compressed BMP images, although in practice these are not used.
+- Output images with alpha channel are written in BITMAPINFOHEADER format.
+  For better alpha support in other apps the 124-byte v5 header could be used instead,
+	at the cost of some backward compatibility and bloat.
+
+To Do:
+- FUZZING
+  - create a unique fuzz test set for (8,4,1 BPP).
+- (maybe) FEATURE Flipping the image based on negative width and height in header, and/or function arguments. 
+- (maybe) PERF ifdef intrinsics/asm for bitscan. Platform-specific code so won't include unless necessary.
+- (maybe) FEATURE Add parameter for padding output memory to eg 4-byte alignment or n channels.
+- (maybe) FEATURE Improved apps support in alpha channel writing (using v5 header).
+*/
+
+#ifndef APG_BMP_H_
+#define APG_BMP_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* CPP */
+
+/* Reads a bitmap from a file, allocates memory for the raw image data, and returns it.
+PARAMS
+  * w,h,     - Retrieves the width and height of the BMP in pixels.
+  * n_chans  - Retrieves the number of channels in the BMP.
+RETURNS
+  * Tightly-packed pixel memory in RGB or RGBA order, as reported by n_chans. The caller must call apg_bmp_free() or free() on the memory.
+  * NULL on any error. Any allocated memory is freed before returning NULL. */
+unsigned char* apg_bmp_read( const char* filename, int* w, int* h, unsigned int* n_chans );
+
+/* Calls free() on memory created by apg_bmp_read */
+void apg_bmp_free( unsigned char* pixels_ptr );
+
+/* Writes a bitmap to a file.
+PARAMS
+  * filename   - e.g."my_bitmap.bmp". Must not be NULL.
+  * pixels_ptr - Pointer to tightly-packed pixel memory in RGBA order. Must not be NULL. There must be abs(w)*abs(h)*n_chans bytes in the memory pointed to.
+  * w,h,       - Width and height of the image in pixels.
+  * n_chans    - The number of channels in the BMP. 3 or 4 supported for writing, which means RGB or RGBA memory, respectively.
+RETURNS
+  * Zero on any error, non zero on success. */
+unsigned int apg_bmp_write( const char* filename, unsigned char* pixels_ptr, int w, int h, unsigned int n_chans );
+
+#ifdef __cplusplus
+}
+#endif /* CPP */
+
+#endif /* APG_BMP_H_ */
+
+/*
+-------------------------------------------------------------------------------------
+This software is available under two licences - you may use it under either licence.
+-------------------------------------------------------------------------------------
+FIRST LICENCE OPTION
+
+>                                  Apache License
+>                            Version 2.0, January 2004
+>                         http://www.apache.org/licenses/
+>    Copyright 2019 Anton Gerdelan.
+>    Licensed under the Apache License, Version 2.0 (the "License");
+>    you may not use this file except in compliance with the License.
+>    You may obtain a copy of the License at
+>        http://www.apache.org/licenses/LICENSE-2.0
+>    Unless required by applicable law or agreed to in writing, software
+>    distributed under the License is distributed on an "AS IS" BASIS,
+>    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+>    See the License for the specific language governing permissions and
+>    limitations under the License.
+-------------------------------------------------------------------------------------
+SECOND LICENCE OPTION
+
+> This is free and unencumbered software released into the public domain.
+>
+> Anyone is free to copy, modify, publish, use, compile, sell, or
+> distribute this software, either in source code form or as a compiled
+> binary, for any purpose, commercial or non-commercial, and by any
+> means.
+>
+> In jurisdictions that recognize copyright laws, the author or authors
+> of this software dedicate any and all copyright interest in the
+> software to the public domain. We make this dedication for the benefit
+> of the public at large and to the detriment of our heirs and
+> successors. We intend this dedication to be an overt act of
+> relinquishment in perpetuity of all present and future rights to this
+> software under copyright law.
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+> IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+> OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+> ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+> OTHER DEALINGS IN THE SOFTWARE.
+>
+> For more information, please refer to <http://unlicense.org>
+-------------------------------------------------------------------------------------
+*/
diff --git a/thirdparty/basis_universal/encoder/basisu_astc_decomp.cpp b/thirdparty/basis_universal/encoder/basisu_astc_decomp.cpp
new file mode 100644
index 0000000000..53bccfc515
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_astc_decomp.cpp
@@ -0,0 +1,1561 @@
+// basisu_astc_decomp.cpp: Only used for ASTC decompression, to validate the transcoder's output.
+// This version does not support HDR.
+
+/*-------------------------------------------------------------------------
+ * drawElements Quality Program Tester Core
+ * ----------------------------------------
+ *
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * rg: Removed external dependencies, commented out HDR support because
+ * we don't need it, and made a minor fix to decompress() so it converts
+ * non-sRGB output to 8 bits correctly. I've compared this decoder's output
+ * vs. astc-codec with random inputs on 4x4 blocks; after fixing a few obvious
+ * bugs in astc-codec where it didn't correctly follow the spec, they match, so
+ * I'm assuming they are both correct for 4x4 now.
+ * HDR support should be easy to add back in, but as we don't need it
+ * I'm leaving this for someone else.
+ * 
+ *//*!
+ * \file
+ * \brief ASTC Utilities.
+ *//*--------------------------------------------------------------------*/
+#include "basisu_astc_decomp.h"
+#include <assert.h>
+#include <algorithm>
+
+#define DE_LENGTH_OF_ARRAY(x) (sizeof(x)/sizeof((x)[0])) // element count of a true array (not a pointer); the argument is parenthesised so expression arguments expand safely
+#define DE_UNREF(x) ((void)(x)) // marks x as intentionally unused; fully parenthesised so it stays a single expression for any argument
+
+typedef uint8_t deUint8; // aliases for the de* fixed-width integer names used throughout this file
+typedef int8_t deInt8;
+typedef uint32_t deUint32;
+typedef int32_t deInt32;
+typedef uint16_t deUint16;
+typedef int16_t deInt16;
+typedef int64_t deInt64;
+typedef uint64_t deUint64;
+
+#define DE_ASSERT assert // DE_ASSERT(x) expands to the standard assert(x)
+
+#ifdef _MSC_VER
+#pragma warning (disable:4505) // unreferenced local function has been removed
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push // saves the diagnostic state; NOTE(review): the matching pop is not visible in this part of the file
+#pragma GCC diagnostic ignored "-Wunused-function" // GCC-style counterpart of the MSVC C4505 suppression above
+#endif
+
+namespace basisu_astc
+{
+	static bool inBounds(int v, int l, int h) // true when v lies in the half-open interval [l, h)
+	{
+		return !(v < l || v >= h);
+	}
+
+	static bool inRange(int v, int l, int h) // true when v lies in the closed interval [l, h]
+	{
+		return !(v < l || v > h);
+	}
+
+	template<typename T> // larger of the two arguments; b when they compare equal
+	static inline T max(T a, T b)
+	{
+		if (a > b) return a; else return b;
+	}
+
+	template<typename T> // smaller of the two arguments; b when they compare equal
+	static inline T min(T a, T b)
+	{
+		if (a < b) return a; else return b;
+	}
+
+	// Limits a to [l, h]: values below l yield l, values above h yield h.
+	// The lower bound is tested first.
+	template<typename T>
+	static inline T clamp(T a, T l, T h)
+	{
+		const T upperLimited = (a > h) ? h : a;
+		return (a < l) ? l : upperLimited;
+	}
+
+
+	// Four unsigned 32-bit components, addressable by name (x/y/z/w) or index.
+	struct UVec4
+	{
+		uint32_t m_c[4];	// stored in x, y, z, w order
+
+		// Default construction zeroes every component.
+		UVec4()
+		{
+			for (int i = 0; i < 4; i++)
+				m_c[i] = 0;
+		}
+
+		UVec4(uint32_t x, uint32_t y, uint32_t z, uint32_t w)
+		{
+			m_c[0] = x;	m_c[1] = y;
+			m_c[2] = z;	m_c[3] = w;
+		}
+
+		// Named accessors: by value on const objects, by reference otherwise.
+		uint32_t x() const { return m_c[0]; }
+		uint32_t y() const { return m_c[1]; }
+		uint32_t z() const { return m_c[2]; }
+		uint32_t w() const { return m_c[3]; }
+
+		uint32_t& x() { return m_c[0]; }
+		uint32_t& y() { return m_c[1]; }
+		uint32_t& z() { return m_c[2]; }
+		uint32_t& w() { return m_c[3]; }
+
+		// Indexed access; idx must be below 4 (checked with assert).
+		uint32_t operator[] (uint32_t idx) const { assert(idx < 4);  return m_c[idx]; }
+		uint32_t& operator[] (uint32_t idx) { assert(idx < 4);  return m_c[idx]; }
+	};
+
+	// Four signed 32-bit components, addressable by name (x/y/z/w) or index.
+	struct IVec4
+	{
+		int32_t m_c[4];	// stored in x, y, z, w order
+
+		IVec4()
+		{
+			for (int i = 0; i < 4; i++)
+				m_c[i] = 0;
+		}
+
+		IVec4(int32_t x, int32_t y, int32_t z, int32_t w)
+		{
+			m_c[0] = x;	m_c[1] = y;
+			m_c[2] = z;	m_c[3] = w;
+		}
+
+		int32_t x() const { return m_c[0]; }
+		int32_t y() const { return m_c[1]; }
+		int32_t z() const { return m_c[2]; }
+		int32_t w() const { return m_c[3]; }
+		int32_t& x() { return m_c[0]; }
+		int32_t& y() { return m_c[1]; }
+		int32_t& z() { return m_c[2]; }
+		int32_t& w() { return m_c[3]; }
+
+		// Unsigned copy in which negative components are replaced by zero.
+		UVec4 asUint() const
+		{
+			UVec4 clamped;
+			for (int i = 0; i < 4; i++)
+				clamped.m_c[i] = basisu::maximum(0, m_c[i]);
+			return clamped;
+		}
+
+		int32_t operator[] (uint32_t idx) const { assert(idx < 4);  return m_c[idx]; }
+		int32_t& operator[] (uint32_t idx) { assert(idx < 4);  return m_c[idx]; }
+	};
+
+	// Three signed 32-bit components, addressable by name (x/y/z) or index.
+	struct IVec3
+	{
+		int32_t m_c[3];	// stored in x, y, z order
+
+		IVec3()
+		{
+			for (int i = 0; i < 3; i++)
+				m_c[i] = 0;
+		}
+
+		IVec3(int32_t x, int32_t y, int32_t z)
+		{
+			m_c[0] = x;
+			m_c[1] = y;
+			m_c[2] = z;
+		}
+
+		int32_t x() const { return m_c[0]; }
+		int32_t y() const { return m_c[1]; }
+		int32_t z() const { return m_c[2]; }
+
+		int32_t& x() { return m_c[0]; }
+		int32_t& y() { return m_c[1]; }
+		int32_t& z() { return m_c[2]; }
+		// Indexed access; idx must be below 3 (checked with assert).
+		int32_t operator[] (uint32_t idx) const { assert(idx < 3);  return m_c[idx]; }
+		int32_t& operator[] (uint32_t idx) { assert(idx < 3);  return m_c[idx]; }
+	};
+
+	static uint32_t deDivRoundUp32(uint32_t a, uint32_t b) // ceil(a / b) for b != 0; avoids forming a + b - 1, which wraps around for large a
+	{
+		return a / b + ((a % b) != 0 ? 1u : 0u);
+	}
+
+	static bool deInBounds32(uint32_t v, uint32_t l, uint32_t h) // unsigned variant: true when v lies in [l, h)
+	{
+		return !(v < l || v >= h);
+	}
+
+namespace astc
+{
+using std::vector;
+namespace
+{
+// Common utilities. The unnamed enum below provides compile-time integer constants.
+enum
+{
+	MAX_BLOCK_WIDTH		= 12,	// upper bound on block width -- presumably the ASTC 2D maximum, in texels; TODO confirm
+	MAX_BLOCK_HEIGHT	= 12	// upper bound on block height (same caveat)
+};
+inline deUint32 getBit (deUint32 src, int ndx) // value (0 or 1) of bit ndx of src; ndx must be in [0, 32)
+{
+	DE_ASSERT(basisu_astc::inBounds(ndx, 0, 32));
+	return (src & ((deUint32)1 << ndx)) != 0 ? 1u : 0u;
+}
+// Extracts bits [low, high] (inclusive) of src, right-aligned; the field must be 1..32 bits wide.
+inline deUint32 getBits (deUint32 src, int low, int high)
+{
+	const int width = high - low + 1;
+	DE_ASSERT(basisu_astc::inRange(width, 1, 32));
+	// A full-width mask is special-cased because shifting 1u by 32 is undefined.
+	const deUint32 mask = (width < 32) ? ((1u << width) - 1) : 0xFFFFFFFFu;
+	return (deUint32)((src >> low) & mask);
+}
+inline bool isBitSet (deUint32 src, int ndx) // boolean form of getBit()
+{
+	return getBit(src, ndx) ? true : false;
+}
+inline deUint32 reverseBits (deUint32 src, int numBits) // mirrors the low numBits bits of src; higher bits are dropped
+{
+	DE_ASSERT(basisu_astc::inRange(numBits, 0, 32));
+	deUint32 mirrored = 0;
+	for (int srcBit = 0; srcBit < numBits; srcBit++)
+		mirrored = (mirrored << 1) | ((src >> srcBit) & 1u);
+	return mirrored;
+}
+inline deUint32 bitReplicationScale (deUint32 src, int numSrcBits, int numDstBits) // widens src to numDstBits by repeating its bit pattern from the top down
+{
+	DE_ASSERT(numSrcBits <= numDstBits);
+	DE_ASSERT((src & ((1<<numSrcBits)-1)) == src);
+	deUint32 scaled = 0;
+	for (int pos = numDstBits-numSrcBits; pos > -numSrcBits; pos -= numSrcBits)
+		scaled |= (pos < 0) ? (src >> -pos) : (src << pos);
+	return scaled;
+}
+
+inline deInt32 signExtend (deInt32 src, int numSrcBits) // sign-extends a numSrcBits-wide two's-complement field to 32 bits
+{
+	DE_ASSERT(basisu_astc::inRange(numSrcBits, 2, 31));
+	const deInt32 signBit = 1 << (numSrcBits-1);
+	return (src & signBit) != 0 ? (src | ~((1 << numSrcBits) - 1)) : src;
+}
+
+//inline bool isFloat16InfOrNan (deFloat16 v)
+//{
+//	return getBits(v, 10, 14) == 31;
+//}
+
+enum ISEMode // how the values of an ISE-coded sequence are packed
+{
+	ISEMODE_TRIT = 0,	// each value carries a trit (0..2) plus numBits plain bits
+	ISEMODE_QUINT,		// each value carries a quint (0..4) plus numBits plain bits
+	ISEMODE_PLAIN_BIT,	// each value is numBits plain bits only
+	ISEMODE_LAST		// sentinel; also the placeholder mode set by ASTCBlockMode's constructor
+};
+struct ISEParams // an ISE configuration: packing mode plus plain bits per value
+{
+	ISEMode		mode;
+	int			numBits;	// plain (non-trit, non-quint) bits stored per value
+	ISEParams (ISEMode mode_, int numBits_) : mode(mode_), numBits(numBits_) {}
+};
+inline int computeNumRequiredBits (const ISEParams& iseParams, int numValues) // bits needed to ISE-encode numValues values; -1 for an unknown mode
+{
+	const int plainBits = numValues*iseParams.numBits;
+	if (iseParams.mode == ISEMODE_TRIT)
+		return deDivRoundUp32(numValues*8, 5) + plainBits;	// 5 trits share 8 bits
+	if (iseParams.mode == ISEMODE_QUINT)
+		return deDivRoundUp32(numValues*7, 3) + plainBits;	// 3 quints share 7 bits
+	if (iseParams.mode == ISEMODE_PLAIN_BIT)
+		return plainBits;
+	DE_ASSERT(false);
+	return -1;
+}
+ISEParams computeMaximumRangeISEParams (int numAvailableBits, int numValuesInSequence) // picks the ISE params with the largest value range whose encoding of numValuesInSequence values fits in numAvailableBits
+{
+	int curBitsForTritMode		= 6;	// per-mode candidate bit counts, each starting at the highest value tried
+	int curBitsForQuintMode		= 5;
+	int curBitsForPlainBitMode	= 8;
+	while (true) // each pass tests the widest remaining candidate and, if it does not fit, narrows that mode by one bit
+	{
+		DE_ASSERT(curBitsForTritMode > 0 || curBitsForQuintMode > 0 || curBitsForPlainBitMode > 0); // NOTE(review): with asserts disabled the loop would carry on with non-positive bit counts
+		const int tritRange			= curBitsForTritMode > 0		? (3 << curBitsForTritMode) - 1			: -1; // largest value representable, or -1 once the mode is exhausted
+		const int quintRange		= curBitsForQuintMode > 0		? (5 << curBitsForQuintMode) - 1		: -1;
+		const int plainBitRange		= curBitsForPlainBitMode > 0	? (1 << curBitsForPlainBitMode) - 1		: -1;
+		const int maxRange			= basisu_astc::max(basisu_astc::max(tritRange, quintRange), plainBitRange);
+		if (maxRange == tritRange) // on equal ranges the trit candidate is examined first, then quint, then plain bits
+		{
+			const ISEParams params(ISEMODE_TRIT, curBitsForTritMode);
+			if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+				return ISEParams(ISEMODE_TRIT, curBitsForTritMode);
+			curBitsForTritMode--;
+		}
+		else if (maxRange == quintRange)
+		{
+			const ISEParams params(ISEMODE_QUINT, curBitsForQuintMode);
+			if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+				return ISEParams(ISEMODE_QUINT, curBitsForQuintMode);
+			curBitsForQuintMode--;
+		}
+		else
+		{
+			const ISEParams params(ISEMODE_PLAIN_BIT, curBitsForPlainBitMode);
+			DE_ASSERT(maxRange == plainBitRange);
+			if (computeNumRequiredBits(params, numValuesInSequence) <= numAvailableBits)
+				return ISEParams(ISEMODE_PLAIN_BIT, curBitsForPlainBitMode);
+			curBitsForPlainBitMode--;
+		}
+	}
+}
+inline int computeNumColorEndpointValues (deUint32 endpointMode) // 2, 4, 6 or 8 values for modes 0-3, 4-7, 8-11 and 12-15
+{
+	DE_ASSERT(endpointMode < 16);
+	return (int)(2 * ((endpointMode >> 2) + 1));
+}
+// Decompression utilities
+enum DecompressResult // outcome reported by the block decoding routines
+{
+	DECOMPRESS_RESULT_VALID_BLOCK	= 0,	//!< Decompressed valid block
+	DECOMPRESS_RESULT_ERROR,				//!< Encountered error while decompressing, error color written
+	DECOMPRESS_RESULT_LAST					//!< Sentinel: number of result codes
+};
+// A helper for getting bits from a 128-bit block. Bit 0 is the least significant bit of the first source byte.
+class Block128
+{
+private:
+	typedef deUint64 Word;
+	enum
+	{
+		WORD_BYTES	= sizeof(Word),
+		WORD_BITS	= 8*WORD_BYTES,
+		NUM_WORDS	= 128 / WORD_BITS
+	};
+	//DE_STATIC_ASSERT(128 % WORD_BITS == 0);
+public:
+	Block128 (const deUint8* src) // reads 16 bytes from src, assembling each word from consecutive bytes, lowest byte first
+	{
+		for (int wordNdx = 0; wordNdx < NUM_WORDS; wordNdx++)
+		{
+			m_words[wordNdx] = 0;
+			for (int byteNdx = 0; byteNdx < WORD_BYTES; byteNdx++)
+				m_words[wordNdx] |= (Word)src[wordNdx*WORD_BYTES + byteNdx] << (8*byteNdx);
+		}
+	}
+	deUint32 getBit (int ndx) const // single bit at index ndx, which must be in [0, 128)
+	{
+		DE_ASSERT(basisu_astc::inBounds(ndx, 0, 128));
+		return (m_words[ndx / WORD_BITS] >> (ndx % WORD_BITS)) & 1;
+	}
+	deUint32 getBits (int low, int high) const // bits [low, high] inclusive, at most 32 of them, right-aligned; an empty range (high == low-1) yields 0
+	{
+		DE_ASSERT(basisu_astc::inBounds(low, 0, 128));
+		DE_ASSERT(basisu_astc::inBounds(high, 0, 128));
+		DE_ASSERT(basisu_astc::inRange(high-low+1, 0, 32));
+		if (high-low+1 == 0)
+			return 0;
+		const int word0Ndx = low / WORD_BITS;
+		const int word1Ndx = high / WORD_BITS;
+		// \note "foo << bar << 1" done instead of "foo << (bar+1)" to avoid overflow, i.e. shift amount being too big.
+		if (word0Ndx == word1Ndx) // whole field lies in one word: mask off everything above 'high', then shift down to 'low'
+			return (deUint32)((m_words[word0Ndx] & ((((Word)1 << high%WORD_BITS << 1) - 1))) >> ((Word)low % WORD_BITS));
+		else // field straddles two words: low part from word0, high part from word1 shifted up past the bits taken from word0
+		{
+			DE_ASSERT(word1Ndx == word0Ndx + 1);
+			return (deUint32)(m_words[word0Ndx] >> (low%WORD_BITS)) |
+				   (deUint32)((m_words[word1Ndx] & (((Word)1 << high%WORD_BITS << 1) - 1)) << (high-low - high%WORD_BITS));
+		}
+	}
+	bool isBitSet (int ndx) const // boolean form of getBit()
+	{
+		DE_ASSERT(basisu_astc::inBounds(ndx, 0, 128));
+		return getBit(ndx) != 0;
+	}
+private:
+	Word m_words[NUM_WORDS]; // the 128 bits, lowest-numbered bits in m_words[0]
+};
+// Sequential reader over a window of a Block128. The window starts at bit
+// startNdxInSrc, is length bits long, and runs towards higher bit indices
+// when forward is true or towards lower ones (with bit order mirrored) otherwise.
+class BitAccessStream
+{
+public:
+	BitAccessStream (const Block128& src, int startNdxInSrc, int length, bool forward)
+		: m_src(src), m_startNdxInSrc(startNdxInSrc), m_length(length), m_forward(forward), m_ndx(0)
+	{
+	}
+	// Returns the next num bits of the window and advances by num. Positions at or
+	// beyond m_length read as zero, so a request running off the end is zero-padded.
+	deUint32 getNext (int num)
+	{
+		if (num == 0 || m_ndx >= m_length)
+			return 0;
+		// Only bits that lie inside the window are fetched from the source.
+		const int first	= m_ndx;
+		const int count	= basisu_astc::max(0, basisu_astc::min(m_length, first + num) - first);
+		const int last	= first + count - 1;
+		m_ndx += num;
+		if (m_forward)
+			return m_src.getBits(m_startNdxInSrc + first, m_startNdxInSrc + last);
+		return reverseBits(m_src.getBits(m_startNdxInSrc - last, m_startNdxInSrc - first), count);
+	}
+private:
+	const Block128&		m_src;
+	const int			m_startNdxInSrc;
+	const int			m_length;
+	const bool			m_forward;
+	int					m_ndx;
+};
+struct ISEDecodedResult // one decoded ISE value, split into its parts
+{
+	deUint32 m; // plain bits of the value
+	deUint32 tq; //!< Trit or quint value, depending on ISE mode; not written by decodeISEBitBlock().
+	deUint32 v; // combined value: (tq << numBits) + m in trit/quint mode, m in plain-bit mode
+};
+// Data from an ASTC block's "block mode" part (i.e. bits [0,10]).
+struct ASTCBlockMode
+{
+	bool		isError;	// true for reserved encodings; see getASTCBlockMode()
+	// \note Following fields only relevant if !isError.
+	bool		isVoidExtent;
+	// \note Following fields only relevant if !isVoidExtent.
+	bool		isDualPlane;
+	int			weightGridWidth;
+	int			weightGridHeight;
+	ISEParams	weightISEParams;	// ISE mode and bit count used for the weight data
+	ASTCBlockMode (void) // starts out flagged as an error, with placeholder values in every other field
+		: isError			(true)
+		, isVoidExtent		(true)
+		, isDualPlane		(true)
+		, weightGridWidth	(-1)
+		, weightGridHeight	(-1)
+		, weightISEParams	(ISEMODE_LAST, -1)
+	{
+	}
+};
+inline int computeNumWeights (const ASTCBlockMode& mode) // weight grid cells, doubled when the block is dual-plane
+{
+	return (mode.isDualPlane ? 2 : 1) * (mode.weightGridWidth * mode.weightGridHeight);
+}
+struct ColorEndpointPair // two endpoint colours; presumably one pair per partition -- usage is outside this section, TODO confirm
+{
+	UVec4 e0;
+	UVec4 e1;
+};
+struct TexelWeightPair // two weights for one texel; presumably w[1] belongs to the second plane of dual-plane blocks -- TODO confirm
+{
+	deUint32 w[2];
+};
+ASTCBlockMode getASTCBlockMode (deUint32 blockModeData) // parses the block-mode bits (bits 0..10 of blockModeData) into an ASTCBlockMode; isError stays true for reserved encodings
+{
+	ASTCBlockMode blockMode;
+	blockMode.isError = true; // \note Set to false later, if not error.
+	blockMode.isVoidExtent = getBits(blockModeData, 0, 8) == 0x1fc; // low nine bits 1_1111_1100 mark a void-extent block; the other fields then keep their constructor values
+	if (!blockMode.isVoidExtent)
+	{
+		if ((getBits(blockModeData, 0, 1) == 0 && getBits(blockModeData, 6, 8) == 7) || getBits(blockModeData, 0, 3) == 0)
+			return blockMode; // Invalid ("reserved").
+		deUint32 r = (deUint32)-1; // \note Set in the following branches.
+		if (getBits(blockModeData, 0, 1) == 0) // layout A: r is built from bits 4, 2, 3 and bits 7..8 select the grid shape
+		{
+			const deUint32 r0	= getBit(blockModeData, 4);
+			const deUint32 r1	= getBit(blockModeData, 2);
+			const deUint32 r2	= getBit(blockModeData, 3);
+			const deUint32 i78	= getBits(blockModeData, 7, 8);
+			r = (r2 << 2) | (r1 << 1) | (r0 << 0);
+			if (i78 == 3)
+			{
+				const bool i5 = isBitSet(blockModeData, 5);
+				blockMode.weightGridWidth	= i5 ? 10 : 6;
+				blockMode.weightGridHeight	= i5 ? 6  : 10;
+			}
+			else
+			{
+				const deUint32 a = getBits(blockModeData, 5, 6);
+				switch (i78)
+				{
+					case 0:		blockMode.weightGridWidth = 12;		blockMode.weightGridHeight = a + 2;									break;
+					case 1:		blockMode.weightGridWidth = a + 2;	blockMode.weightGridHeight = 12;									break;
+					case 2:		blockMode.weightGridWidth = a + 6;	blockMode.weightGridHeight = getBits(blockModeData, 9, 10) + 6;		break;
+					default: DE_ASSERT(false);
+				}
+			}
+		}
+		else // layout B: r is built from bits 4, 0, 1 and bits 2..3 select the grid shape
+		{
+			const deUint32 r0	= getBit(blockModeData, 4);
+			const deUint32 r1	= getBit(blockModeData, 0);
+			const deUint32 r2	= getBit(blockModeData, 1);
+			const deUint32 i23	= getBits(blockModeData, 2, 3);
+			const deUint32 a	= getBits(blockModeData, 5, 6);
+			r = (r2 << 2) | (r1 << 1) | (r0 << 0);
+			if (i23 == 3)
+			{
+				const deUint32	b	= getBit(blockModeData, 7);
+				const bool		i8	= isBitSet(blockModeData, 8);
+				blockMode.weightGridWidth	= i8 ? b+2 : a+2;
+				blockMode.weightGridHeight	= i8 ? a+2 : b+6;
+			}
+			else
+			{
+				const deUint32 b = getBits(blockModeData, 7, 8);
+				switch (i23)
+				{
+					case 0:		blockMode.weightGridWidth = b + 4;	blockMode.weightGridHeight = a + 2;	break;
+					case 1:		blockMode.weightGridWidth = b + 8;	blockMode.weightGridHeight = a + 2;	break;
+					case 2:		blockMode.weightGridWidth = a + 2;	blockMode.weightGridHeight = b + 8;	break;
+					default: DE_ASSERT(false);
+				}
+			}
+		}
+		const bool	zeroDH		= getBits(blockModeData, 0, 1) == 0 && getBits(blockModeData, 7, 8) == 2; // this one layout spends bits 9..10 on the grid height, so h and dual-plane are forced to 0
+		const bool	h			= zeroDH ? 0 : isBitSet(blockModeData, 9); // selects between the two weight-range tables below
+		blockMode.isDualPlane	= zeroDH ? 0 : isBitSet(blockModeData, 10);
+		{ // map (h, r) to the weight ISE mode and bit count; r >= 2 here because the reserved check rejected bits 0..3 == 0
+			ISEMode&	m	= blockMode.weightISEParams.mode;
+			int&		b	= blockMode.weightISEParams.numBits;
+			m = ISEMODE_PLAIN_BIT;
+			b = 0;
+			if (h)
+			{
+				switch (r)
+				{
+					case 2:							m = ISEMODE_QUINT;	b = 1;	break;
+					case 3:		m = ISEMODE_TRIT;						b = 2;	break;
+					case 4:												b = 4;	break;
+					case 5:							m = ISEMODE_QUINT;	b = 2;	break;
+					case 6:		m = ISEMODE_TRIT;						b = 3;	break;
+					case 7:												b = 5;	break;
+					default:	DE_ASSERT(false);
+				}
+			}
+			else
+			{
+				switch (r)
+				{
+					case 2:												b = 1;	break;
+					case 3:		m = ISEMODE_TRIT;								break;
+					case 4:												b = 2;	break;
+					case 5:							m = ISEMODE_QUINT;			break;
+					case 6:		m = ISEMODE_TRIT;						b = 1;	break;
+					case 7:												b = 3;	break;
+					default:	DE_ASSERT(false);
+				}
+			}
+		}
+	}
+	blockMode.isError = false;
+	return blockMode;
+}
+inline void setASTCErrorColorBlock (void* dst, int blockWidth, int blockHeight, bool isSRGB)
+{
+	if (isSRGB)
+	{
+		deUint8* const dstU = (deUint8*)dst;
+		for (int i = 0; i < blockWidth*blockHeight; i++)
+		{
+			dstU[4*i + 0] = 0xff;
+			dstU[4*i + 1] = 0;
+			dstU[4*i + 2] = 0xff;
+			dstU[4*i + 3] = 0xff;
+		}
+	}
+	else
+	{
+		float* const dstF = (float*)dst;
+		for (int i = 0; i < blockWidth*blockHeight; i++)
+		{
+			dstF[4*i + 0] = 1.0f;
+			dstF[4*i + 1] = 0.0f;
+			dstF[4*i + 2] = 1.0f;
+			dstF[4*i + 3] = 1.0f;
+		}
+	}
+}
+// Decodes a void-extent (single colour) block into the blockWidth x blockHeight RGBA texels at dst: 8-bit channels
+// when isSRGB is set, float channels otherwise. An HDR block is an error when isLDRMode is set, and also on the
+// float path because HDR decoding is compiled out; inconsistent extents are errors too (error colour is written).
+DecompressResult decodeVoidExtentBlock (void* dst, const Block128& blockData, int blockWidth, int blockHeight, bool isSRGB, bool isLDRMode)
+{
+	const deUint32	minSExtent			= blockData.getBits(12, 24);
+	const deUint32	maxSExtent			= blockData.getBits(25, 37);
+	const deUint32	minTExtent			= blockData.getBits(38, 50);
+	const deUint32	maxTExtent			= blockData.getBits(51, 63);
+	const bool		allExtentsAllOnes	= minSExtent == 0x1fff && maxSExtent == 0x1fff && minTExtent == 0x1fff && maxTExtent == 0x1fff;
+	const bool		isHDRBlock			= blockData.isBitSet(9);
+	if ((isLDRMode && isHDRBlock) || (!allExtentsAllOnes && (minSExtent >= maxSExtent || minTExtent >= maxTExtent)))
+	{
+		setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+		return DECOMPRESS_RESULT_ERROR;
+	}
+	const deUint32 rgba[4] = { blockData.getBits(64, 79), blockData.getBits(80, 95), blockData.getBits(96, 111), blockData.getBits(112, 127) };
+	if (isSRGB)
+	{
+		deUint8* const dstU = (deUint8*)dst;
+		for (int i = 0; i < blockWidth*blockHeight; i++)
+		for (int c = 0; c < 4; c++)
+			dstU[i*4 + c] = (deUint8)((rgba[c] & 0xff00) >> 8); // keep the top 8 bits of each 16-bit channel
+	}
+	else
+	{
+		float* const dstF = (float*)dst;
+		if (isHDRBlock)
+		{
+			// rg - REMOVING HDR SUPPORT FOR NOW
+#if 0
+			for (int c = 0; c < 4; c++)
+			{
+				if (isFloat16InfOrNan((deFloat16)rgba[c]))
+					throw InternalError("Infinity or NaN color component in HDR void extent block in ASTC texture (behavior undefined by ASTC specification)");
+			}
+			for (int i = 0; i < blockWidth*blockHeight; i++)
+			for (int c = 0; c < 4; c++)
+				dstF[i*4 + c] = deFloat16To32((deFloat16)rgba[c]);
+#endif
+			// With HDR decoding compiled out nothing would be written to dst, so report an error instead of a valid block.
+			setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+			return DECOMPRESS_RESULT_ERROR;
+		}
+		else
+		{
+			for (int i = 0; i < blockWidth*blockHeight; i++)
+			for (int c = 0; c < 4; c++)
+				dstF[i*4 + c] = rgba[c] == 65535 ? 1.0f : (float)rgba[c] / 65536.0f;
+		}
+	}
+	return DECOMPRESS_RESULT_VALID_BLOCK;
+}
+// Determines the colour endpoint mode of every partition and stores it in
+// endpointModesDst[0..numPartitions-1]. Mode bits that do not fit in block bits
+// 25..28 are read starting at extraCemBitsStart.
+void decodeColorEndpointModes (deUint32* endpointModesDst, const Block128& blockData, int numPartitions, int extraCemBitsStart)
+{
+	if (numPartitions == 1)
+	{
+		endpointModesDst[0] = blockData.getBits(13, 16);
+		return;
+	}
+	const deUint32 selector		= blockData.getBits(23, 24);
+	const deUint32 sharedMode	= blockData.getBits(25, 28); // only used when selector == 0
+	for (int part = 0; part < numPartitions; part++)
+	{
+		if (selector == 0)
+		{
+			endpointModesDst[part] = sharedMode;
+			continue;
+		}
+		const deUint32 cemClass	= blockData.isBitSet(25 + part) ? selector : selector - 1;
+		const deUint32 ndx0		= numPartitions + 2*part;
+		const deUint32 ndx1		= ndx0 + 1;
+		const deUint32 bit0		= blockData.getBit(ndx0 < 4 ? 25+ndx0 : extraCemBitsStart+ndx0-4);
+		const deUint32 bit1		= blockData.getBit(ndx1 < 4 ? 25+ndx1 : extraCemBitsStart+ndx1-4);
+		endpointModesDst[part] = (cemClass << 2) | (bit1 << 1) | bit0;
+	}
+}
+int computeNumColorEndpointValues (const deUint32* endpointModes, int numPartitions) // sum of the per-partition endpoint value counts
+{
+	int total = 0;
+	for (int part = numPartitions - 1; part >= 0; part--)
+		total += computeNumColorEndpointValues(endpointModes[part]);
+	return total;
+}
+void decodeISETritBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& data, int numBits) // decodes one trit block (1..5 values sharing an 8-bit packed field T) from data into dst[0..numValues-1]
+{
+	DE_ASSERT(basisu_astc::inRange(numValues, 1, 5));
+	deUint32 m[5];
+	m[0]			= data.getNext(numBits); // plain bits and fragments of T alternate in the stream; all are read even for a partial block
+	deUint32 T01	= data.getNext(2);
+	m[1]			= data.getNext(numBits);
+	deUint32 T23	= data.getNext(2);
+	m[2]			= data.getNext(numBits);
+	deUint32 T4		= data.getNext(1);
+	m[3]			= data.getNext(numBits);
+	deUint32 T56	= data.getNext(2);
+	m[4]			= data.getNext(numBits);
+	deUint32 T7		= data.getNext(1);
+	switch (numValues) // zero the T fragments that follow the last value present
+	{
+		// \note Fall-throughs.
+		case 1: T23		= 0;
+		case 2: T4		= 0;
+		case 3: T56		= 0;
+		case 4: T7		= 0;
+		case 5: break;
+		default:
+			DE_ASSERT(false);
+	}
+	const deUint32 T = (T7 << 7) | (T56 << 5) | (T4 << 4) | (T23 << 2) | (T01 << 0); // reassemble the 8-bit table index
+	static const deUint32 tritsFromT[256][5] = // lookup: T -> the five trits it encodes
+	{
+		{ 0,0,0,0,0 }, { 1,0,0,0,0 }, { 2,0,0,0,0 }, { 0,0,2,0,0 }, { 0,1,0,0,0 }, { 1,1,0,0,0 }, { 2,1,0,0,0 }, { 1,0,2,0,0 }, { 0,2,0,0,0 }, { 1,2,0,0,0 }, { 2,2,0,0,0 }, { 2,0,2,0,0 }, { 0,2,2,0,0 }, { 1,2,2,0,0 }, { 2,2,2,0,0 }, { 2,0,2,0,0 },
+		{ 0,0,1,0,0 }, { 1,0,1,0,0 }, { 2,0,1,0,0 }, { 0,1,2,0,0 }, { 0,1,1,0,0 }, { 1,1,1,0,0 }, { 2,1,1,0,0 }, { 1,1,2,0,0 }, { 0,2,1,0,0 }, { 1,2,1,0,0 }, { 2,2,1,0,0 }, { 2,1,2,0,0 }, { 0,0,0,2,2 }, { 1,0,0,2,2 }, { 2,0,0,2,2 }, { 0,0,2,2,2 },
+		{ 0,0,0,1,0 }, { 1,0,0,1,0 }, { 2,0,0,1,0 }, { 0,0,2,1,0 }, { 0,1,0,1,0 }, { 1,1,0,1,0 }, { 2,1,0,1,0 }, { 1,0,2,1,0 }, { 0,2,0,1,0 }, { 1,2,0,1,0 }, { 2,2,0,1,0 }, { 2,0,2,1,0 }, { 0,2,2,1,0 }, { 1,2,2,1,0 }, { 2,2,2,1,0 }, { 2,0,2,1,0 },
+		{ 0,0,1,1,0 }, { 1,0,1,1,0 }, { 2,0,1,1,0 }, { 0,1,2,1,0 }, { 0,1,1,1,0 }, { 1,1,1,1,0 }, { 2,1,1,1,0 }, { 1,1,2,1,0 }, { 0,2,1,1,0 }, { 1,2,1,1,0 }, { 2,2,1,1,0 }, { 2,1,2,1,0 }, { 0,1,0,2,2 }, { 1,1,0,2,2 }, { 2,1,0,2,2 }, { 1,0,2,2,2 },
+		{ 0,0,0,2,0 }, { 1,0,0,2,0 }, { 2,0,0,2,0 }, { 0,0,2,2,0 }, { 0,1,0,2,0 }, { 1,1,0,2,0 }, { 2,1,0,2,0 }, { 1,0,2,2,0 }, { 0,2,0,2,0 }, { 1,2,0,2,0 }, { 2,2,0,2,0 }, { 2,0,2,2,0 }, { 0,2,2,2,0 }, { 1,2,2,2,0 }, { 2,2,2,2,0 }, { 2,0,2,2,0 },
+		{ 0,0,1,2,0 }, { 1,0,1,2,0 }, { 2,0,1,2,0 }, { 0,1,2,2,0 }, { 0,1,1,2,0 }, { 1,1,1,2,0 }, { 2,1,1,2,0 }, { 1,1,2,2,0 }, { 0,2,1,2,0 }, { 1,2,1,2,0 }, { 2,2,1,2,0 }, { 2,1,2,2,0 }, { 0,2,0,2,2 }, { 1,2,0,2,2 }, { 2,2,0,2,2 }, { 2,0,2,2,2 },
+		{ 0,0,0,0,2 }, { 1,0,0,0,2 }, { 2,0,0,0,2 }, { 0,0,2,0,2 }, { 0,1,0,0,2 }, { 1,1,0,0,2 }, { 2,1,0,0,2 }, { 1,0,2,0,2 }, { 0,2,0,0,2 }, { 1,2,0,0,2 }, { 2,2,0,0,2 }, { 2,0,2,0,2 }, { 0,2,2,0,2 }, { 1,2,2,0,2 }, { 2,2,2,0,2 }, { 2,0,2,0,2 },
+		{ 0,0,1,0,2 }, { 1,0,1,0,2 }, { 2,0,1,0,2 }, { 0,1,2,0,2 }, { 0,1,1,0,2 }, { 1,1,1,0,2 }, { 2,1,1,0,2 }, { 1,1,2,0,2 }, { 0,2,1,0,2 }, { 1,2,1,0,2 }, { 2,2,1,0,2 }, { 2,1,2,0,2 }, { 0,2,2,2,2 }, { 1,2,2,2,2 }, { 2,2,2,2,2 }, { 2,0,2,2,2 },
+		{ 0,0,0,0,1 }, { 1,0,0,0,1 }, { 2,0,0,0,1 }, { 0,0,2,0,1 }, { 0,1,0,0,1 }, { 1,1,0,0,1 }, { 2,1,0,0,1 }, { 1,0,2,0,1 }, { 0,2,0,0,1 }, { 1,2,0,0,1 }, { 2,2,0,0,1 }, { 2,0,2,0,1 }, { 0,2,2,0,1 }, { 1,2,2,0,1 }, { 2,2,2,0,1 }, { 2,0,2,0,1 },
+		{ 0,0,1,0,1 }, { 1,0,1,0,1 }, { 2,0,1,0,1 }, { 0,1,2,0,1 }, { 0,1,1,0,1 }, { 1,1,1,0,1 }, { 2,1,1,0,1 }, { 1,1,2,0,1 }, { 0,2,1,0,1 }, { 1,2,1,0,1 }, { 2,2,1,0,1 }, { 2,1,2,0,1 }, { 0,0,1,2,2 }, { 1,0,1,2,2 }, { 2,0,1,2,2 }, { 0,1,2,2,2 },
+		{ 0,0,0,1,1 }, { 1,0,0,1,1 }, { 2,0,0,1,1 }, { 0,0,2,1,1 }, { 0,1,0,1,1 }, { 1,1,0,1,1 }, { 2,1,0,1,1 }, { 1,0,2,1,1 }, { 0,2,0,1,1 }, { 1,2,0,1,1 }, { 2,2,0,1,1 }, { 2,0,2,1,1 }, { 0,2,2,1,1 }, { 1,2,2,1,1 }, { 2,2,2,1,1 }, { 2,0,2,1,1 },
+		{ 0,0,1,1,1 }, { 1,0,1,1,1 }, { 2,0,1,1,1 }, { 0,1,2,1,1 }, { 0,1,1,1,1 }, { 1,1,1,1,1 }, { 2,1,1,1,1 }, { 1,1,2,1,1 }, { 0,2,1,1,1 }, { 1,2,1,1,1 }, { 2,2,1,1,1 }, { 2,1,2,1,1 }, { 0,1,1,2,2 }, { 1,1,1,2,2 }, { 2,1,1,2,2 }, { 1,1,2,2,2 },
+		{ 0,0,0,2,1 }, { 1,0,0,2,1 }, { 2,0,0,2,1 }, { 0,0,2,2,1 }, { 0,1,0,2,1 }, { 1,1,0,2,1 }, { 2,1,0,2,1 }, { 1,0,2,2,1 }, { 0,2,0,2,1 }, { 1,2,0,2,1 }, { 2,2,0,2,1 }, { 2,0,2,2,1 }, { 0,2,2,2,1 }, { 1,2,2,2,1 }, { 2,2,2,2,1 }, { 2,0,2,2,1 },
+		{ 0,0,1,2,1 }, { 1,0,1,2,1 }, { 2,0,1,2,1 }, { 0,1,2,2,1 }, { 0,1,1,2,1 }, { 1,1,1,2,1 }, { 2,1,1,2,1 }, { 1,1,2,2,1 }, { 0,2,1,2,1 }, { 1,2,1,2,1 }, { 2,2,1,2,1 }, { 2,1,2,2,1 }, { 0,2,1,2,2 }, { 1,2,1,2,2 }, { 2,2,1,2,2 }, { 2,1,2,2,2 },
+		{ 0,0,0,1,2 }, { 1,0,0,1,2 }, { 2,0,0,1,2 }, { 0,0,2,1,2 }, { 0,1,0,1,2 }, { 1,1,0,1,2 }, { 2,1,0,1,2 }, { 1,0,2,1,2 }, { 0,2,0,1,2 }, { 1,2,0,1,2 }, { 2,2,0,1,2 }, { 2,0,2,1,2 }, { 0,2,2,1,2 }, { 1,2,2,1,2 }, { 2,2,2,1,2 }, { 2,0,2,1,2 },
+		{ 0,0,1,1,2 }, { 1,0,1,1,2 }, { 2,0,1,1,2 }, { 0,1,2,1,2 }, { 0,1,1,1,2 }, { 1,1,1,1,2 }, { 2,1,1,1,2 }, { 1,1,2,1,2 }, { 0,2,1,1,2 }, { 1,2,1,1,2 }, { 2,2,1,1,2 }, { 2,1,2,1,2 }, { 0,2,2,2,2 }, { 1,2,2,2,2 }, { 2,2,2,2,2 }, { 2,1,2,2,2 }
+	};
+	const deUint32 (& trits)[5] = tritsFromT[T];
+	for (int i = 0; i < numValues; i++)
+	{
+		dst[i].m	= m[i];
+		dst[i].tq	= trits[i];
+		dst[i].v	= (trits[i] << numBits) + m[i]; // the trit forms the high part, the plain bits the low part
+	}
+}
+void decodeISEQuintBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& data, int numBits) // decodes one quint block (1..3 values sharing a 7-bit packed field Q) from data into dst[0..numValues-1]
+{
+	DE_ASSERT(basisu_astc::inRange(numValues, 1, 3));
+	deUint32 m[3];
+	m[0]			= data.getNext(numBits); // plain bits and fragments of Q alternate in the stream; all are read even for a partial block
+	deUint32 Q012	= data.getNext(3);
+	m[1]			= data.getNext(numBits);
+	deUint32 Q34	= data.getNext(2);
+	m[2]			= data.getNext(numBits);
+	deUint32 Q56	= data.getNext(2);
+	switch (numValues) // zero the Q fragments that follow the last value present
+	{
+		// \note Fall-throughs.
+		case 1: Q34		= 0;
+		case 2: Q56		= 0;
+		case 3: break;
+		default:
+			DE_ASSERT(false);
+	}
+	const deUint32 Q = (Q56 << 5) | (Q34 << 3) | (Q012 << 0); // reassemble the 7-bit table index
+	static const deUint32 quintsFromQ[256][3] = // lookup: Q -> the three quints it encodes. NOTE(review): declared with 256 rows but only 128 are listed; Q is 7 bits, so the zero-filled rest looks unreachable
+	{
+		{ 0,0,0 }, { 1,0,0 }, { 2,0,0 }, { 3,0,0 }, { 4,0,0 }, { 0,4,0 }, { 4,4,0 }, { 4,4,4 }, { 0,1,0 }, { 1,1,0 }, { 2,1,0 }, { 3,1,0 }, { 4,1,0 }, { 1,4,0 }, { 4,4,1 }, { 4,4,4 },
+		{ 0,2,0 }, { 1,2,0 }, { 2,2,0 }, { 3,2,0 }, { 4,2,0 }, { 2,4,0 }, { 4,4,2 }, { 4,4,4 }, { 0,3,0 }, { 1,3,0 }, { 2,3,0 }, { 3,3,0 }, { 4,3,0 }, { 3,4,0 }, { 4,4,3 }, { 4,4,4 },
+		{ 0,0,1 }, { 1,0,1 }, { 2,0,1 }, { 3,0,1 }, { 4,0,1 }, { 0,4,1 }, { 4,0,4 }, { 0,4,4 }, { 0,1,1 }, { 1,1,1 }, { 2,1,1 }, { 3,1,1 }, { 4,1,1 }, { 1,4,1 }, { 4,1,4 }, { 1,4,4 },
+		{ 0,2,1 }, { 1,2,1 }, { 2,2,1 }, { 3,2,1 }, { 4,2,1 }, { 2,4,1 }, { 4,2,4 }, { 2,4,4 }, { 0,3,1 }, { 1,3,1 }, { 2,3,1 }, { 3,3,1 }, { 4,3,1 }, { 3,4,1 }, { 4,3,4 }, { 3,4,4 },
+		{ 0,0,2 }, { 1,0,2 }, { 2,0,2 }, { 3,0,2 }, { 4,0,2 }, { 0,4,2 }, { 2,0,4 }, { 3,0,4 }, { 0,1,2 }, { 1,1,2 }, { 2,1,2 }, { 3,1,2 }, { 4,1,2 }, { 1,4,2 }, { 2,1,4 }, { 3,1,4 },
+		{ 0,2,2 }, { 1,2,2 }, { 2,2,2 }, { 3,2,2 }, { 4,2,2 }, { 2,4,2 }, { 2,2,4 }, { 3,2,4 }, { 0,3,2 }, { 1,3,2 }, { 2,3,2 }, { 3,3,2 }, { 4,3,2 }, { 3,4,2 }, { 2,3,4 }, { 3,3,4 },
+		{ 0,0,3 }, { 1,0,3 }, { 2,0,3 }, { 3,0,3 }, { 4,0,3 }, { 0,4,3 }, { 0,0,4 }, { 1,0,4 }, { 0,1,3 }, { 1,1,3 }, { 2,1,3 }, { 3,1,3 }, { 4,1,3 }, { 1,4,3 }, { 0,1,4 }, { 1,1,4 },
+		{ 0,2,3 }, { 1,2,3 }, { 2,2,3 }, { 3,2,3 }, { 4,2,3 }, { 2,4,3 }, { 0,2,4 }, { 1,2,4 }, { 0,3,3 }, { 1,3,3 }, { 2,3,3 }, { 3,3,3 }, { 4,3,3 }, { 3,4,3 }, { 0,3,4 }, { 1,3,4 }
+	};
+	const deUint32 (& quints)[3] = quintsFromQ[Q];
+	for (int i = 0; i < numValues; i++)
+	{
+		dst[i].m	= m[i];
+		dst[i].tq	= quints[i];
+		dst[i].v	= (quints[i] << numBits) + m[i]; // the quint forms the high part, the plain bits the low part
+	}
+}
+inline void decodeISEBitBlock (ISEDecodedResult* dst, BitAccessStream& data, int numBits) // plain-bit mode: reads one value into dst->m and dst->v; dst->tq is not written
+{
+	const deUint32 bits = data.getNext(numBits);
+	dst->m = dst->v = bits;
+}
+// Decodes numValues ISE-coded values from data into dst[0..numValues-1].
+// Trit mode consumes blocks of up to five values and quint mode blocks of up to
+// three; only the last block of a sequence may be partial.
+void decodeISE (ISEDecodedResult* dst, int numValues, BitAccessStream& data, const ISEParams& params)
+{
+	if (params.mode == ISEMODE_TRIT)
+	{
+		const int numBlocks = deDivRoundUp32(numValues, 5);
+		for (int blockNdx = 0, first = 0; blockNdx < numBlocks; blockNdx++, first += 5)
+		{
+			decodeISETritBlock(dst + first, (blockNdx + 1 == numBlocks) ? numValues - first : 5, data, params.numBits);
+		}
+		return;
+	}
+	if (params.mode == ISEMODE_QUINT)
+	{
+		const int numBlocks = deDivRoundUp32(numValues, 3);
+		for (int blockNdx = 0, first = 0; blockNdx < numBlocks; blockNdx++, first += 3)
+		{
+			decodeISEQuintBlock(dst + first, (blockNdx + 1 == numBlocks) ? numValues - first : 3, data, params.numBits);
+		}
+		return;
+	}
+	DE_ASSERT(params.mode == ISEMODE_PLAIN_BIT);
+	for (int i = 0; i < numValues; i++)
+		decodeISEBitBlock(dst + i, data, params.numBits);
+}
+void unquantizeColorEndpoints (deUint32* dst, const ISEDecodedResult* iseResults, int numEndpoints, const ISEParams& iseParams) // expands numEndpoints decoded ISE values into dst, one output per input value
+{
+	if (iseParams.mode == ISEMODE_TRIT || iseParams.mode == ISEMODE_QUINT)
+	{
+		const int rangeCase				= iseParams.numBits*2 - (iseParams.mode == ISEMODE_TRIT ? 2 : 1); // 0 = trit with 1 bit, 1 = quint with 1 bit, 2 = trit with 2 bits, and so on alternately
+		DE_ASSERT(basisu_astc::inRange(rangeCase, 0, 10));
+		static const deUint32	Ca[11]	= { 204, 113, 93, 54, 44, 26, 22, 13, 11, 6, 5 }; // per-range multiplier applied to the trit/quint
+		const deUint32			C		= Ca[rangeCase];
+		for (int endpointNdx = 0; endpointNdx < numEndpoints; endpointNdx++)
+		{
+			const deUint32 a = getBit(iseResults[endpointNdx].m, 0); // a..f: individual plain bits of the value, least significant first
+			const deUint32 b = getBit(iseResults[endpointNdx].m, 1);
+			const deUint32 c = getBit(iseResults[endpointNdx].m, 2);
+			const deUint32 d = getBit(iseResults[endpointNdx].m, 3);
+			const deUint32 e = getBit(iseResults[endpointNdx].m, 4);
+			const deUint32 f = getBit(iseResults[endpointNdx].m, 5);
+			const deUint32 A = a == 0 ? 0 : (1<<9)-1; // nine ones when the lowest plain bit is set, otherwise zero
+			const deUint32 B = rangeCase == 0	? 0
+							 : rangeCase == 1	? 0
+							 : rangeCase == 2	? (b << 8) |									(b << 4) |				(b << 2) |	(b << 1)
+							 : rangeCase == 3	? (b << 8) |												(b << 3) |	(b << 2)
+							 : rangeCase == 4	? (c << 8) | (b << 7) |										(c << 3) |	(b << 2) |	(c << 1) |	(b << 0)
+							 : rangeCase == 5	? (c << 8) | (b << 7) |													(c << 2) |	(b << 1) |	(c << 0)
+							 : rangeCase == 6	? (d << 8) | (c << 7) | (b << 6) |										(d << 2) |	(c << 1) |	(b << 0)
+							 : rangeCase == 7	? (d << 8) | (c << 7) | (b << 6) |													(d << 1) |	(c << 0)
+							 : rangeCase == 8	? (e << 8) | (d << 7) | (c << 6) | (b << 5) |										(e << 1) |	(d << 0)
+							 : rangeCase == 9	? (e << 8) | (d << 7) | (c << 6) | (b << 5) |													(e << 0)
+							 : rangeCase == 10	? (f << 8) | (e << 7) | (d << 6) | (c << 5) |	(b << 4) |										(f << 0)
+							 : (deUint32)-1;
+			DE_ASSERT(B != (deUint32)-1);
+			dst[endpointNdx] = (((iseResults[endpointNdx].tq*C + B) ^ A) >> 2) | (A & 0x80); // presumably the ASTC colour unquantisation formula -- verify against the specification
+		}
+	}
+	else
+	{
+		DE_ASSERT(iseParams.mode == ISEMODE_PLAIN_BIT);
+		for (int endpointNdx = 0; endpointNdx < numEndpoints; endpointNdx++)
+			dst[endpointNdx] = bitReplicationScale(iseResults[endpointNdx].v, iseParams.numBits, 8); // plain-bit values are widened to 8 bits by bit replication
+	}
+}
+// Shifts b right by one and copies bit 7 of a into it, then reduces a to a signed 6-bit value.
+inline void bitTransferSigned (deInt32& a, deInt32& b)
+{
+	b = b >> 1;
+	b = b | (a & 0x80);
+	a = (a >> 1) & 0x3f;
+	if ((a & 0x20) != 0)
+		a = a - 0x40;
+}
+inline UVec4 clampedRGBA (const IVec4& rgba) // each component limited to the 8-bit range [0, 0xff]
+{
+	UVec4 result;
+	for (deUint32 c = 0; c < 4; c++)
+		result[c] = basisu_astc::clamp(rgba[c], 0, 0xff);
+	return result;
+}
+inline IVec4 blueContract (int r, int g, int b, int a) // r and g are each summed with b and shifted right by one; b and a pass through unchanged
+{
+	return IVec4((b+r) >> 1, (b+g) >> 1, b, a);
+}
+// True for the endpoint modes this decoder classes as HDR: 2, 3, 7, 11, 14 and 15.
+inline bool isColorEndpointModeHDR (deUint32 mode)
+{
+	switch (mode)
+	{
+		case 2: case 3: case 7: case 11: case 14: case 15:	return true;
+		default:											return false;
+	}
+}
+void decodeHDREndpointMode7 (UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3) // builds an HDR endpoint pair from four values: e1 = base colour, e0 = base colour minus scale, each clamped to [0, 0xfff], w fixed at 0x780
+{
+	const deUint32 m10		= getBit(v1, 7) | (getBit(v2, 7) << 1);
+	const deUint32 m23		= getBits(v0, 6, 7);
+	const deUint32 majComp	= m10 != 3	? m10 // component swapped into the red slot at the end (1 = green, 2 = blue)
+							: m23 != 3	? m23
+							:			  0;
+	const deUint32 mode		= m10 != 3	? m23 // sub-mode 0..5: decides where the x0..x6 bits go and the final shift amount
+							: m23 != 3	? 4
+							:			  5;
+	deInt32			red		= (deInt32)getBits(v0, 0, 5);
+	deInt32			green	= (deInt32)getBits(v1, 0, 4);
+	deInt32			blue	= (deInt32)getBits(v2, 0, 4);
+	deInt32			scale	= (deInt32)getBits(v3, 0, 4);
+	{ // scatter the seven remaining bits x0..x6 into red/green/blue/scale at positions that depend on the sub-mode
+#define SHOR(DST_VAR, SHIFT, BIT_VAR) (DST_VAR) |= (BIT_VAR) << (SHIFT)
+#define ASSIGN_X_BITS(V0,S0, V1,S1, V2,S2, V3,S3, V4,S4, V5,S5, V6,S6) do { SHOR(V0,S0,x0); SHOR(V1,S1,x1); SHOR(V2,S2,x2); SHOR(V3,S3,x3); SHOR(V4,S4,x4); SHOR(V5,S5,x5); SHOR(V6,S6,x6); } while (false)
+		const deUint32	x0	= getBit(v1, 6);
+		const deUint32	x1	= getBit(v1, 5);
+		const deUint32	x2	= getBit(v2, 6);
+		const deUint32	x3	= getBit(v2, 5);
+		const deUint32	x4	= getBit(v3, 7);
+		const deUint32	x5	= getBit(v3, 6);
+		const deUint32	x6	= getBit(v3, 5);
+		deInt32&		R	= red;
+		deInt32&		G	= green;
+		deInt32&		B	= blue;
+		deInt32&		S	= scale;
+		switch (mode) // each row lists, for x0..x6 in order, the target variable and bit position
+		{
+			case 0: ASSIGN_X_BITS(R,9,  R,8,  R,7,  R,10,  R,6,  S,6,   S,5); break;
+			case 1: ASSIGN_X_BITS(R,8,  G,5,  R,7,  B,5,   R,6,  R,10,  R,9); break;
+			case 2: ASSIGN_X_BITS(R,9,  R,8,  R,7,  R,6,   S,7,  S,6,   S,5); break;
+			case 3: ASSIGN_X_BITS(R,8,  G,5,  R,7,  B,5,   R,6,  S,6,   S,5); break;
+			case 4: ASSIGN_X_BITS(G,6,  G,5,  B,6,  B,5,   R,6,  R,7,   S,5); break;
+			case 5: ASSIGN_X_BITS(G,6,  G,5,  B,6,  B,5,   R,6,  S,6,   S,5); break;
+			default:
+				DE_ASSERT(false);
+		}
+#undef ASSIGN_X_BITS
+#undef SHOR
+	}
+	static const int shiftAmounts[] = { 1, 1, 2, 3, 4, 5 }; // left shift applied to all four values, indexed by sub-mode
+	DE_ASSERT(mode < DE_LENGTH_OF_ARRAY(shiftAmounts));
+	red		<<= shiftAmounts[mode];
+	green	<<= shiftAmounts[mode];
+	blue	<<= shiftAmounts[mode];
+	scale	<<= shiftAmounts[mode];
+	if (mode != 5) // in every sub-mode except 5, green and blue are stored as differences from red
+	{
+		green	= red - green;
+		blue	= red - blue;
+	}
+	if (majComp == 1)
+		std::swap(red, green);
+	else if (majComp == 2)
+		std::swap(red, blue);
+	e0 = UVec4(basisu_astc::clamp(red	- scale,	0, 0xfff),
+		basisu_astc::clamp(green	- scale,	0, 0xfff),
+		basisu_astc::clamp(blue	- scale,	0, 0xfff),
+			   0x780);
+	e1 = UVec4(basisu_astc::clamp(red,				0, 0xfff),
+		basisu_astc::clamp(green,				0, 0xfff),
+		basisu_astc::clamp(blue,				0, 0xfff),
+			   0x780);
+}
+void decodeHDREndpointMode11 (UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3, deUint32 v4, deUint32 v5)
+{	// RGB decode shared by endpoint modes 11, 14 and 15 (see decodeColorEndpoints and decodeHDREndpointMode15): turns v0..v5 into an RGB endpoint pair and sets w of both endpoints to 0x780.
+	const deUint32 major = (getBit(v5, 7) << 1) | getBit(v4, 7);
+	if (major == 3)
+	{ // direct layout: each channel is a shifted copy of one input value; no clamping is applied on this path
+		e0 = UVec4(v0<<4, v2<<4, getBits(v4,0,6)<<5, 0x780);
+		e1 = UVec4(v1<<4, v3<<4, getBits(v5,0,6)<<5, 0x780);
+	}
+	else
+	{
+		const deUint32 mode = (getBit(v3, 7) << 2) | (getBit(v2, 7) << 1) | getBit(v1, 7); // 0..7; selects the spare-bit layout, the width of d0/d1 and the shift amount
+		deInt32 a	= (deInt32)((getBit(v1, 6) << 8) | v0);
+		deInt32 c	= (deInt32)(getBits(v1, 0, 5));
+		deInt32 b0	= (deInt32)(getBits(v2, 0, 5));
+		deInt32 b1	= (deInt32)(getBits(v3, 0, 5));
+		deInt32 d0	= (deInt32)(getBits(v4, 0, 4));
+		deInt32 d1	= (deInt32)(getBits(v5, 0, 4));
+		{ // OR the six spare bits x0..x5 into higher bit positions of a/c/b0/b1/d0/d1; the destination of each bit depends on 'mode'
+#define SHOR(DST_VAR, SHIFT, BIT_VAR) (DST_VAR) |= (BIT_VAR) << (SHIFT)
+#define ASSIGN_X_BITS(V0,S0, V1,S1, V2,S2, V3,S3, V4,S4, V5,S5) do { SHOR(V0,S0,x0); SHOR(V1,S1,x1); SHOR(V2,S2,x2); SHOR(V3,S3,x3); SHOR(V4,S4,x4); SHOR(V5,S5,x5); } while (false)
+			const deUint32 x0 = getBit(v2, 6);
+			const deUint32 x1 = getBit(v3, 6);
+			const deUint32 x2 = getBit(v4, 6);
+			const deUint32 x3 = getBit(v5, 6);
+			const deUint32 x4 = getBit(v4, 5);
+			const deUint32 x5 = getBit(v5, 5);
+			switch (mode)
+			{
+				case 0: ASSIGN_X_BITS(b0,6,  b1,6,   d0,6,  d1,6,  d0,5,  d1,5); break;
+				case 1: ASSIGN_X_BITS(b0,6,  b1,6,   b0,7,  b1,7,  d0,5,  d1,5); break;
+				case 2: ASSIGN_X_BITS(a,9,   c,6,    d0,6,  d1,6,  d0,5,  d1,5); break;
+				case 3: ASSIGN_X_BITS(b0,6,  b1,6,   a,9,   c,6,   d0,5,  d1,5); break;
+				case 4: ASSIGN_X_BITS(b0,6,  b1,6,   b0,7,  b1,7,  a,9,   a,10); break;
+				case 5: ASSIGN_X_BITS(a,9,   a,10,   c,7,   c,6,   d0,5,  d1,5); break;
+				case 6: ASSIGN_X_BITS(b0,6,  b1,6,   a,11,  c,6,   a,9,   a,10); break;
+				case 7: ASSIGN_X_BITS(a,9,   a,10,   a,11,  c,6,   d0,5,  d1,5); break;
+				default:
+					DE_ASSERT(false);
+			}
+#undef ASSIGN_X_BITS
+#undef SHOR
+		}
+		static const int numDBits[] = { 7, 6, 7, 6, 5, 6, 5, 6 }; // bit width passed to signExtend for d0/d1, indexed by mode
+		DE_ASSERT(mode < DE_LENGTH_OF_ARRAY(numDBits));
+		d0 = signExtend(d0, numDBits[mode]);
+		d1 = signExtend(d1, numDBits[mode]);
+		const int shiftAmount = (mode >> 1) ^ 3; // 3, 2, 1 or 0 for mode pairs 0-1, 2-3, 4-5, 6-7
+		a	<<= shiftAmount;
+		c	<<= shiftAmount;
+		b0	<<= shiftAmount;
+		b1	<<= shiftAmount;
+		d0	<<= shiftAmount;
+		d1	<<= shiftAmount;
+		e0 = UVec4(basisu_astc::clamp(a-c,			0, 0xfff), // every channel on this path is clamped to [0, 0xfff]
+			basisu_astc::clamp(a-b0-c-d0,		0, 0xfff),
+			basisu_astc::clamp(a-b1-c-d1,		0, 0xfff),
+				   0x780);
+		e1 = UVec4(basisu_astc::clamp(a,				0, 0xfff),
+			basisu_astc::clamp(a-b0,			0, 0xfff),
+			basisu_astc::clamp(a-b1,			0, 0xfff),
+				   0x780);
+		if (major == 1) // 'major' names the channel that trades places with x in both endpoints (1: y, 2: z)
+		{
+			std::swap(e0.x(), e0.y());
+			std::swap(e1.x(), e1.y());
+		}
+		else if (major == 2)
+		{
+			std::swap(e0.x(), e0.z());
+			std::swap(e1.x(), e1.z());
+		}
+	}
+}
+// Endpoint mode 15: the RGB part is decoded by decodeHDREndpointMode11; v6In/v7In carry a two-bit sub-mode (their bit 7) plus the data for the two alpha endpoints.
+void decodeHDREndpointMode15(UVec4& e0, UVec4& e1, deUint32 v0, deUint32 v1, deUint32 v2, deUint32 v3, deUint32 v4, deUint32 v5, deUint32 v6In, deUint32 v7In)
+{
+	decodeHDREndpointMode11(e0, e1, v0, v1, v2, v3, v4, v5);
+	const deUint32	alphaMode	= getBit(v6In, 7) | (getBit(v7In, 7) << 1);
+	deInt32			alpha0		= (deInt32)getBits(v6In, 0, 6);
+	deInt32			alpha1		= (deInt32)getBits(v7In, 0, 6);
+	if (alphaMode == 3)
+	{
+		// Sub-mode 3: both alphas come straight from the two fields, shifted left by five.
+		e0.w() = alpha0 << 5;
+		e1.w() = alpha1 << 5;
+		return;
+	}
+	// Sub-modes 0..2: alpha0 takes extra high bits from alpha1; the remaining low bits of alpha1 are a signed offset relative to alpha0.
+	const deInt32 signBit = 0x20 >> alphaMode;
+	alpha0 |= (alpha1 << (alphaMode+1)) & 0x780;
+	alpha1 = ((alpha1 & (0x3f >> alphaMode)) ^ signBit) - signBit;
+	alpha0 <<= 4-alphaMode;
+	alpha1 <<= 4-alphaMode;
+	// Only the second endpoint is clamped, after the base value has been added to it.
+	alpha1 = basisu_astc::clamp(alpha1 + alpha0, 0, 0xfff);
+	e0.w() = alpha0;
+	e1.w() = alpha1;
+}
+void decodeColorEndpoints (ColorEndpointPair* dst, const deUint32* unquantizedEndpoints, const deUint32* endpointModes, int numPartitions)
+{	// Expands the flat array of unquantized endpoint values into one (e0, e1) pair per partition in dst[0..numPartitions), according to endpointModes[partition] (0..15).
+	int unquantizedNdx = 0;
+	for (int partitionNdx = 0; partitionNdx < numPartitions; partitionNdx++)
+	{
+		const deUint32		endpointMode	= endpointModes[partitionNdx];
+		const deUint32*		v				= &unquantizedEndpoints[unquantizedNdx];
+		UVec4&				e0				= dst[partitionNdx].e0;
+		UVec4&				e1				= dst[partitionNdx].e1;
+		unquantizedNdx += computeNumColorEndpointValues(endpointMode); // step past this partition's values; 'v' still points at their start
+		switch (endpointMode)
+		{
+			case 0: // grey endpoints v[0] and v[1], w = 0xff
+				e0 = UVec4(v[0], v[0], v[0], 0xff);
+				e1 = UVec4(v[1], v[1], v[1], 0xff);
+				break;
+			case 1: // grey base L0 assembled from v[0] and v[1]; L1 = L0 plus an offset from v[1], capped at 0xff
+			{
+				const deUint32 L0 = (v[0] >> 2) | (getBits(v[1], 6, 7) << 6);
+				const deUint32 L1 = basisu_astc::min(0xffu, L0 + getBits(v[1], 0, 5));
+				e0 = UVec4(L0, L0, L0, 0xff);
+				e1 = UVec4(L1, L1, L1, 0xff);
+				break;
+			}
+			case 2: // grey pair from v[0]<<4 and v[1]<<4, w = 0x780; when v[1] < v[0] the roles swap with a +8 / -8 adjustment
+			{
+				const deUint32 v1Gr		= v[1] >= v[0];
+				const deUint32 y0		= v1Gr ? v[0]<<4 : (v[1]<<4) + 8;
+				const deUint32 y1		= v1Gr ? v[1]<<4 : (v[0]<<4) - 8;
+				e0 = UVec4(y0, y0, y0, 0x780);
+				e1 = UVec4(y1, y1, y1, 0x780);
+				break;
+			}
+			case 3: // grey base y0 plus offset d, capped at 0xfff, w = 0x780; bit 7 of v[0] picks between two bit layouts
+			{
+				const bool		m	= isBitSet(v[0], 7);
+				const deUint32	y0	= m ? (getBits(v[1], 5, 7) << 9) | (getBits(v[0], 0, 6) << 2)
+										: (getBits(v[1], 4, 7) << 8) | (getBits(v[0], 0, 6) << 1);
+				const deUint32	d	= m ? getBits(v[1], 0, 4) << 2
+										: getBits(v[1], 0, 3) << 1;
+				const deUint32	y1	= basisu_astc::min(0xfffu, y0+d);
+				e0 = UVec4(y0, y0, y0, 0x780);
+				e1 = UVec4(y1, y1, y1, 0x780);
+				break;
+			}
+			case 4: // grey endpoints v[0], v[1] with w taken from v[2], v[3]
+				e0 = UVec4(v[0], v[0], v[0], v[2]);
+				e1 = UVec4(v[1], v[1], v[1], v[3]);
+				break;
+			case 5: // grey + w as base and offset: after bitTransferSigned, e1 = e0 + offset; both clamped to [0, 0xff]
+			{
+				deInt32 v0 = (deInt32)v[0];
+				deInt32 v1 = (deInt32)v[1];
+				deInt32 v2 = (deInt32)v[2];
+				deInt32 v3 = (deInt32)v[3];
+				bitTransferSigned(v1, v0);
+				bitTransferSigned(v3, v2);
+				e0 = clampedRGBA(IVec4(v0,		v0,		v0,		v2));
+				e1 = clampedRGBA(IVec4(v0+v1,	v0+v1,	v0+v1,	v2+v3));
+				break;
+			}
+			case 6: // e1 = (v[0], v[1], v[2]); e0 is e1 with each channel multiplied by v[3] and shifted right by 8; w = 0xff
+				e0 = UVec4((v[0]*v[3]) >> 8,	(v[1]*v[3]) >> 8,	(v[2]*v[3]) >> 8,	0xff);
+				e1 = UVec4(v[0],				v[1],				v[2],				0xff);
+				break;
+			case 7: // handled entirely by decodeHDREndpointMode7
+				decodeHDREndpointMode7(e0, e1, v[0], v[1], v[2], v[3]);
+				break;
+			case 8: // RGB pair from even/odd indexed values, w = 0xff; if the odd sum is smaller, the endpoints swap and go through blueContract
+				if (v[1]+v[3]+v[5] >= v[0]+v[2]+v[4])
+				{
+					e0 = UVec4(v[0], v[2], v[4], 0xff);
+					e1 = UVec4(v[1], v[3], v[5], 0xff);
+				}
+				else
+				{
+					e0 = blueContract(v[1], v[3], v[5], 0xff).asUint();
+					e1 = blueContract(v[0], v[2], v[4], 0xff).asUint();
+				}
+				break;
+			case 9: // RGB as base and offset, w = 0xff; a negative offset sum swaps the endpoints and applies blueContract; results clamped to [0, 0xff]
+			{
+				deInt32 v0 = (deInt32)v[0];
+				deInt32 v1 = (deInt32)v[1];
+				deInt32 v2 = (deInt32)v[2];
+				deInt32 v3 = (deInt32)v[3];
+				deInt32 v4 = (deInt32)v[4];
+				deInt32 v5 = (deInt32)v[5];
+				bitTransferSigned(v1, v0);
+				bitTransferSigned(v3, v2);
+				bitTransferSigned(v5, v4);
+				if (v1+v3+v5 >= 0)
+				{
+					e0 = clampedRGBA(IVec4(v0,		v2,		v4,		0xff));
+					e1 = clampedRGBA(IVec4(v0+v1,	v2+v3,	v4+v5,	0xff));
+				}
+				else
+				{
+					e0 = clampedRGBA(blueContract(v0+v1,	v2+v3,	v4+v5,	0xff));
+					e1 = clampedRGBA(blueContract(v0,		v2,		v4,		0xff));
+				}
+				break;
+			}
+			case 10: // same RGB computation as case 6, with w taken from v[4] and v[5]
+				e0 = UVec4((v[0]*v[3]) >> 8,	(v[1]*v[3]) >> 8,	(v[2]*v[3]) >> 8,	v[4]);
+				e1 = UVec4(v[0],				v[1],				v[2],				v[5]);
+				break;
+			case 11: // handled entirely by decodeHDREndpointMode11
+				decodeHDREndpointMode11(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5]);
+				break;
+			case 12: // same RGB selection as case 8, with w taken from v[6] and v[7] (swapped together with the colors)
+				if (v[1]+v[3]+v[5] >= v[0]+v[2]+v[4])
+				{
+					e0 = UVec4(v[0], v[2], v[4], v[6]);
+					e1 = UVec4(v[1], v[3], v[5], v[7]);
+				}
+				else
+				{
+					e0 = clampedRGBA(blueContract(v[1], v[3], v[5], v[7]));
+					e1 = clampedRGBA(blueContract(v[0], v[2], v[4], v[6]));
+				}
+				break;
+			case 13: // same base and offset scheme as case 9, with w also stored as base v6 plus offset v7
+			{
+				deInt32 v0 = (deInt32)v[0];
+				deInt32 v1 = (deInt32)v[1];
+				deInt32 v2 = (deInt32)v[2];
+				deInt32 v3 = (deInt32)v[3];
+				deInt32 v4 = (deInt32)v[4];
+				deInt32 v5 = (deInt32)v[5];
+				deInt32 v6 = (deInt32)v[6];
+				deInt32 v7 = (deInt32)v[7];
+				bitTransferSigned(v1, v0);
+				bitTransferSigned(v3, v2);
+				bitTransferSigned(v5, v4);
+				bitTransferSigned(v7, v6);
+				if (v1+v3+v5 >= 0)
+				{
+					e0 = clampedRGBA(IVec4(v0,		v2,		v4,		v6));
+					e1 = clampedRGBA(IVec4(v0+v1,	v2+v3,	v4+v5,	v6+v7));
+				}
+				else
+				{
+					e0 = clampedRGBA(blueContract(v0+v1,	v2+v3,	v4+v5,	v6+v7));
+					e1 = clampedRGBA(blueContract(v0,		v2,		v4,		v6));
+				}
+				break;
+			}
+			case 14: // RGB via decodeHDREndpointMode11, then w overwritten with v[6] and v[7] as-is
+				decodeHDREndpointMode11(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5]);
+				e0.w() = v[6];
+				e1.w() = v[7];
+				break;
+			case 15: // handled entirely by decodeHDREndpointMode15
+				decodeHDREndpointMode15(e0, e1, v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
+				break;
+			default:
+				DE_ASSERT(false);
+		}
+	}
+}
+// Produces the endpoint pair of every partition in three steps: ISE-decode the raw values from the block,
+// unquantize them, then expand them according to the per-partition endpoint modes.
+void computeColorEndpoints (ColorEndpointPair* dst, const Block128& blockData, const deUint32* endpointModes, int numPartitions, int numColorEndpointValues, const ISEParams& iseParams, int numBitsAvailable)
+{
+	ISEDecodedResult	iseValues[18];
+	deUint32			endpointValues[18];
+	{
+		// The endpoint data begins at bit 17 for single-partition blocks and at bit 29 otherwise.
+		BitAccessStream stream(blockData, (numPartitions == 1) ? 17 : 29, numBitsAvailable, true);
+		decodeISE(iseValues, numColorEndpointValues, stream, iseParams);
+	}
+	unquantizeColorEndpoints(endpointValues, iseValues, numColorEndpointValues, iseParams);
+	decodeColorEndpoints(dst, endpointValues, endpointModes, numPartitions);
+}
+void unquantizeWeights (deUint32 dst[64], const ISEDecodedResult* weightGrid, const ASTCBlockMode& blockMode)
+{	// Converts the ISE-decoded weight grid into unquantized weights in dst[0..numWeights); the remaining entries of dst (up to index 63) are filled with ~0u.
+	const int			numWeights	= computeNumWeights(blockMode);
+	const ISEParams&	iseParams	= blockMode.weightISEParams;
+	if (iseParams.mode == ISEMODE_TRIT || iseParams.mode == ISEMODE_QUINT)
+	{
+		const int rangeCase = iseParams.numBits*2 + (iseParams.mode == ISEMODE_QUINT ? 1 : 0); // even: trits, odd: quints; numBits = rangeCase / 2
+		if (rangeCase == 0 || rangeCase == 1)
+		{ // no extra bits: the trit/quint value indexes a small lookup table directly
+			static const deUint32 map0[3]	= { 0, 32, 63 };
+			static const deUint32 map1[5]	= { 0, 16, 32, 47, 63 };
+			const deUint32* const map		= rangeCase == 0 ? &map0[0] : &map1[0];
+			for (int i = 0; i < numWeights; i++)
+			{
+				DE_ASSERT(weightGrid[i].v < (rangeCase == 0 ? 3u : 5u));
+				dst[i] = map[weightGrid[i].v];
+			}
+		}
+		else
+		{
+			DE_ASSERT(rangeCase <= 6);
+			static const deUint32	Ca[5]	= { 50, 28, 23, 13, 11 }; // multiplier C for rangeCase 2..6
+			const deUint32			C		= Ca[rangeCase-2];
+			for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+			{
+				const deUint32 a = getBit(weightGrid[weightNdx].m, 0);
+				const deUint32 b = getBit(weightGrid[weightNdx].m, 1);
+				const deUint32 c = getBit(weightGrid[weightNdx].m, 2);
+				const deUint32 A = a == 0 ? 0 : (1<<7)-1; // bit 'a' replicated across seven bits
+				const deUint32 B = rangeCase == 2 ? 0
+								 : rangeCase == 3 ? 0
+								 : rangeCase == 4 ? (b << 6) |					(b << 2) |				(b << 0)
+								 : rangeCase == 5 ? (b << 6) |								(b << 1)
+								 : rangeCase == 6 ? (c << 6) | (b << 5) |					(c << 1) |	(b << 0)
+								 : (deUint32)-1; // NOTE(review): unlike the B lookup at the end of unquantizeColorEndpoints, this fallback is not asserted on afterwards
+				dst[weightNdx] = (((weightGrid[weightNdx].tq*C + B) ^ A) >> 2) | (A & 0x20);
+			}
+		}
+	}
+	else
+	{
+		DE_ASSERT(iseParams.mode == ISEMODE_PLAIN_BIT);
+		for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+			dst[weightNdx] = bitReplicationScale(weightGrid[weightNdx].v, iseParams.numBits, 6);
+	}
+	for (int weightNdx = 0; weightNdx < numWeights; weightNdx++)
+		dst[weightNdx] += dst[weightNdx] > 32 ? 1 : 0; // values above 32 are bumped by one
+	// Initialize nonexistent weights to poison values
+	for (int weightNdx = numWeights; weightNdx < 64; weightNdx++)
+		dst[weightNdx] = ~0u;
+}
+void interpolateWeights (TexelWeightPair* dst, const deUint32 (&unquantizedWeights) [64], int blockWidth, int blockHeight, const ASTCBlockMode& blockMode)
+{	// Bilinearly interpolates the weight grid up to the block footprint: writes dst[texelY*blockWidth + texelX].w[0] (and .w[1] when dual-plane) for every texel.
+	const int		numWeightsPerTexel	= blockMode.isDualPlane ? 2 : 1;
+	const deUint32	scaleX				= (1024 + blockWidth/2) / (blockWidth-1); // NOTE(review): divides by blockWidth-1 (and blockHeight-1 below), so both dimensions must be greater than 1
+	const deUint32	scaleY				= (1024 + blockHeight/2) / (blockHeight-1);
+	DE_ASSERT(blockMode.weightGridWidth*blockMode.weightGridHeight*numWeightsPerTexel <= (int)DE_LENGTH_OF_ARRAY(unquantizedWeights));
+	for (int texelY = 0; texelY < blockHeight; texelY++)
+	{
+		for (int texelX = 0; texelX < blockWidth; texelX++)
+		{
+			const deUint32 gX	= (scaleX*texelX*(blockMode.weightGridWidth-1) + 32) >> 6; // texel position in grid space, in 1/16ths of a grid cell
+			const deUint32 gY	= (scaleY*texelY*(blockMode.weightGridHeight-1) + 32) >> 6;
+			const deUint32 jX	= gX >> 4; // integer grid cell
+			const deUint32 jY	= gY >> 4;
+			const deUint32 fX	= gX & 0xf; // fraction within the cell, 0..15
+			const deUint32 fY	= gY & 0xf;
+			const deUint32 w11	= (fX*fY + 8) >> 4; // the four bilinear factors below always sum to 16
+			const deUint32 w10	= fY - w11;
+			const deUint32 w01	= fX - w11;
+			const deUint32 w00	= 16 - fX - fY + w11;
+			const deUint32 i00	= jY*blockMode.weightGridWidth + jX; // grid indices of the four surrounding samples
+			const deUint32 i01	= i00 + 1;
+			const deUint32 i10	= i00 + blockMode.weightGridWidth;
+			const deUint32 i11	= i00 + blockMode.weightGridWidth + 1;
+			// These addresses can be out of bounds, but respective weights will be 0 then.
+			DE_ASSERT(deInBounds32(i00, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w00 == 0);
+			DE_ASSERT(deInBounds32(i01, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w01 == 0);
+			DE_ASSERT(deInBounds32(i10, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w10 == 0);
+			DE_ASSERT(deInBounds32(i11, 0, blockMode.weightGridWidth*blockMode.weightGridHeight) || w11 == 0);
+			for (int texelWeightNdx = 0; texelWeightNdx < numWeightsPerTexel; texelWeightNdx++)
+			{
+				// & 0x3f clamps address to bounds of unquantizedWeights
+				const deUint32 p00	= unquantizedWeights[(i00 * numWeightsPerTexel + texelWeightNdx) & 0x3f]; // with two weights per texel the planes are interleaved in the grid array
+				const deUint32 p01	= unquantizedWeights[(i01 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+				const deUint32 p10	= unquantizedWeights[(i10 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+				const deUint32 p11	= unquantizedWeights[(i11 * numWeightsPerTexel + texelWeightNdx) & 0x3f];
+				dst[texelY*blockWidth + texelX].w[texelWeightNdx] = (p00*w00 + p01*w01 + p10*w10 + p11*w11 + 8) >> 4;
+			}
+		}
+	}
+}
+// Decodes the weight grid (bit stream anchored at bit 127 of the block), unquantizes it and interpolates it to per-texel weights in 'dst'.
+void computeTexelWeights (TexelWeightPair* dst, const Block128& blockData, int blockWidth, int blockHeight, const ASTCBlockMode& blockMode)
+{
+	const int			numWeights = computeNumWeights(blockMode);
+	ISEDecodedResult	encodedGrid[64];
+	deUint32			gridWeights[64];
+	{
+		BitAccessStream stream(blockData, 127, computeNumRequiredBits(blockMode.weightISEParams, numWeights), false);
+		decodeISE(encodedGrid, numWeights, stream, blockMode.weightISEParams);
+	}
+	unquantizeWeights(gridWeights, encodedGrid, blockMode);
+	interpolateWeights(dst, gridWeights, blockWidth, blockHeight, blockMode);
+}
+// Integer mixing function built from a fixed sequence of shift/add/xor steps; the order of the steps is significant.
+inline deUint32 hash52 (deUint32 v)
+{
+	v ^= v >> 15;	v -= v << 17;	v += v << 7;	v += v << 4;
+	v ^= v >>  5;	v += v << 16;	v ^= v >> 7;	v ^= v >> 3;
+	v ^= v <<  6;	v ^= v >> 17;
+	return v;
+}
+int computeTexelPartition (deUint32 seedIn, deUint32 xIn, deUint32 yIn, deUint32 zIn, int numPartitions, bool smallBlock)
+{	// Returns the partition index (0..3) of the texel at (xIn, yIn), derived from a hash of the partition seed; zIn must be 0 (asserted).
+	DE_ASSERT(zIn == 0);
+	const deUint32	x		= smallBlock ? xIn << 1 : xIn; // coordinates are doubled for small blocks
+	const deUint32	y		= smallBlock ? yIn << 1 : yIn;
+	const deUint32	z		= smallBlock ? zIn << 1 : zIn;
+	const deUint32	seed	= seedIn + 1024*(numPartitions-1);
+	const deUint32	rnum	= hash52(seed);
+	deUint8			seed1	= (deUint8)( rnum							& 0xf); // twelve 4-bit fields cut from the hash
+	deUint8			seed2	= (deUint8)((rnum >>  4)					& 0xf);
+	deUint8			seed3	= (deUint8)((rnum >>  8)					& 0xf);
+	deUint8			seed4	= (deUint8)((rnum >> 12)					& 0xf);
+	deUint8			seed5	= (deUint8)((rnum >> 16)					& 0xf);
+	deUint8			seed6	= (deUint8)((rnum >> 20)					& 0xf);
+	deUint8			seed7	= (deUint8)((rnum >> 24)					& 0xf);
+	deUint8			seed8	= (deUint8)((rnum >> 28)					& 0xf);
+	deUint8			seed9	= (deUint8)((rnum >> 18)					& 0xf);
+	deUint8			seed10	= (deUint8)((rnum >> 22)					& 0xf);
+	deUint8			seed11	= (deUint8)((rnum >> 26)					& 0xf);
+	deUint8			seed12	= (deUint8)(((rnum >> 30) | (rnum << 2))	& 0xf);
+	seed1  = (deUint8)(seed1  * seed1 ); // each field is squared (at most 15*15 = 225, so it still fits in a deUint8)
+	seed2  = (deUint8)(seed2  * seed2 );
+	seed3  = (deUint8)(seed3  * seed3 );
+	seed4  = (deUint8)(seed4  * seed4 );
+	seed5  = (deUint8)(seed5  * seed5 );
+	seed6  = (deUint8)(seed6  * seed6 );
+	seed7  = (deUint8)(seed7  * seed7 );
+	seed8  = (deUint8)(seed8  * seed8 );
+	seed9  = (deUint8)(seed9  * seed9 );
+	seed10 = (deUint8)(seed10 * seed10);
+	seed11 = (deUint8)(seed11 * seed11);
+	seed12 = (deUint8)(seed12 * seed12);
+	const int shA = (seed & 2) != 0		? 4		: 5; // right-shift amounts applied to the squared fields, chosen from bits of the seed and the partition count
+	const int shB = numPartitions == 3	? 6		: 5;
+	const int sh1 = (seed & 1) != 0		? shA	: shB;
+	const int sh2 = (seed & 1) != 0		? shB	: shA;
+	const int sh3 = (seed & 0x10) != 0	? sh1	: sh2;
+	seed1  = (deUint8)(seed1  >> sh1);
+	seed2  = (deUint8)(seed2  >> sh2);
+	seed3  = (deUint8)(seed3  >> sh1);
+	seed4  = (deUint8)(seed4  >> sh2);
+	seed5  = (deUint8)(seed5  >> sh1);
+	seed6  = (deUint8)(seed6  >> sh2);
+	seed7  = (deUint8)(seed7  >> sh1);
+	seed8  = (deUint8)(seed8  >> sh2);
+	seed9  = (deUint8)(seed9  >> sh3);
+	seed10 = (deUint8)(seed10 >> sh3);
+	seed11 = (deUint8)(seed11 >> sh3);
+	seed12 = (deUint8)(seed12 >> sh3);
+	const int a =						0x3f & (seed1*x + seed2*y + seed11*z + (rnum >> 14)); // one 6-bit score per candidate partition
+	const int b =						0x3f & (seed3*x + seed4*y + seed12*z + (rnum >> 10));
+	const int c = numPartitions >= 3 ?	0x3f & (seed5*x + seed6*y + seed9*z  + (rnum >>  6))	: 0; // c and d are forced to 0 when the block has fewer partitions
+	const int d = numPartitions >= 4 ?	0x3f & (seed7*x + seed8*y + seed10*z + (rnum >>  2))	: 0;
+	return a >= b && a >= c && a >= d	? 0 // the highest score wins; ties go to the lowest index
+		 : b >= c && b >= d				? 1
+		 : c >= d						? 2
+		 :								  3;
+}
+DecompressResult setTexelColors (void* dst, ColorEndpointPair* colorEndpoints, TexelWeightPair* texelWeights, int ccs, deUint32 partitionIndexSeed,
+								 int numPartitions, int blockWidth, int blockHeight, bool isSRGB, bool isLDRMode, const deUint32* colorEndpointModes)
+{	// Blends each texel's two endpoints with its weight and stores four channels per texel in dst: deUint8 when isSRGB, float otherwise. Returns DECOMPRESS_RESULT_ERROR, before writing anything, if any partition uses an HDR endpoint mode.
+	const bool			smallBlock	= blockWidth*blockHeight < 31;
+	DecompressResult	result		= DECOMPRESS_RESULT_VALID_BLOCK;
+	bool				isHDREndpoint[4]; // only the first numPartitions entries are initialized
+	for (int i = 0; i < numPartitions; i++)
+	{
+		isHDREndpoint[i] = isColorEndpointModeHDR(colorEndpointModes[i]);
+		
+		// rg - REMOVING HDR SUPPORT FOR NOW
+		if (isHDREndpoint[i])
+			return DECOMPRESS_RESULT_ERROR;
+	}
+
+	for (int texelY = 0; texelY < blockHeight; texelY++) // NOTE(review): past this point every isHDREndpoint[] entry in use is false, so the HDR branches inside the loop cannot be taken
+	for (int texelX = 0; texelX < blockWidth; texelX++)
+	{
+		const int				texelNdx			= texelY*blockWidth + texelX;
+		const int				colorEndpointNdx	= numPartitions == 1 ? 0 : computeTexelPartition(partitionIndexSeed, texelX, texelY, 0, numPartitions, smallBlock);
+		DE_ASSERT(colorEndpointNdx < numPartitions);
+		const UVec4&			e0					= colorEndpoints[colorEndpointNdx].e0;
+		const UVec4&			e1					= colorEndpoints[colorEndpointNdx].e1;
+		const TexelWeightPair&	weight				= texelWeights[texelNdx];
+		if (isLDRMode && isHDREndpoint[colorEndpointNdx])
+		{ // writes a fixed (1, 0, 1, 1) color and marks the result as an error
+			if (isSRGB)
+			{
+				((deUint8*)dst)[texelNdx*4 + 0] = 0xff;
+				((deUint8*)dst)[texelNdx*4 + 1] = 0;
+				((deUint8*)dst)[texelNdx*4 + 2] = 0xff;
+				((deUint8*)dst)[texelNdx*4 + 3] = 0xff;
+			}
+			else
+			{
+				((float*)dst)[texelNdx*4 + 0] = 1.0f;
+				((float*)dst)[texelNdx*4 + 1] = 0;
+				((float*)dst)[texelNdx*4 + 2] = 1.0f;
+				((float*)dst)[texelNdx*4 + 3] = 1.0f;
+			}
+			result = DECOMPRESS_RESULT_ERROR;
+		}
+		else
+		{
+			for (int channelNdx = 0; channelNdx < 4; channelNdx++)
+			{
+				if (!isHDREndpoint[colorEndpointNdx] || (channelNdx == 3 && colorEndpointModes[colorEndpointNdx] == 14)) // \note Alpha for mode 14 is treated the same as LDR.
+				{
+					const deUint32 c0	= (e0[channelNdx] << 8) | (isSRGB ? 0x80 : e0[channelNdx]); // widen the 8-bit endpoint to 16 bits: low byte is 0x80 for sRGB, a copy of the value otherwise
+					const deUint32 c1	= (e1[channelNdx] << 8) | (isSRGB ? 0x80 : e1[channelNdx]);
+					const deUint32 w	= weight.w[ccs == channelNdx ? 1 : 0]; // channel 'ccs' uses the second weight; all others use the first
+					const deUint32 c	= (c0*(64-w) + c1*w + 32) / 64;
+					if (isSRGB)
+						((deUint8*)dst)[texelNdx*4 + channelNdx] = (deUint8)((c & 0xff00) >> 8);
+					else
+						((float*)dst)[texelNdx*4 + channelNdx] = c == 65535 ? 1.0f : (float)c / 65536.0f;
+				}
+				else
+				{
+					//DE_STATIC_ASSERT((basisu_astc::meta::TypesSame<deFloat16, deUint16>::Value));
+					// rg - REMOVING HDR SUPPORT FOR NOW
+#if 0
+					const deUint32		c0	= e0[channelNdx] << 4;
+					const deUint32		c1	= e1[channelNdx] << 4;
+					const deUint32		w	= weight.w[ccs == channelNdx ? 1 : 0];
+					const deUint32		c	= (c0*(64-w) + c1*w + 32) / 64;
+					const deUint32		e	= getBits(c, 11, 15);
+					const deUint32		m	= getBits(c, 0, 10);
+					const deUint32		mt	= m < 512		? 3*m
+											: m >= 1536		? 5*m - 2048
+											:				  4*m - 512;
+					const deFloat16		cf	= (deFloat16)((e << 10) + (mt >> 3));
+					((float*)dst)[texelNdx*4 + channelNdx] = deFloat16To32(isFloat16InfOrNan(cf) ? 0x7bff : cf);
+#endif
+				}
+			}
+		}
+	}
+	return result;
+}
+DecompressResult decompressBlock (void* dst, const Block128& blockData, int blockWidth, int blockHeight, bool isSRGB, bool isLDR)
+{	// Decodes one 128-bit block into dst. On each validation failure in this function, dst is filled via setASTCErrorColorBlock and DECOMPRESS_RESULT_ERROR is returned; void-extent blocks are delegated to decodeVoidExtentBlock.
+	DE_ASSERT(isLDR || !isSRGB);
+	// Decode block mode.
+	const ASTCBlockMode blockMode = getASTCBlockMode(blockData.getBits(0, 10));
+	// Check for block mode errors.
+	if (blockMode.isError)
+	{
+		setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+		return DECOMPRESS_RESULT_ERROR;
+	}
+	// Separate path for void-extent.
+	if (blockMode.isVoidExtent)
+		return decodeVoidExtentBlock(dst, blockData, blockWidth, blockHeight, isSRGB, isLDR);
+	// Compute weight grid values.
+	const int numWeights			= computeNumWeights(blockMode);
+	const int numWeightDataBits		= computeNumRequiredBits(blockMode.weightISEParams, numWeights);
+	const int numPartitions			= (int)blockData.getBits(11, 12) + 1; // two-bit field plus one: 1..4
+	// Check for errors in weight grid, partition and dual-plane parameters.
+	if (numWeights > 64								||
+		numWeightDataBits > 96						||
+		numWeightDataBits < 24						||
+		blockMode.weightGridWidth > blockWidth		||
+		blockMode.weightGridHeight > blockHeight	||
+		(numPartitions == 4 && blockMode.isDualPlane))
+	{
+		setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+		return DECOMPRESS_RESULT_ERROR;
+	}
+	// Compute number of bits available for color endpoint data.
+	const bool	isSingleUniqueCem			= numPartitions == 1 || blockData.getBits(23, 24) == 0;
+	const int	numConfigDataBits			= (numPartitions == 1 ? 17 : isSingleUniqueCem ? 29 : 25 + 3*numPartitions) +
+											  (blockMode.isDualPlane ? 2 : 0);
+	const int	numBitsForColorEndpoints	= 128 - numWeightDataBits - numConfigDataBits;
+	const int	extraCemBitsStart			= 127 - numWeightDataBits - (isSingleUniqueCem		? -1
+																		: numPartitions == 4	? 7
+																		: numPartitions == 3	? 4
+																		: numPartitions == 2	? 1
+																		: 0);
+	// Decode color endpoint modes.
+	deUint32 colorEndpointModes[4];
+	decodeColorEndpointModes(&colorEndpointModes[0], blockData, numPartitions, extraCemBitsStart);
+	const int numColorEndpointValues = computeNumColorEndpointValues(colorEndpointModes, numPartitions);
+	// Check for errors in color endpoint value count.
+	if (numColorEndpointValues > 18 || numBitsForColorEndpoints < (int)deDivRoundUp32(13*numColorEndpointValues, 5)) // 18 is also the size of the scratch arrays in computeColorEndpoints
+	{
+		setASTCErrorColorBlock(dst, blockWidth, blockHeight, isSRGB);
+		return DECOMPRESS_RESULT_ERROR;
+	}
+	// Compute color endpoints.
+	ColorEndpointPair colorEndpoints[4];
+	computeColorEndpoints(&colorEndpoints[0], blockData, &colorEndpointModes[0], numPartitions, numColorEndpointValues,
+						  computeMaximumRangeISEParams(numBitsForColorEndpoints, numColorEndpointValues), numBitsForColorEndpoints);
+	// Compute texel weights.
+	TexelWeightPair texelWeights[MAX_BLOCK_WIDTH*MAX_BLOCK_HEIGHT]; // NOTE(review): sized for the maximum footprint; blockWidth/blockHeight are not range-checked in this function
+	computeTexelWeights(&texelWeights[0], blockData, blockWidth, blockHeight, blockMode);
+	// Set texel colors.
+	const int		ccs						= blockMode.isDualPlane ? (int)blockData.getBits(extraCemBitsStart-2, extraCemBitsStart-1) : -1; // channel that uses the second weight plane; -1 matches no channel
+	const deUint32	partitionIndexSeed		= numPartitions > 1 ? blockData.getBits(13, 22) : (deUint32)-1;
+	return setTexelColors(dst, &colorEndpoints[0], &texelWeights[0], ccs, partitionIndexSeed, numPartitions, blockWidth, blockHeight, isSRGB, isLDR, &colorEndpointModes[0]);
+}
+
+} // anonymous
+
+// Unpacks one 128-bit LDR ASTC block from 'data' into blockWidth*blockHeight RGBA8 texels at pDst (4 bytes per texel, rows of blockWidth texels).
+// Returns false if the footprint is outside 1..MAX_BLOCK_WIDTH x 1..MAX_BLOCK_HEIGHT or if the block does not decode as a valid block.
+bool decompress(uint8_t *pDst, const uint8_t * data, bool isSRGB, int blockWidth, int blockHeight)
+{
+	// rg - We only support LDR here, although adding back in HDR would be easy.
+	const bool isLDR = true;
+	DE_ASSERT(isLDR || !isSRGB);
+
+	// The scratch buffers here and in decompressBlock() are sized for the maximum footprint; refuse anything larger (or empty) instead of overrunning them.
+	if (blockWidth < 1 || blockHeight < 1 || blockWidth > MAX_BLOCK_WIDTH || blockHeight > MAX_BLOCK_HEIGHT)
+		return false;
+
+	float linear[MAX_BLOCK_WIDTH * MAX_BLOCK_HEIGHT * 4];
+
+	const Block128 blockData(data);
+	// sRGB output is written straight to pDst as bytes; linear output goes through the float scratch buffer first.
+	if (decompressBlock(isSRGB ? (void*)pDst : (void*)& linear[0],
+		blockData, blockWidth, blockHeight, isSRGB, isLDR) != DECOMPRESS_RESULT_VALID_BLOCK)
+		return false;
+
+	if (!isSRGB)
+	{
+		// Convert every float component to 8 bits: scale to 16 bits, round, clamp, and keep the top byte.
+		const int numComponents = blockWidth * blockHeight * 4;
+		for (int i = 0; i < numComponents; i++)
+			pDst[i] = (uint8_t)(basisu_astc::clamp<int>((int)(linear[i] * 65536.0f + .5f), 0, 65535) >> 8);
+	}
+
+	return true;
+}
+
+} // astc
+} // basisu_astc
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
diff --git a/thirdparty/basis_universal/encoder/basisu_astc_decomp.h b/thirdparty/basis_universal/encoder/basisu_astc_decomp.h
new file mode 100644
index 0000000000..9ec2e46076
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_astc_decomp.h
@@ -0,0 +1,43 @@
+#ifndef _TCUASTCUTIL_HPP
+#define _TCUASTCUTIL_HPP
+/*-------------------------------------------------------------------------
+ * drawElements Quality Program Tester Core
+ * ----------------------------------------
+ *
+ * Copyright 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *//*!
+ * \file
+ * \brief ASTC Utilities.
+ *//*--------------------------------------------------------------------*/
+
+#include "../transcoder/basisu.h" // to pick up the iterator debug level madness
+#include <vector>
+#include <stdint.h>
+
+namespace basisu_astc
+{
+namespace astc
+{
+
+// Unpacks a single LDR ASTC block from 'data' to pDst as blockWidth*blockHeight RGBA texels (4 bytes per texel, rows of blockWidth texels). Returns false if the block fails to decode.
+// If isSRGB is true, the spec requires the decoder to scale the LDR 8-bit endpoints to 16-bit before interpolation slightly differently,
+// which will lead to different outputs. So be sure to set it correctly (ideally it should match whatever the encoder did).
+bool decompress(uint8_t* pDst, const uint8_t* data, bool isSRGB, int blockWidth, int blockHeight);
+
+} // astc
+} // basisu_astc
+
+#endif
diff --git a/thirdparty/basis_universal/encoder/basisu_backend.cpp b/thirdparty/basis_universal/encoder/basisu_backend.cpp
new file mode 100644
index 0000000000..19911fcbb4
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_backend.cpp
@@ -0,0 +1,1805 @@
+// basisu_backend.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// TODO: This code originally supported full ETC1 and ETC1S, so there's some legacy stuff in here.
+//
+#include "basisu_backend.h"
+
+#if BASISU_SUPPORT_SSE
+#define CPPSPMD_NAME(a) a##_sse41
+#include "basisu_kernels_declares.h"
+#endif
+
+#define BASISU_FASTER_SELECTOR_REORDERING 0
+#define BASISU_BACKEND_VERIFY(c) verify(c, __LINE__);
+
+namespace basisu
+{
+	// TODO
+	static inline void verify(bool condition, int line) // Prints the failing line number to stderr and aborts when condition is false.
+	{
+		// Nothing to do when the check holds.
+		if (condition)
+			return;
+		fprintf(stderr, "ERROR: basisu_backend: verify() failed at line %i!\n", line);
+		abort();
+	}
+
+	basisu_backend::basisu_backend() // Starts out in the state established by clear().
+	{
+		this->clear();
+	}
+
+	void basisu_backend::clear() // Forgets the front end and calls clear() on m_params and m_output.
+	{
+		m_output.clear();
+		m_params.clear();
+		m_pFront_end = nullptr;
+	}
+
+	void basisu_backend::init(basisu_frontend* pFront_end, basisu_backend_params& params, const basisu_backend_slice_desc_vec& slice_descs, const basist::etc1_global_selector_codebook* pGlobal_sel_codebook)
+	{
+		// Remember the front end and global selector codebook pointers, and assign params and slice_descs into the members.
+		m_pGlobal_sel_codebook = pGlobal_sel_codebook;
+		m_slices = slice_descs;
+		m_params = params;
+		m_pFront_end = pFront_end;
+
+		debug_printf("basisu_backend::Init: Slices: %u, ETC1S: %u, EndpointRDOQualityThresh: %f, SelectorRDOQualityThresh: %f, UseGlobalSelCodebook: %u, GlobalSelCodebookPalBits: %u, GlobalSelCodebookModBits: %u, Use hybrid selector codebooks: %u\n",
+			m_slices.size(), params.m_etc1s,
+			params.m_endpoint_rdo_quality_thresh, params.m_selector_rdo_quality_thresh,
+			params.m_use_global_sel_codebook,
+			params.m_global_sel_codebook_pal_bits, params.m_global_sel_codebook_mod_bits,
+			params.m_use_hybrid_sel_codebooks);
+
+		const auto total_endpoints = m_pFront_end->get_total_endpoint_clusters();
+		const auto total_selectors = m_pFront_end->get_total_selector_clusters();
+		debug_printf("Frontend endpoints: %u selectors: %u\n", total_endpoints, total_selectors);
+
+		// Log the dimensions and first block index of every slice.
+		const auto total_slices = m_slices.size();
+		for (uint32_t slice_index = 0; slice_index < total_slices; slice_index++)
+		{
+			const auto& desc = m_slices[slice_index];
+			debug_printf("Slice: %u, OrigWidth: %u, OrigHeight: %u, Width: %u, Height: %u, NumBlocksX: %u, NumBlocksY: %u, FirstBlockIndex: %u\n",
+				slice_index, desc.m_orig_width, desc.m_orig_height, desc.m_width, desc.m_height,
+				desc.m_num_blocks_x, desc.m_num_blocks_y, desc.m_first_block_index);
+		}
+	}
+
+	void basisu_backend::create_endpoint_palette() // Rebuilds m_endpoint_palette (and m_output.m_num_endpoints) from the front end's endpoint clusters.
+	{
+		const basisu_frontend& frontend = *m_pFront_end;
+		const auto total_endpoints = frontend.get_total_endpoint_clusters();
+
+		m_output.m_num_endpoints = total_endpoints;
+		m_endpoint_palette.resize(total_endpoints);
+
+		for (uint32_t cluster_index = 0; cluster_index < total_endpoints; cluster_index++)
+		{
+			etc1_endpoint_palette_entry& entry = m_endpoint_palette[cluster_index];
+			entry.m_color5_valid = frontend.get_endpoint_cluster_color_is_used(cluster_index, false);
+			entry.m_color5 = frontend.get_endpoint_cluster_unscaled_color(cluster_index, false);
+			entry.m_inten5 = frontend.get_endpoint_cluster_inten_table(cluster_index, false);
+
+			// A cluster whose color is reported as unused aborts the process here.
+			BASISU_BACKEND_VERIFY(entry.m_color5_valid);
+		}
+	}
+
+	void basisu_backend::create_selector_palette()
+	{
+		// Builds m_selector_palette from the front end's selector clusters; when the global selector
+		// codebook is enabled it also fills m_global_selector_palette_desc.
+		const basisu_frontend& frontend = *m_pFront_end;
+		const auto total_selectors = frontend.get_total_selector_clusters();
+
+		m_output.m_num_selectors = total_selectors;
+		m_selector_palette.resize(total_selectors);
+
+		if (!m_params.m_use_global_sel_codebook)
+		{
+			// No global codebook: copy each cluster's selectors straight out of its ETC block.
+			for (uint32_t cluster_index = 0; cluster_index < total_selectors; cluster_index++)
+			{
+				basist::etc1_selector_palette_entry& dst_entry = m_selector_palette[cluster_index];
+				const etc_block& src_block = frontend.get_selector_cluster_selector_bits(cluster_index);
+
+				for (uint32_t y = 0; y < 4; y++)
+					for (uint32_t x = 0; x < 4; x++)
+						dst_entry[y * 4 + x] = static_cast<uint8_t>(src_block.get_selector(x, y));
+			}
+
+			return;
+		}
+
+		// Global codebook path: one descriptor per selector cluster.
+		m_global_selector_palette_desc.resize(total_selectors);
+
+		for (int cluster_index = 0; cluster_index < static_cast<int>(total_selectors); cluster_index++)
+		{
+			basist::etc1_selector_palette_entry& dst_entry = m_selector_palette[cluster_index];
+			etc1_global_selector_cb_entry_desc& desc = m_global_selector_palette_desc[cluster_index];
+
+			desc.m_pal_index = frontend.get_selector_cluster_global_selector_entry_ids()[cluster_index].m_palette_index;
+			desc.m_mod_index = frontend.get_selector_cluster_global_selector_entry_ids()[cluster_index].m_modifier.get_index();
+
+			// With hybrid codebooks the front end reports per cluster whether the global codebook is used.
+			if (m_params.m_use_hybrid_sel_codebooks)
+				desc.m_was_used = frontend.get_selector_cluster_uses_global_cb_vec()[cluster_index];
+			else
+				desc.m_was_used = true;
+
+			const etc_block& src_block = frontend.get_selector_cluster_selector_bits(cluster_index);
+
+			if (!desc.m_was_used)
+			{
+				// This cluster keeps its own selectors.
+				for (uint32_t y = 0; y < 4; y++)
+					for (uint32_t x = 0; x < 4; x++)
+						dst_entry[y * 4 + x] = static_cast<uint8_t>(src_block.get_selector(x, y));
+
+				continue;
+			}
+
+			// Copy the global codebook entry; the assert checks it against the cluster's own selectors.
+			basist::etc1_selector_palette_entry global_entry(m_pGlobal_sel_codebook->get_entry(frontend.get_selector_cluster_global_selector_entry_ids()[cluster_index]));
+
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					dst_entry(x, y) = global_entry(x, y);
+
+					assert(src_block.get_selector(x, y) == global_entry(x, y));
+				}
+			}
+		}
+	}
+
+	static const struct // Neighbor offsets, in blocks, added to (block_x, block_y) for each spatial endpoint predictor.
+	{
+		int8_t m_dx;
+		int8_t m_dy;
+	} g_endpoint_preds[] = {
+		{ -1, 0 },  // same row, previous column
+		{ 0, -1 },  // previous row, same column
+		{ -1, -1 }  // previous row, previous column
+	};
+
+	void basisu_backend::reoptimize_and_sort_endpoints_codebook(uint32_t total_block_endpoints_remapped, uint_vec& all_endpoint_indices)
+	{
+		basisu_frontend& r = *m_pFront_end;
+		// Builds the endpoint remap tables (old->new and new->old) and the old/new "was used" flags. When blocks were remapped and m_compression_level > 1, the front end's endpoint codebook is re-optimized first.
+
+		if (m_params.m_used_global_codebooks) // keep the existing order: identity old->new mapping
+		{
+			m_endpoint_remap_table_old_to_new.clear();
+			m_endpoint_remap_table_old_to_new.resize(r.get_total_endpoint_clusters());
+			for (uint32_t i = 0; i < r.get_total_endpoint_clusters(); i++)
+				m_endpoint_remap_table_old_to_new[i] = i;
+		}
+		else
+		{
+			//if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 0))
+			if ((total_block_endpoints_remapped) && (m_params.m_compression_level > 1))
+			{
+				// We've changed the block endpoint indices, so we need to go and adjust the endpoint codebook (remove unused entries, optimize existing entries that have changed)
+				uint_vec new_block_endpoints(get_total_blocks());
+				// Gather every block's current endpoint index, addressed by first_block_index + x + y * num_blocks_x.
+				for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+				{
+					const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+					const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+					const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+
+					for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+						for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+							new_block_endpoints[first_block_index + block_x + block_y * num_blocks_x] = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index;
+				}
+
+				int_vec old_to_new_endpoint_indices;
+				r.reoptimize_remapped_endpoints(new_block_endpoints, old_to_new_endpoint_indices, true);
+				// The front end's clusters may have changed: rebuild the palette, then translate every stored index through old_to_new_endpoint_indices.
+				create_endpoint_palette();
+
+				for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+				{
+					//const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+
+					//const uint32_t width = m_slices[slice_index].m_width;
+					//const uint32_t height = m_slices[slice_index].m_height;
+					const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+					const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+
+					for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+					{
+						for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+						{
+							//const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
+
+							encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+
+							m.m_endpoint_index = old_to_new_endpoint_indices[m.m_endpoint_index];
+						} // block_x
+					} // block_y
+				} // slice_index
+
+				for (uint32_t i = 0; i < all_endpoint_indices.size(); i++)
+					all_endpoint_indices[i] = old_to_new_endpoint_indices[all_endpoint_indices[i]];
+
+			} //if (total_block_endpoints_remapped)
+
+			// Sort endpoint codebook: palette_index_reorderer derives the old->new remap table from all_endpoint_indices.
+			palette_index_reorderer reorderer;
+			reorderer.init((uint32_t)all_endpoint_indices.size(), &all_endpoint_indices[0], r.get_total_endpoint_clusters(), nullptr, nullptr, 0);
+			m_endpoint_remap_table_old_to_new = reorderer.get_remap_table();
+		}
+
+		// For endpoints, old_to_new[] may not be bijective!
+		// Some "old" entries may be unused and don't get remapped into the "new" array.
+
+		m_old_endpoint_was_used.clear();
+		m_old_endpoint_was_used.resize(r.get_total_endpoint_clusters());
+		uint32_t first_old_entry_index = UINT32_MAX;
+		// Flag every endpoint that some block still references and remember the lowest such index.
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x, num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
+				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+				{
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					const uint32_t old_endpoint_index = m.m_endpoint_index;
+
+					m_old_endpoint_was_used[old_endpoint_index] = true;
+					first_old_entry_index = basisu::minimum(first_old_entry_index, old_endpoint_index);
+				} // block_x
+			} // block_y
+		} // slice_index
+
+		debug_printf("basisu_backend::reoptimize_and_sort_endpoints_codebook: First old entry index: %u\n", first_old_entry_index);
+
+		m_new_endpoint_was_used.clear();
+		m_new_endpoint_was_used.resize(r.get_total_endpoint_clusters());
+
+		m_endpoint_remap_table_new_to_old.clear();
+		m_endpoint_remap_table_new_to_old.resize(r.get_total_endpoint_clusters());
+
+		// Set unused entries in the new array to point to the first used entry in the old array.
+		m_endpoint_remap_table_new_to_old.set_all(first_old_entry_index);
+		// Invert old_to_new for the entries that are actually in use.
+		for (uint32_t old_index = 0; old_index < m_endpoint_remap_table_old_to_new.size(); old_index++)
+		{
+			if (m_old_endpoint_was_used[old_index])
+			{
+				const uint32_t new_index = m_endpoint_remap_table_old_to_new[old_index];
+
+				m_new_endpoint_was_used[new_index] = true;
+
+				m_endpoint_remap_table_new_to_old[new_index] = old_index;
+			}
+		}
+	}
+
+	void basisu_backend::sort_selector_codebook() // Builds m_selector_remap_table_new_to_old and its inverse m_selector_remap_table_old_to_new.
+	{
+		basisu_frontend& r = *m_pFront_end;
+
+		m_selector_remap_table_new_to_old.resize(r.get_total_selector_clusters());
+		// Identity order at compression level 0 or with global codebooks. An empty codebook also goes this way: the greedy branch writes entry 0 and reserves (count - 1), which is out of bounds / wraps around when count is 0.
+		if ((m_params.m_compression_level == 0) || (m_params.m_used_global_codebooks) || (r.get_total_selector_clusters() == 0))
+		{
+			for (uint32_t i = 0; i < r.get_total_selector_clusters(); i++)
+				m_selector_remap_table_new_to_old[i] = i;
+		}
+		else
+		{
+			m_selector_remap_table_new_to_old[0] = 0;
+			uint32_t prev_selector_index = 0;
+			// Selector 0 stays first; every other selector is still unplaced.
+			int_vec remaining_selectors;
+			remaining_selectors.reserve(r.get_total_selector_clusters() - 1);
+			for (uint32_t i = 1; i < r.get_total_selector_clusters(); i++)
+				remaining_selectors.push_back(i);
+			// Pack bytes 0..3 of each palette entry into one 32-bit word for the distance computation.
+			uint_vec selector_palette_bytes(m_selector_palette.size());
+			for (uint32_t i = 0; i < m_selector_palette.size(); i++)
+				selector_palette_bytes[i] = m_selector_palette[i].get_byte(0) | (m_selector_palette[i].get_byte(1) << 8) | (m_selector_palette[i].get_byte(2) << 16) | (m_selector_palette[i].get_byte(3) << 24);
+
+			// This is the traveling salesman problem; approximate it greedily by always appending the remaining selector closest to the previously placed one.
+			for (uint32_t i = 1; i < r.get_total_selector_clusters(); i++)
+			{
+				uint32_t best_hamming_dist = 100;
+				uint32_t best_index = 0;
+
+#if BASISU_FASTER_SELECTOR_REORDERING
+				const uint32_t step = (remaining_selectors.size() > 16) ? 16 : 1;
+				for (uint32_t j = 0; j < remaining_selectors.size(); j += step)
+#else
+				for (uint32_t j = 0; j < remaining_selectors.size(); j++)
+#endif
+				{
+					int selector_index = remaining_selectors[j];
+					// Distance = sum of g_hamming_dist[] over the four bytes of the XOR (presumably a per-byte bit-count table; it is defined elsewhere).
+					uint32_t k = selector_palette_bytes[prev_selector_index] ^ selector_palette_bytes[selector_index];
+					uint32_t hamming_dist = g_hamming_dist[k & 0xFF] + g_hamming_dist[(k >> 8) & 0xFF] + g_hamming_dist[(k >> 16) & 0xFF] + g_hamming_dist[k >> 24];
+
+					if (hamming_dist < best_hamming_dist)
+					{
+						best_hamming_dist = hamming_dist;
+						best_index = j;
+						if (best_hamming_dist <= 1) // close enough, stop scanning
+							break;
+					}
+				}
+
+				prev_selector_index = remaining_selectors[best_index];
+				m_selector_remap_table_new_to_old[i] = prev_selector_index;
+				// Drop the chosen selector by moving the last remaining one into its slot.
+				remaining_selectors[best_index] = remaining_selectors.back();
+				remaining_selectors.resize(remaining_selectors.size() - 1);
+			}
+		}
+
+		m_selector_remap_table_old_to_new.resize(r.get_total_selector_clusters());
+		for (uint32_t i = 0; i < m_selector_remap_table_new_to_old.size(); i++) // invert new_to_old
+			m_selector_remap_table_old_to_new[m_selector_remap_table_new_to_old[i]] = i;
+	}
+	int basisu_backend::find_video_frame(int slice_index, int delta) // Returns the first slice that matches slice_index but whose source file index is offset by delta, or -1 if there is none.
+	{
+		for (uint32_t s = 0; s < m_slices.size(); s++)
+		{
+			const auto& candidate = m_slices[s];
+			const auto& ref = m_slices[slice_index];
+
+			// The candidate must be the same mip level, delta source files away from the reference slice.
+			const bool same_frame_and_mip = ((int)candidate.m_source_file_index == ((int)ref.m_source_file_index + delta)) &&
+				(candidate.m_mip_index == ref.m_mip_index);
+
+			// Being super paranoid here: the block dimensions and the alpha flag have to agree too.
+			const bool same_layout = (candidate.m_num_blocks_x == ref.m_num_blocks_x) &&
+				(candidate.m_num_blocks_y == ref.m_num_blocks_y) && (candidate.m_alpha == ref.m_alpha);
+
+			if (same_frame_and_mip && same_layout)
+				return s;
+		}
+
+		return -1;
+	}
+
+	void basisu_backend::check_for_valid_cr_blocks() // Video only: checks every use of the CR endpoint predictor and aborts (via BASISU_BACKEND_VERIFY) if any is inconsistent.
+	{
+		basisu_frontend& r = *m_pFront_end;
+		const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
+
+		if (!is_video)
+			return;
+
+		uint32_t total_crs = 0;
+		uint32_t total_invalid_crs = 0;
+
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			const bool is_iframe = m_slices[slice_index].m_iframe;
+			//const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+
+			//const uint32_t width = m_slices[slice_index].m_width;
+			//const uint32_t height = m_slices[slice_index].m_height;
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+			const int prev_frame_slice_index = find_video_frame(slice_index, -1);
+
+			// If we don't have a previous frame, and we're not an i-frame, something is wrong.
+			if ((prev_frame_slice_index < 0) && (!is_iframe))
+			{
+				BASISU_BACKEND_VERIFY(0);
+			}
+
+			if ((is_iframe) || (prev_frame_slice_index < 0))
+			{
+				// Ensure no blocks use CR's
+				for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+				{
+					for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+					{
+						encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+						BASISU_BACKEND_VERIFY(m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX);
+					}
+				}
+			}
+			else
+			{
+				// For blocks that use CR's, make sure the endpoints/selectors haven't really changed.
+				for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+				{
+					for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+					{
+						encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+
+						if (m.m_endpoint_predictor == basist::CR_ENDPOINT_PRED_INDEX)
+						{
+							total_crs++;
+							// Compare against the block at the same position in the previous frame's slice.
+							encoder_block& prev_m = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y);
+
+							if ((m.m_endpoint_index != prev_m.m_endpoint_index) || (m.m_selector_index != prev_m.m_selector_index))
+							{
+								total_invalid_crs++;
+							}
+						}
+					} // block_x
+				} // block_y
+
+			} // not an i-frame and a previous frame exists
+
+		} // slice_index
+
+		debug_printf("Total CR's: %u, Total invalid CR's: %u\n", total_crs, total_invalid_crs);
+
+		BASISU_BACKEND_VERIFY(total_invalid_crs == 0);
+	}
+
+	void basisu_backend::create_encoder_blocks() // Fills m_slice_encoder_blocks (endpoint index, selector index and endpoint predictor per block), then builds the endpoint/selector remap tables.
+	{
+		basisu_frontend& r = *m_pFront_end;
+		const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
+
+		m_slice_encoder_blocks.resize(m_slices.size());
+
+		uint32_t total_endpoint_pred_missed = 0, total_endpoint_pred_hits = 0, total_block_endpoints_remapped = 0;
+		// Endpoint indices of the blocks that end up without a predictor, in visiting order.
+		uint_vec all_endpoint_indices;
+		all_endpoint_indices.reserve(get_total_blocks());
+
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1;
+			const bool is_iframe = m_slices[slice_index].m_iframe;
+			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+
+			//const uint32_t width = m_slices[slice_index].m_width;
+			//const uint32_t height = m_slices[slice_index].m_height;
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+
+			m_slice_encoder_blocks[slice_index].resize(num_blocks_x, num_blocks_y);
+
+			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
+				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+				{
+					const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
+
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					// Take subblock 0's endpoint cluster; both subblocks are required to use the same one.
+					m.m_endpoint_index = r.get_subblock_endpoint_cluster_index(block_index, 0);
+					BASISU_BACKEND_VERIFY(r.get_subblock_endpoint_cluster_index(block_index, 0) == r.get_subblock_endpoint_cluster_index(block_index, 1));
+
+					m.m_selector_index = r.get_block_selector_cluster_index(block_index);
+
+					m.m_endpoint_predictor = basist::NO_ENDPOINT_PRED_INDEX;
+
+					const uint32_t block_endpoint = m.m_endpoint_index;
+					// First look for a predictor that reproduces this block's endpoint exactly; UINT32_MAX means none was found.
+					uint32_t best_endpoint_pred = UINT32_MAX;
+
+					for (uint32_t endpoint_pred = 0; endpoint_pred < basist::NUM_ENDPOINT_PREDS; endpoint_pred++)
+					{
+						if ((is_video) && (endpoint_pred == basist::CR_ENDPOINT_PRED_INDEX)) // CR: same position in the previous frame's slice, endpoint and selector must both match
+						{
+							if ((prev_frame_slice_index != -1) && (!is_iframe))
+							{
+								const uint32_t cur_endpoint = m_slice_encoder_blocks[slice_index](block_x, block_y).m_endpoint_index;
+								const uint32_t cur_selector = m_slice_encoder_blocks[slice_index](block_x, block_y).m_selector_index;
+								const uint32_t prev_endpoint = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_endpoint_index;
+								const uint32_t prev_selector = m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_selector_index;
+								if ((cur_endpoint == prev_endpoint) && (cur_selector == prev_selector))
+								{
+									best_endpoint_pred = basist::CR_ENDPOINT_PRED_INDEX;
+									m_slice_encoder_blocks[prev_frame_slice_index](block_x, block_y).m_is_cr_target = true;
+								}
+							}
+						}
+						else // spatial predictor: neighbor at (m_dx, m_dy), skipped when it falls outside the slice
+						{
+							int pred_block_x = block_x + g_endpoint_preds[endpoint_pred].m_dx;
+							if ((pred_block_x < 0) || (pred_block_x >= (int)num_blocks_x))
+								continue;
+
+							int pred_block_y = block_y + g_endpoint_preds[endpoint_pred].m_dy;
+							if ((pred_block_y < 0) || (pred_block_y >= (int)num_blocks_y))
+								continue;
+
+							uint32_t pred_endpoint = m_slice_encoder_blocks[slice_index](pred_block_x, pred_block_y).m_endpoint_index;
+
+							if (pred_endpoint == block_endpoint)
+							{
+								if (endpoint_pred < best_endpoint_pred)
+								{
+									best_endpoint_pred = endpoint_pred;
+								}
+							}
+						}
+
+					} // endpoint_pred
+
+					if (best_endpoint_pred != UINT32_MAX)
+					{
+						m.m_endpoint_predictor = best_endpoint_pred;
+
+						total_endpoint_pred_hits++;
+					}
+					else if (m_params.m_endpoint_rdo_quality_thresh > 0.0f) // no exact match: try adopting a neighbor's endpoint within the error budget
+					{
+						const pixel_block& src_pixels = r.get_source_pixel_block(block_index);
+
+						etc_block etc_blk(r.get_output_block(block_index));
+
+						uint64_t cur_err = etc_blk.evaluate_etc1_error(src_pixels.get_ptr(), r.get_params().m_perceptual);
+
+						if (cur_err) // NOTE(review): when cur_err == 0 neither the hit nor the missed counter is incremented - confirm that is intended
+						{
+							const uint64_t thresh_err = (uint64_t)(cur_err * maximum(1.0f, m_params.m_endpoint_rdo_quality_thresh));
+
+							etc_block trial_etc_block(etc_blk);
+
+							uint64_t best_err = UINT64_MAX;
+							uint32_t best_endpoint_index = 0;
+
+							best_endpoint_pred = UINT32_MAX;
+
+							for (uint32_t endpoint_pred = 0; endpoint_pred < basist::NUM_ENDPOINT_PREDS; endpoint_pred++)
+							{
+								if ((is_video) && (endpoint_pred == basist::CR_ENDPOINT_PRED_INDEX))
+									continue;
+								int pred_block_x = block_x + g_endpoint_preds[endpoint_pred].m_dx;
+								if ((pred_block_x < 0) || (pred_block_x >= (int)num_blocks_x))
+									continue;
+
+								int pred_block_y = block_y + g_endpoint_preds[endpoint_pred].m_dy;
+								if ((pred_block_y < 0) || (pred_block_y >= (int)num_blocks_y))
+									continue;
+
+								uint32_t pred_endpoint_index = m_slice_encoder_blocks[slice_index](pred_block_x, pred_block_y).m_endpoint_index;
+
+								uint32_t pred_inten = r.get_endpoint_cluster_inten_table(pred_endpoint_index, false);
+								color_rgba pred_color = r.get_endpoint_cluster_unscaled_color(pred_endpoint_index, false);
+								// Re-evaluate this block using the neighbor's color and intensity table.
+								trial_etc_block.set_block_color5(pred_color, pred_color);
+								trial_etc_block.set_inten_table(0, pred_inten);
+								trial_etc_block.set_inten_table(1, pred_inten);
+
+								color_rgba trial_colors[16];
+								unpack_etc1(trial_etc_block, trial_colors);
+								// Accumulate the error over the 16 pixels, bailing out as soon as the budget is exceeded.
+								uint64_t trial_err = 0;
+								for (uint32_t p = 0; p < 16; p++)
+								{
+									trial_err += color_distance(r.get_params().m_perceptual, src_pixels.get_ptr()[p], trial_colors[p], false);
+									if (trial_err > thresh_err)
+										break;
+								}
+
+								if (trial_err <= thresh_err)
+								{
+									if ((trial_err < best_err) || ((trial_err == best_err) && (endpoint_pred < best_endpoint_pred)))
+									{
+										best_endpoint_pred = endpoint_pred;
+										best_err = trial_err;
+										best_endpoint_index = pred_endpoint_index;
+									}
+								}
+							} // endpoint_pred
+
+							if (best_endpoint_pred != UINT32_MAX)
+							{
+								m.m_endpoint_index = best_endpoint_index;
+								m.m_endpoint_predictor = best_endpoint_pred;
+
+								total_endpoint_pred_hits++;
+								total_block_endpoints_remapped++;
+							}
+							else
+							{
+								total_endpoint_pred_missed++;
+							}
+						}
+					}
+					else
+					{
+						total_endpoint_pred_missed++;
+					}
+					// Collect the endpoint index of every block that ended up without a predictor.
+					if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX)
+					{
+						all_endpoint_indices.push_back(m.m_endpoint_index);
+					}
+
+				} // block_x
+
+			} // block_y
+
+		} // slice
+
+		debug_printf("total_endpoint_pred_missed: %u (%3.2f%%) total_endpoint_pred_hit: %u (%3.2f%%), total_block_endpoints_remapped: %u (%3.2f%%)\n",
+			total_endpoint_pred_missed, total_endpoint_pred_missed * 100.0f / get_total_blocks(),
+			total_endpoint_pred_hits, total_endpoint_pred_hits * 100.0f / get_total_blocks(),
+			total_block_endpoints_remapped, total_block_endpoints_remapped * 100.0f / get_total_blocks());
+
+		reoptimize_and_sort_endpoints_codebook(total_block_endpoints_remapped, all_endpoint_indices);
+
+		sort_selector_codebook();
+		check_for_valid_cr_blocks();
+	}
+
+	void basisu_backend::compute_slice_crcs()
+	{
+		// Packs every slice into an ETC1 gpu_image from the endpoint/selector palettes and stores the CRC-16 of the
+		// packed data in m_output.m_slice_image_crcs. With m_debug_images set, the unpacked image is also saved as a PNG.
+
+		const auto total_slices = m_slices.size();
+		for (uint32_t slice_index = 0; slice_index < total_slices; slice_index++)
+		{
+			const auto& slice = m_slices[slice_index];
+			const uint32_t width = slice.m_width;
+			const uint32_t height = slice.m_height;
+			const uint32_t num_blocks_x = slice.m_num_blocks_x;
+			const uint32_t num_blocks_y = slice.m_num_blocks_y;
+
+			// Scratch image the slice is packed into.
+			gpu_image packed;
+			packed.init(texture_format::cETC1, width, height);
+
+			for (uint32_t by = 0; by < num_blocks_y; by++)
+			{
+				for (uint32_t bx = 0; bx < num_blocks_x; bx++)
+				{
+					const encoder_block& src = m_slice_encoder_blocks[slice_index](bx, by);
+					etc_block& dst = *(etc_block*)packed.get_block_ptr(bx, by);
+
+					// Every block is written with the diff and flip bits set.
+					dst.set_diff_bit(true);
+					dst.set_flip_bit(true);
+
+					// Base color and intensity table come from the endpoint palette.
+					const etc1_endpoint_palette_entry& endpoint = m_endpoint_palette[src.m_endpoint_index];
+					dst.set_block_color5_etc1s(endpoint.m_color5);
+					dst.set_inten_tables_etc1s(endpoint.m_inten5);
+
+					// The 4x4 selectors come from the selector palette.
+					const basist::etc1_selector_palette_entry& selectors = m_selector_palette[src.m_selector_index];
+					for (uint32_t sy = 0; sy < 4; sy++)
+						for (uint32_t sx = 0; sx < 4; sx++)
+							dst.set_selector(sx, sy, selectors(sx, sy));
+				} // bx
+			} // by
+
+			// CRC-16 over the packed image bytes.
+			m_output.m_slice_image_crcs[slice_index] = basist::crc16(packed.get_ptr(), packed.get_size_in_bytes(), 0);
+
+			if (!m_params.m_debug_images)
+				continue;
+
+			// Debug aid: write the unpacked slice next to the output.
+			image unpacked;
+			packed.unpack(unpacked);
+
+			char filename[256];
+#ifdef _WIN32
+			sprintf_s(filename, sizeof(filename), "basisu_backend_slice_%u.png", slice_index);
+#else
+			snprintf(filename, sizeof(filename), "basisu_backend_slice_%u.png", slice_index);
+#endif
+			save_png(filename, unpacked);
+		} // slice_index
+	}
+
+	// TODO: Split this into multiple methods.
+	bool basisu_backend::encode_image()
+	{
+		basisu_frontend& r = *m_pFront_end;
+		const bool is_video = r.get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
+
+		uint32_t total_used_selector_history_buf = 0;
+		uint32_t total_selector_indices_remapped = 0;
+
+		basist::approx_move_to_front selector_history_buf(basist::MAX_SELECTOR_HISTORY_BUF_SIZE);
+		histogram selector_history_buf_histogram(basist::MAX_SELECTOR_HISTORY_BUF_SIZE);
+		histogram selector_histogram(r.get_total_selector_clusters() + basist::MAX_SELECTOR_HISTORY_BUF_SIZE + 1);
+		histogram selector_history_buf_rle_histogram(1 << basist::SELECTOR_HISTORY_BUF_RLE_COUNT_BITS);
+
+		basisu::vector<uint_vec> selector_syms(m_slices.size());
+
+		const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = r.get_total_selector_clusters();
+		const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + basist::MAX_SELECTOR_HISTORY_BUF_SIZE;
+
+		m_output.m_slice_image_crcs.resize(m_slices.size());
+
+		histogram delta_endpoint_histogram(r.get_total_endpoint_clusters());
+
+		histogram endpoint_pred_histogram(basist::ENDPOINT_PRED_TOTAL_SYMBOLS);
+		basisu::vector<uint_vec> endpoint_pred_syms(m_slices.size());
+
+		uint32_t total_endpoint_indices_remapped = 0;
+
+		uint_vec block_endpoint_indices, block_selector_indices;
+
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			//const int prev_frame_slice_index = is_video ? find_video_frame(slice_index, -1) : -1;
+			//const int next_frame_slice_index = is_video ? find_video_frame(slice_index, 1) : -1;
+			const uint32_t first_block_index = m_slices[slice_index].m_first_block_index;
+			//const uint32_t width = m_slices[slice_index].m_width;
+			//const uint32_t height = m_slices[slice_index].m_height;
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+
+			selector_history_buf.reset();
+
+			int selector_history_buf_rle_count = 0;
+
+			int prev_endpoint_pred_sym_bits = -1, endpoint_pred_repeat_count = 0;
+
+			uint32_t prev_endpoint_index = 0;
+
+			vector2D<uint8_t> block_endpoints_are_referenced(num_blocks_x, num_blocks_y);
+
+			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
+				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+				{
+					//const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
+
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+
+					if (m.m_endpoint_predictor == 0)
+						block_endpoints_are_referenced(block_x - 1, block_y) = true;
+					else if (m.m_endpoint_predictor == 1)
+						block_endpoints_are_referenced(block_x, block_y - 1) = true;
+					else if (m.m_endpoint_predictor == 2)
+					{
+						if (!is_video)
+							block_endpoints_are_referenced(block_x - 1, block_y - 1) = true;
+					}
+					if (is_video)
+					{
+						if (m.m_is_cr_target)
+							block_endpoints_are_referenced(block_x, block_y) = true;
+					}
+
+				}  // block_x
+			} // block_y
+
+			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
+				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+				{
+					const uint32_t block_index = first_block_index + block_x + block_y * num_blocks_x;
+
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+
+					if (((block_x & 1) == 0) && ((block_y & 1) == 0))
+					{
+						uint32_t endpoint_pred_cur_sym_bits = 0;
+
+						for (uint32_t y = 0; y < 2; y++)
+						{
+							for (uint32_t x = 0; x < 2; x++)
+							{
+								const uint32_t bx = block_x + x;
+								const uint32_t by = block_y + y;
+
+								uint32_t pred = basist::NO_ENDPOINT_PRED_INDEX;
+								if ((bx < num_blocks_x) && (by < num_blocks_y))
+									pred = m_slice_encoder_blocks[slice_index](bx, by).m_endpoint_predictor;
+
+								endpoint_pred_cur_sym_bits |= (pred << (x * 2 + y * 4));
+							}
+						}
+
+						if ((int)endpoint_pred_cur_sym_bits == prev_endpoint_pred_sym_bits)
+						{
+							endpoint_pred_repeat_count++;
+						}
+						else
+						{
+							if (endpoint_pred_repeat_count > 0)
+							{
+								if (endpoint_pred_repeat_count > (int)basist::ENDPOINT_PRED_MIN_REPEAT_COUNT)
+								{
+									endpoint_pred_histogram.inc(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL);
+									endpoint_pred_syms[slice_index].push_back(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL);
+
+									endpoint_pred_syms[slice_index].push_back(endpoint_pred_repeat_count);
+								}
+								else
+								{
+									for (int j = 0; j < endpoint_pred_repeat_count; j++)
+									{
+										endpoint_pred_histogram.inc(prev_endpoint_pred_sym_bits);
+										endpoint_pred_syms[slice_index].push_back(prev_endpoint_pred_sym_bits);
+									}
+								}
+
+								endpoint_pred_repeat_count = 0;
+							}
+
+							endpoint_pred_histogram.inc(endpoint_pred_cur_sym_bits);
+							endpoint_pred_syms[slice_index].push_back(endpoint_pred_cur_sym_bits);
+
+							prev_endpoint_pred_sym_bits = endpoint_pred_cur_sym_bits;
+						}
+					}
+
+					int new_endpoint_index = m_endpoint_remap_table_old_to_new[m.m_endpoint_index];
+
+					if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX)
+					{
+						int endpoint_delta = new_endpoint_index - prev_endpoint_index;
+
+						if ((m_params.m_endpoint_rdo_quality_thresh > 1.0f) && (iabs(endpoint_delta) > 1) && (!block_endpoints_are_referenced(block_x, block_y)))
+						{
+							const pixel_block& src_pixels = r.get_source_pixel_block(block_index);
+
+							etc_block etc_blk(r.get_output_block(block_index));
+
+							const uint64_t cur_err = etc_blk.evaluate_etc1_error(src_pixels.get_ptr(), r.get_params().m_perceptual);
+
+							if (cur_err)
+							{
+								const float endpoint_remap_thresh = maximum(1.0f, m_params.m_endpoint_rdo_quality_thresh);
+								const uint64_t thresh_err = (uint64_t)(cur_err * endpoint_remap_thresh);
+
+								uint64_t best_trial_err = UINT64_MAX;
+								int best_trial_idx = 0;
+
+								etc_block trial_etc_blk(etc_blk);
+
+								const int MAX_ENDPOINT_SEARCH_DIST = 32;
+								const int search_dist = minimum<int>(iabs(endpoint_delta) - 1, MAX_ENDPOINT_SEARCH_DIST);
+								for (int d = -search_dist; d < search_dist; d++)
+								{
+									int trial_idx = prev_endpoint_index + d;
+									if (trial_idx < 0)
+										trial_idx += (int)r.get_total_endpoint_clusters();
+									else if (trial_idx >= (int)r.get_total_endpoint_clusters())
+										trial_idx -= (int)r.get_total_endpoint_clusters();
+
+									if (trial_idx == new_endpoint_index)
+										continue;
+
+									// Skip it if this new endpoint palette entry is actually never used.
+									if (!m_new_endpoint_was_used[trial_idx])
+										continue;
+
+									const etc1_endpoint_palette_entry& p = m_endpoint_palette[m_endpoint_remap_table_new_to_old[trial_idx]];
+									trial_etc_blk.set_block_color5_etc1s(p.m_color5);
+									trial_etc_blk.set_inten_tables_etc1s(p.m_inten5);
+
+									uint64_t trial_err = trial_etc_blk.evaluate_etc1_error(src_pixels.get_ptr(), r.get_params().m_perceptual);
+
+									if (trial_err <= thresh_err)
+									{
+										if (trial_err < best_trial_err)
+										{
+											best_trial_err = trial_err;
+											best_trial_idx = trial_idx;
+										}
+									}
+								}
+
+								if (best_trial_err != UINT64_MAX)
+								{
+									m.m_endpoint_index = m_endpoint_remap_table_new_to_old[best_trial_idx];
+
+									new_endpoint_index = best_trial_idx;
+
+									endpoint_delta = new_endpoint_index - prev_endpoint_index;
+
+									total_endpoint_indices_remapped++;
+								}
+							}
+						}
+
+						if (endpoint_delta < 0)
+							endpoint_delta += (int)r.get_total_endpoint_clusters();
+
+						delta_endpoint_histogram.inc(endpoint_delta);
+					}
+
+					block_endpoint_indices.push_back(m_endpoint_remap_table_new_to_old[new_endpoint_index]);
+
+					prev_endpoint_index = new_endpoint_index;
+
+					if ((!is_video) || (m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX))
+					{
+						int new_selector_index = m_selector_remap_table_old_to_new[m.m_selector_index];
+
+						int selector_history_buf_index = -1;
+
+						if (m.m_is_cr_target)
+						{
+							for (uint32_t j = 0; j < selector_history_buf.size(); j++)
+							{
+								const int trial_idx = selector_history_buf[j];
+								if (trial_idx == new_selector_index)
+								{
+									total_used_selector_history_buf++;
+									selector_history_buf_index = j;
+									selector_history_buf_histogram.inc(j);
+									break;
+								}
+							}
+						}
+						else
+						{
+							const pixel_block& src_pixels = r.get_source_pixel_block(block_index);
+
+							const etc_block& etc_blk = r.get_output_block(block_index);
+
+							color_rgba etc_blk_unpacked[16];
+							unpack_etc1(etc_blk, etc_blk_unpacked);
+
+							uint64_t cur_err = 0;
+							if (r.get_params().m_perceptual)
+							{
+								for (uint32_t p = 0; p < 16; p++)
+									cur_err += color_distance(true, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
+							}
+							else
+							{
+								for (uint32_t p = 0; p < 16; p++)
+									cur_err += color_distance(false, src_pixels.get_ptr()[p], etc_blk_unpacked[p], false);
+							}
+														
+							uint64_t best_trial_err = UINT64_MAX;
+							int best_trial_idx = 0;
+							uint32_t best_trial_history_buf_idx = 0;
+
+							const float selector_remap_thresh = maximum(1.0f, m_params.m_selector_rdo_quality_thresh); //2.5f;
+							const bool use_strict_search = (m_params.m_compression_level == 0) && (selector_remap_thresh == 1.0f);
+
+							const uint64_t limit_err = (uint64_t)ceilf(cur_err * selector_remap_thresh);
+							
+							for (uint32_t j = 0; j < selector_history_buf.size(); j++)
+							{
+								const int trial_idx = selector_history_buf[j];
+
+								if (use_strict_search)
+								{
+									if (trial_idx == new_selector_index)
+									{
+										best_trial_err = 0;
+										best_trial_idx = trial_idx;
+										best_trial_history_buf_idx = j;
+										break;
+									}
+								}
+								else
+								{
+									uint64_t trial_err = 0;
+									const uint64_t thresh_err = minimum(limit_err, best_trial_err);
+
+									color_rgba block_colors[4];
+									etc_blk.get_block_colors(block_colors, 0);
+
+									const uint8_t* pSelectors = &m_selector_palette[m_selector_remap_table_new_to_old[trial_idx]](0, 0);
+									
+									if (r.get_params().m_perceptual)
+									{
+										for (uint32_t p = 0; p < 16; p++)
+										{
+											uint32_t sel = pSelectors[p];
+											trial_err += color_distance(true, src_pixels.get_ptr()[p], block_colors[sel], false);
+											if (trial_err > thresh_err)
+												break;
+										}
+									}
+									else
+									{
+										for (uint32_t p = 0; p < 16; p++)
+										{
+											uint32_t sel = pSelectors[p];
+											trial_err += color_distance(false, src_pixels.get_ptr()[p], block_colors[sel], false);
+											if (trial_err > thresh_err)
+												break;
+										}
+									}
+
+									if ((trial_err < best_trial_err) && (trial_err <= thresh_err))
+									{
+										assert(trial_err <= limit_err);
+										
+										best_trial_err = trial_err;
+										best_trial_idx = trial_idx;
+										best_trial_history_buf_idx = j;
+									}
+								}
+							}
+
+							if (best_trial_err != UINT64_MAX)
+							{
+								if (new_selector_index != best_trial_idx)
+									total_selector_indices_remapped++;
+
+								new_selector_index = best_trial_idx;
+
+								total_used_selector_history_buf++;
+
+								selector_history_buf_index = best_trial_history_buf_idx;
+
+								selector_history_buf_histogram.inc(best_trial_history_buf_idx);
+							}
+						} // if (m_params.m_selector_rdo_quality_thresh > 0.0f)
+
+						m.m_selector_index = m_selector_remap_table_new_to_old[new_selector_index];
+
+
+						if ((selector_history_buf_rle_count) && (selector_history_buf_index != 0))
+						{
+							if (selector_history_buf_rle_count >= (int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH)
+							{
+								selector_syms[slice_index].push_back(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX);
+								selector_syms[slice_index].push_back(selector_history_buf_rle_count);
+
+								int run_sym = selector_history_buf_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+								if (run_sym >= ((int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1))
+									selector_history_buf_rle_histogram.inc(basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1);
+								else
+									selector_history_buf_rle_histogram.inc(run_sym);
+
+								selector_histogram.inc(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX);
+							}
+							else
+							{
+								for (int k = 0; k < selector_history_buf_rle_count; k++)
+								{
+									uint32_t sym_index = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + 0;
+
+									selector_syms[slice_index].push_back(sym_index);
+
+									selector_histogram.inc(sym_index);
+								}
+							}
+
+							selector_history_buf_rle_count = 0;
+						}
+
+						if (selector_history_buf_index >= 0)
+						{
+							if (selector_history_buf_index == 0)
+								selector_history_buf_rle_count++;
+							else
+							{
+								uint32_t history_buf_sym = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + selector_history_buf_index;
+
+								selector_syms[slice_index].push_back(history_buf_sym);
+
+								selector_histogram.inc(history_buf_sym);
+							}
+						}
+						else
+						{
+							selector_syms[slice_index].push_back(new_selector_index);
+
+							selector_histogram.inc(new_selector_index);
+						}
+
+						m.m_selector_history_buf_index = selector_history_buf_index;
+
+						if (selector_history_buf_index < 0)
+							selector_history_buf.add(new_selector_index);
+						else if (selector_history_buf.size())
+							selector_history_buf.use(selector_history_buf_index);
+					}
+					block_selector_indices.push_back(m.m_selector_index);
+
+				} // block_x
+
+			} // block_y
+
+			if (endpoint_pred_repeat_count > 0)
+			{
+				if (endpoint_pred_repeat_count > (int)basist::ENDPOINT_PRED_MIN_REPEAT_COUNT)
+				{
+					endpoint_pred_histogram.inc(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL);
+					endpoint_pred_syms[slice_index].push_back(basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL);
+
+					endpoint_pred_syms[slice_index].push_back(endpoint_pred_repeat_count);
+				}
+				else
+				{
+					for (int j = 0; j < endpoint_pred_repeat_count; j++)
+					{
+						endpoint_pred_histogram.inc(prev_endpoint_pred_sym_bits);
+						endpoint_pred_syms[slice_index].push_back(prev_endpoint_pred_sym_bits);
+					}
+				}
+
+				endpoint_pred_repeat_count = 0;
+			}
+
+			if (selector_history_buf_rle_count)
+			{
+				if (selector_history_buf_rle_count >= (int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH)
+				{
+					selector_syms[slice_index].push_back(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX);
+					selector_syms[slice_index].push_back(selector_history_buf_rle_count);
+
+					int run_sym = selector_history_buf_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+					if (run_sym >= ((int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1))
+						selector_history_buf_rle_histogram.inc(basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1);
+					else
+						selector_history_buf_rle_histogram.inc(run_sym);
+
+					selector_histogram.inc(SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX);
+				}
+				else
+				{
+					for (int i = 0; i < selector_history_buf_rle_count; i++)
+					{
+						uint32_t sym_index = SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX + 0;
+
+						selector_syms[slice_index].push_back(sym_index);
+
+						selector_histogram.inc(sym_index);
+					}
+				}
+
+				selector_history_buf_rle_count = 0;
+			}
+
+		} // slice_index
+
+		debug_printf("Endpoint pred RDO total endpoint indices remapped: %u %3.2f%%\n",
+			total_endpoint_indices_remapped, total_endpoint_indices_remapped * 100.0f / get_total_blocks());
+
+		debug_printf("Selector history RDO total selector indices remapped: %u %3.2f%%, Used history buf: %u %3.2f%%\n",
+			total_selector_indices_remapped, total_selector_indices_remapped * 100.0f / get_total_blocks(),
+			total_used_selector_history_buf, total_used_selector_history_buf * 100.0f / get_total_blocks());
+
+		//if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 0))
+		if ((total_endpoint_indices_remapped) && (m_params.m_compression_level > 1) && (!m_params.m_used_global_codebooks))
+		{
+			int_vec unused;
+			r.reoptimize_remapped_endpoints(block_endpoint_indices, unused, false, &block_selector_indices);
+
+			create_endpoint_palette();
+		}
+
+		check_for_valid_cr_blocks();
+		compute_slice_crcs();
+
+		double endpoint_pred_entropy = endpoint_pred_histogram.get_entropy() / endpoint_pred_histogram.get_total();
+		double delta_endpoint_entropy = delta_endpoint_histogram.get_entropy() / delta_endpoint_histogram.get_total();
+		double selector_entropy = selector_histogram.get_entropy() / selector_histogram.get_total();
+
+		debug_printf("Histogram entropy: EndpointPred: %3.3f DeltaEndpoint: %3.3f DeltaSelector: %3.3f\n", endpoint_pred_entropy, delta_endpoint_entropy, selector_entropy);
+
+		if (!endpoint_pred_histogram.get_total())
+			endpoint_pred_histogram.inc(0);
+		huffman_encoding_table endpoint_pred_model;
+		if (!endpoint_pred_model.init(endpoint_pred_histogram, 16))
+		{
+			error_printf("endpoint_pred_model.init() failed!");
+			return false;
+		}
+
+		if (!delta_endpoint_histogram.get_total())
+			delta_endpoint_histogram.inc(0);
+		huffman_encoding_table delta_endpoint_model;
+		if (!delta_endpoint_model.init(delta_endpoint_histogram, 16))
+		{
+			error_printf("delta_endpoint_model.init() failed!");
+			return false;
+		}
+		if (!selector_histogram.get_total())
+			selector_histogram.inc(0);
+
+		huffman_encoding_table selector_model;
+		if (!selector_model.init(selector_histogram, 16))
+		{
+			error_printf("selector_model.init() failed!");
+			return false;
+		}
+
+		if (!selector_history_buf_rle_histogram.get_total())
+			selector_history_buf_rle_histogram.inc(0);
+
+		huffman_encoding_table selector_history_buf_rle_model;
+		if (!selector_history_buf_rle_model.init(selector_history_buf_rle_histogram, 16))
+		{
+			error_printf("selector_history_buf_rle_model.init() failed!");
+			return false;
+		}
+
+		bitwise_coder coder;
+		coder.init(1024 * 1024 * 4);
+
+		uint32_t endpoint_pred_model_bits = coder.emit_huffman_table(endpoint_pred_model);
+		uint32_t delta_endpoint_bits = coder.emit_huffman_table(delta_endpoint_model);
+		uint32_t selector_model_bits = coder.emit_huffman_table(selector_model);
+		uint32_t selector_history_buf_run_sym_bits = coder.emit_huffman_table(selector_history_buf_rle_model);
+
+		coder.put_bits(basist::MAX_SELECTOR_HISTORY_BUF_SIZE, 13);
+
+		debug_printf("Model sizes: EndpointPred: %u bits %u bytes (%3.3f bpp) DeltaEndpoint: %u bits %u bytes (%3.3f bpp) Selector: %u bits %u bytes (%3.3f bpp) SelectorHistBufRLE: %u bits %u bytes (%3.3f bpp)\n",
+			endpoint_pred_model_bits, (endpoint_pred_model_bits + 7) / 8, endpoint_pred_model_bits / float(get_total_input_texels()),
+			delta_endpoint_bits, (delta_endpoint_bits + 7) / 8, delta_endpoint_bits / float(get_total_input_texels()),
+			selector_model_bits, (selector_model_bits + 7) / 8, selector_model_bits / float(get_total_input_texels()),
+			selector_history_buf_run_sym_bits, (selector_history_buf_run_sym_bits + 7) / 8, selector_history_buf_run_sym_bits / float(get_total_input_texels()));
+
+		coder.flush();
+
+		m_output.m_slice_image_tables = coder.get_bytes();
+
+		uint32_t total_endpoint_pred_bits = 0, total_delta_endpoint_bits = 0, total_selector_bits = 0;
+
+		uint32_t total_image_bytes = 0;
+
+		m_output.m_slice_image_data.resize(m_slices.size());
+
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			//const uint32_t width = m_slices[slice_index].m_width;
+			//const uint32_t height = m_slices[slice_index].m_height;
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x;
+			const uint32_t num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+
+			coder.init(1024 * 1024 * 4);
+
+			uint32_t cur_selector_sym_ofs = 0;
+			uint32_t selector_rle_count = 0;
+
+			int endpoint_pred_repeat_count = 0;
+			uint32_t cur_endpoint_pred_sym_ofs = 0;
+//			uint32_t prev_endpoint_pred_sym = 0;
+			uint32_t prev_endpoint_index = 0;
+
+			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
+				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+				{
+					const encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+
+					if (((block_x & 1) == 0) && ((block_y & 1) == 0))
+					{
+						if (endpoint_pred_repeat_count > 0)
+						{
+							endpoint_pred_repeat_count--;
+						}
+						else
+						{
+							uint32_t sym = endpoint_pred_syms[slice_index][cur_endpoint_pred_sym_ofs++];
+
+							if (sym == basist::ENDPOINT_PRED_REPEAT_LAST_SYMBOL)
+							{
+								total_endpoint_pred_bits += coder.put_code(sym, endpoint_pred_model);
+
+								endpoint_pred_repeat_count = endpoint_pred_syms[slice_index][cur_endpoint_pred_sym_ofs++];
+								assert(endpoint_pred_repeat_count >= (int)basist::ENDPOINT_PRED_MIN_REPEAT_COUNT);
+
+								total_endpoint_pred_bits += coder.put_vlc(endpoint_pred_repeat_count - basist::ENDPOINT_PRED_MIN_REPEAT_COUNT, basist::ENDPOINT_PRED_COUNT_VLC_BITS);
+
+								endpoint_pred_repeat_count--;
+							}
+							else
+							{
+								total_endpoint_pred_bits += coder.put_code(sym, endpoint_pred_model);
+
+								//prev_endpoint_pred_sym = sym;
+							}
+						}
+					}
+
+					const int new_endpoint_index = m_endpoint_remap_table_old_to_new[m.m_endpoint_index];
+
+					if (m.m_endpoint_predictor == basist::NO_ENDPOINT_PRED_INDEX)
+					{
+						int endpoint_delta = new_endpoint_index - prev_endpoint_index;
+						if (endpoint_delta < 0)
+							endpoint_delta += (int)r.get_total_endpoint_clusters();
+
+						total_delta_endpoint_bits += coder.put_code(endpoint_delta, delta_endpoint_model);
+					}
+
+					prev_endpoint_index = new_endpoint_index;
+
+					if ((!is_video) || (m.m_endpoint_predictor != basist::CR_ENDPOINT_PRED_INDEX))
+					{
+						if (!selector_rle_count)
+						{
+							uint32_t selector_sym_index = selector_syms[slice_index][cur_selector_sym_ofs++];
+
+							if (selector_sym_index == SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX)
+								selector_rle_count = selector_syms[slice_index][cur_selector_sym_ofs++];
+
+							total_selector_bits += coder.put_code(selector_sym_index, selector_model);
+
+							if (selector_sym_index == SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX)
+							{
+								int run_sym = selector_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+								if (run_sym >= ((int)basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1))
+								{
+									total_selector_bits += coder.put_code(basist::SELECTOR_HISTORY_BUF_RLE_COUNT_TOTAL - 1, selector_history_buf_rle_model);
+
+									uint32_t n = selector_rle_count - basist::SELECTOR_HISTORY_BUF_RLE_COUNT_THRESH;
+									total_selector_bits += coder.put_vlc(n, 7);
+								}
+								else
+									total_selector_bits += coder.put_code(run_sym, selector_history_buf_rle_model);
+							}
+						}
+
+						if (selector_rle_count)
+							selector_rle_count--;
+					}
+
+				} // block_x
+
+			} // block_y
+
+			BASISU_BACKEND_VERIFY(cur_endpoint_pred_sym_ofs == endpoint_pred_syms[slice_index].size());
+			BASISU_BACKEND_VERIFY(cur_selector_sym_ofs == selector_syms[slice_index].size());
+
+			coder.flush();
+
+			m_output.m_slice_image_data[slice_index] = coder.get_bytes();
+
+			total_image_bytes += (uint32_t)coder.get_bytes().size();
+
+			debug_printf("Slice %u compressed size: %u bytes, %3.3f bits per slice texel\n", slice_index, m_output.m_slice_image_data[slice_index].size(), m_output.m_slice_image_data[slice_index].size() * 8.0f / (m_slices[slice_index].m_orig_width * m_slices[slice_index].m_orig_height));
+
+		} // slice_index
+
+		const double total_texels = static_cast<double>(get_total_input_texels());
+		const double total_blocks = static_cast<double>(get_total_blocks());
+
+		debug_printf("Total endpoint pred bits: %u bytes: %u bits/texel: %3.3f bits/block: %3.3f\n", total_endpoint_pred_bits, total_endpoint_pred_bits / 8, total_endpoint_pred_bits / total_texels, total_endpoint_pred_bits / total_blocks);
+		debug_printf("Total delta endpoint bits: %u bytes: %u bits/texel: %3.3f bits/block: %3.3f\n", total_delta_endpoint_bits, total_delta_endpoint_bits / 8, total_delta_endpoint_bits / total_texels, total_delta_endpoint_bits / total_blocks);
+		debug_printf("Total selector bits: %u bytes: %u bits/texel: %3.3f bits/block: %3.3f\n", total_selector_bits, total_selector_bits / 8, total_selector_bits / total_texels, total_selector_bits / total_blocks);
+
+		debug_printf("Total table bytes: %u, %3.3f bits/texel\n", m_output.m_slice_image_tables.size(), m_output.m_slice_image_tables.size() * 8.0f / total_texels);
+		debug_printf("Total image bytes: %u, %3.3f bits/texel\n", total_image_bytes, total_image_bytes * 8.0f / total_texels);
+
+		return true;
+	}
+
+	bool basisu_backend::encode_endpoint_palette()
+	{
+		const basisu_frontend& r = *m_pFront_end;
+		// Writes the Huffman-coded endpoint palette, in remapped ("new") order, to m_output.m_endpoint_palette; returns false if a Huffman model cannot be built.
+		// The endpoint indices may have been changed by the backend's RDO step, so go and figure out which ones are actually used again.
+		bool_vec old_endpoint_was_used(r.get_total_endpoint_clusters());
+		uint32_t first_old_entry_index = UINT32_MAX;
+
+		for (uint32_t slice_index = 0; slice_index < m_slices.size(); slice_index++)
+		{
+			const uint32_t num_blocks_x = m_slices[slice_index].m_num_blocks_x, num_blocks_y = m_slices[slice_index].m_num_blocks_y;
+			for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++)
+			{
+				for (uint32_t block_x = 0; block_x < num_blocks_x; block_x++)
+				{
+					encoder_block& m = m_slice_encoder_blocks[slice_index](block_x, block_y);
+					const uint32_t old_endpoint_index = m.m_endpoint_index;
+
+					old_endpoint_was_used[old_endpoint_index] = true;
+					first_old_entry_index = basisu::minimum(first_old_entry_index, old_endpoint_index); // lowest old index referenced by any block
+				} // block_x
+			} // block_y
+		} // slice_index
+
+		debug_printf("basisu_backend::encode_endpoint_palette: first_old_entry_index: %u\n", first_old_entry_index);
+
+		// Maps NEW to OLD endpoints. Slots for new indices that no block uses keep the fallback value first_old_entry_index.
+		uint_vec endpoint_remap_table_new_to_old(r.get_total_endpoint_clusters());
+		endpoint_remap_table_new_to_old.set_all(first_old_entry_index);
+
+		bool_vec new_endpoint_was_used(r.get_total_endpoint_clusters());
+
+		for (uint32_t old_endpoint_index = 0; old_endpoint_index < m_endpoint_remap_table_old_to_new.size(); old_endpoint_index++)
+		{
+			if (old_endpoint_was_used[old_endpoint_index])
+			{
+				const uint32_t new_endpoint_index = m_endpoint_remap_table_old_to_new[old_endpoint_index];
+				
+				new_endpoint_was_used[new_endpoint_index] = true;
+
+				endpoint_remap_table_new_to_old[new_endpoint_index] = old_endpoint_index;
+			}
+		}
+
+		// TODO: Some new endpoint palette entries may actually be unused and aren't worth coding. Fix that.
+
+		uint32_t total_unused_new_entries = 0;
+		for (uint32_t i = 0; i < new_endpoint_was_used.size(); i++)
+			if (!new_endpoint_was_used[i])
+				total_unused_new_entries++;
+		debug_printf("basisu_backend::encode_endpoint_palette: total_unused_new_entries: %u out of %u\n", total_unused_new_entries, new_endpoint_was_used.size());
+		// The palette counts as grayscale only if every entry of m_endpoint_palette has equal R, G and B components.
+		bool is_grayscale = true;
+		for (uint32_t old_endpoint_index = 0; old_endpoint_index < (uint32_t)m_endpoint_palette.size(); old_endpoint_index++)
+		{
+			int r5 = m_endpoint_palette[old_endpoint_index].m_color5[0];
+			int g5 = m_endpoint_palette[old_endpoint_index].m_color5[1];
+			int b5 = m_endpoint_palette[old_endpoint_index].m_color5[2];
+			if ((r5 != g5) || (r5 != b5))
+			{
+				is_grayscale = false;
+				break;
+			}
+		}
+		// Three color delta histograms, chosen by the previous component value (see the COLOR5_PAL*_PREV_HI tests below), plus one for intensity deltas.
+		histogram color5_delta_hist0(32); // prev 0-9, delta is -9 to 31
+		histogram color5_delta_hist1(32); // prev 10-21, delta is -21 to 21
+		histogram color5_delta_hist2(32); // prev 22-31, delta is -31 to 9
+		histogram inten_delta_hist(8);
+		// Predictor state: every value is coded as a wrapped delta against the previously visited palette entry.
+		color_rgba prev_color5(16, 16, 16, 0);
+		uint32_t prev_inten = 0;
+		// Pass 1: gather symbol statistics only; nothing is written to a coder yet.
+		for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++)
+		{
+			const uint32_t old_endpoint_index = endpoint_remap_table_new_to_old[new_endpoint_index];
+
+			int delta_inten = m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten;
+			inten_delta_hist.inc(delta_inten & 7); // delta is wrapped into the 8 histogram symbols
+			prev_inten = m_endpoint_palette[old_endpoint_index].m_inten5;
+
+			for (uint32_t i = 0; i < (is_grayscale ? 1U : 3U); i++) // grayscale palettes only visit component 0
+			{
+				const int delta = (m_endpoint_palette[old_endpoint_index].m_color5[i] - prev_color5[i]) & 31;
+
+				if (prev_color5[i] <= basist::COLOR5_PAL0_PREV_HI)
+					color5_delta_hist0.inc(delta);
+				else if (prev_color5[i] <= basist::COLOR5_PAL1_PREV_HI)
+					color5_delta_hist1.inc(delta);
+				else
+					color5_delta_hist2.inc(delta);
+
+				prev_color5[i] = m_endpoint_palette[old_endpoint_index].m_color5[i];
+			}
+		}
+		// An empty color histogram gets one count for symbol 0 before its model is built. NOTE(review): inten_delta_hist has no such fallback here.
+		if (!color5_delta_hist0.get_total()) color5_delta_hist0.inc(0);
+		if (!color5_delta_hist1.get_total()) color5_delta_hist1.inc(0);
+		if (!color5_delta_hist2.get_total()) color5_delta_hist2.inc(0);
+
+		huffman_encoding_table color5_delta_model0, color5_delta_model1, color5_delta_model2, inten_delta_model;
+		if (!color5_delta_model0.init(color5_delta_hist0, 16))
+		{
+			error_printf("color5_delta_model.init() failed!");
+			return false;
+		}
+
+		if (!color5_delta_model1.init(color5_delta_hist1, 16))
+		{
+			error_printf("color5_delta_model.init() failed!");
+			return false;
+		}
+
+		if (!color5_delta_model2.init(color5_delta_hist2, 16))
+		{
+			error_printf("color5_delta_model.init() failed!");
+			return false;
+		}
+
+		if (!inten_delta_model.init(inten_delta_hist, 16))
+		{
+			error_printf("inten3_model.init() failed!");
+			return false;
+		}
+
+		bitwise_coder coder;
+
+		coder.init(8192);
+		// Stream header: the four Huffman tables, followed by a 1-bit grayscale flag.
+		coder.emit_huffman_table(color5_delta_model0);
+		coder.emit_huffman_table(color5_delta_model1);
+		coder.emit_huffman_table(color5_delta_model2);
+		coder.emit_huffman_table(inten_delta_model);
+
+		coder.put_bits(is_grayscale, 1);
+		// Pass 2: restart the predictors from the same state as pass 1 and emit the actual codes.
+		prev_color5.set(16, 16, 16, 0);
+		prev_inten = 0;
+
+		for (uint32_t new_endpoint_index = 0; new_endpoint_index < r.get_total_endpoint_clusters(); new_endpoint_index++)
+		{
+			const uint32_t old_endpoint_index = endpoint_remap_table_new_to_old[new_endpoint_index];
+
+			int delta_inten = (m_endpoint_palette[old_endpoint_index].m_inten5 - prev_inten) & 7;
+			coder.put_code(delta_inten, inten_delta_model);
+			prev_inten = m_endpoint_palette[old_endpoint_index].m_inten5;
+
+			for (uint32_t i = 0; i < (is_grayscale ? 1U : 3U); i++)
+			{
+				const int delta = (m_endpoint_palette[old_endpoint_index].m_color5[i] - prev_color5[i]) & 31;
+
+				if (prev_color5[i] <= basist::COLOR5_PAL0_PREV_HI)
+					coder.put_code(delta, color5_delta_model0);
+				else if (prev_color5[i] <= basist::COLOR5_PAL1_PREV_HI)
+					coder.put_code(delta, color5_delta_model1);
+				else
+					coder.put_code(delta, color5_delta_model2);
+
+				prev_color5[i] = m_endpoint_palette[old_endpoint_index].m_color5[i];
+			}
+
+		} // new_endpoint_index
+
+		coder.flush();
+
+		m_output.m_endpoint_palette = coder.get_bytes();
+
+		debug_printf("Endpoint codebook size: %u bits %u bytes, Bits per entry: %3.1f, Avg bits/texel: %3.3f\n",
+			8 * (int)m_output.m_endpoint_palette.size(), (int)m_output.m_endpoint_palette.size(), m_output.m_endpoint_palette.size() * 8.0f / r.get_total_endpoint_clusters(), m_output.m_endpoint_palette.size() * 8.0f / get_total_input_texels());
+
+		return true;
+	}
+
+	bool basisu_backend::encode_selector_palette()
+	{
+		const basisu_frontend& r = *m_pFront_end;
+		// Writes the selector palette to m_output.m_selector_palette in one of three layouts (global codebook, hybrid, or locally coded); returns false if a Huffman model cannot be built.
+		if ((m_params.m_use_global_sel_codebook) && (!m_params.m_use_hybrid_sel_codebooks))
+		{
+			histogram global_mod_indices(1 << m_params.m_global_sel_codebook_mod_bits);
+			// Histogram the modifier index of every selector cluster.
+			for (uint32_t q = 0; q < r.get_total_selector_clusters(); q++)
+				global_mod_indices.inc(m_global_selector_palette_desc[q].m_mod_index);
+
+			huffman_encoding_table global_pal_model, global_mod_model; // NOTE(review): global_pal_model is declared but never used in this branch
+
+			if (!global_mod_model.init(global_mod_indices, 16))
+			{
+				error_printf("global_mod_model.init() failed!");
+				return false;
+			}
+
+			bitwise_coder coder;
+			coder.init(1024 * 1024);
+
+			coder.put_bits(1, 1); // use global codebook
+
+			coder.put_bits(m_params.m_global_sel_codebook_pal_bits, 4); // pal bits
+			coder.put_bits(m_params.m_global_sel_codebook_mod_bits, 4); // mod bits
+			// The modifier Huffman table and the modifier codes are only written when mod bits is nonzero.
+			uint32_t mod_model_bits = 0;
+			if (m_params.m_global_sel_codebook_mod_bits)
+				mod_model_bits = coder.emit_huffman_table(global_mod_model);
+			// Entries are written in remapped order: q is the new index, i the corresponding old index.
+			uint32_t total_pal_bits = 0;
+			uint32_t total_mod_bits = 0;
+			for (uint32_t q = 0; q < r.get_total_selector_clusters(); q++)
+			{
+				const uint32_t i = m_selector_remap_table_new_to_old[q];
+
+				if (m_params.m_global_sel_codebook_pal_bits)
+				{
+					coder.put_bits(m_global_selector_palette_desc[i].m_pal_index, m_params.m_global_sel_codebook_pal_bits);
+					total_pal_bits += m_params.m_global_sel_codebook_pal_bits;
+				}
+
+				if (m_params.m_global_sel_codebook_mod_bits)
+					total_mod_bits += coder.put_code(m_global_selector_palette_desc[i].m_mod_index, global_mod_model);
+			}
+
+			coder.flush();
+
+			m_output.m_selector_palette = coder.get_bytes();
+
+			debug_printf("Modifier model bits: %u Avg per entry: %3.3f\n", mod_model_bits, mod_model_bits / float(r.get_total_selector_clusters()));
+			debug_printf("Palette bits: %u Avg per entry: %3.3f, Modifier bits: %u Avg per entry: %3.3f\n", total_pal_bits, total_pal_bits / float(r.get_total_selector_clusters()), total_mod_bits, total_mod_bits / float(r.get_total_selector_clusters()));
+		}
+		else if (m_params.m_use_hybrid_sel_codebooks)
+		{
+			huff2D used_global_cb_bitflag_huff2D(1, 8);
+
+			histogram global_mod_indices(1 << m_params.m_global_sel_codebook_mod_bits);
+			// Pass 1: collect the per-cluster "uses global codebook" flags and the modifier index histogram, in new order.
+			for (uint32_t s = 0; s < r.get_total_selector_clusters(); s++)
+			{
+				const uint32_t q = m_selector_remap_table_new_to_old[s];
+
+				const bool used_global_cb_flag = r.get_selector_cluster_uses_global_cb_vec()[q];
+
+				used_global_cb_bitflag_huff2D.emit(used_global_cb_flag);
+
+				global_mod_indices.inc(m_global_selector_palette_desc[q].m_mod_index); // counted for every cluster, whether or not its flag is set
+			}
+
+			huffman_encoding_table global_mod_indices_model;
+			if (!global_mod_indices_model.init(global_mod_indices, 16))
+			{
+				error_printf("global_mod_indices_model.init() failed!");
+				return false;
+			}
+
+			bitwise_coder coder;
+			coder.init(1024 * 1024);
+
+			coder.put_bits(0, 1); // use global codebook
+			coder.put_bits(1, 1); // uses hybrid codebooks
+
+			coder.put_bits(m_params.m_global_sel_codebook_pal_bits, 4); // pal bits
+			coder.put_bits(m_params.m_global_sel_codebook_mod_bits, 4); // mod bits
+
+			used_global_cb_bitflag_huff2D.start_encoding(16);
+			coder.emit_huffman_table(used_global_cb_bitflag_huff2D.get_encoding_table());
+
+			if (m_params.m_global_sel_codebook_mod_bits)
+				coder.emit_huffman_table(global_mod_indices_model);
+			// Counters that only feed the debug output at the end of this branch.
+			uint32_t total_global_cb_entries = 0;
+			uint32_t total_pal_bits = 0;
+			uint32_t total_mod_bits = 0;
+			uint32_t total_selectors = 0;
+			uint32_t total_selector_bits = 0;
+			uint32_t total_flag_bits = 0;
+			// Pass 2: per cluster emit the flag symbol, then either the global codebook reference or the 4 raw selector bytes.
+			for (uint32_t s = 0; s < r.get_total_selector_clusters(); s++)
+			{
+				const uint32_t q = m_selector_remap_table_new_to_old[s];
+
+				total_flag_bits += used_global_cb_bitflag_huff2D.emit_next_sym(coder);
+
+				const bool used_global_cb_flag = r.get_selector_cluster_uses_global_cb_vec()[q];
+
+				if (used_global_cb_flag)
+				{
+					total_global_cb_entries++;
+
+					total_pal_bits += coder.put_bits(r.get_selector_cluster_global_selector_entry_ids()[q].m_palette_index, m_params.m_global_sel_codebook_pal_bits);
+					total_mod_bits += coder.put_code(r.get_selector_cluster_global_selector_entry_ids()[q].m_modifier.get_index(), global_mod_indices_model);
+				}
+				else
+				{
+					total_selectors++;
+					total_selector_bits += 32;
+
+					for (uint32_t j = 0; j < 4; j++)
+						coder.put_bits(m_selector_palette[q].get_byte(j), 8);
+				}
+			}
+
+			coder.flush();
+
+			m_output.m_selector_palette = coder.get_bytes();
+
+			debug_printf("Total global CB entries: %u %3.2f%%\n", total_global_cb_entries, total_global_cb_entries * 100.0f / r.get_total_selector_clusters());
+			debug_printf("Total selector entries: %u %3.2f%%\n", total_selectors, total_selectors * 100.0f / r.get_total_selector_clusters());
+			debug_printf("Total pal bits: %u, mod bits: %u, selector bits: %u, flag bits: %u\n", total_pal_bits, total_mod_bits, total_selector_bits, total_flag_bits);
+		}
+		else
+		{
+			histogram delta_selector_pal_histogram(256);
+			// Pass 1: histogram the byte-wise XOR of each entry with the previous entry (in new order); entry 0 has no predictor and is skipped.
+			for (uint32_t q = 0; q < r.get_total_selector_clusters(); q++)
+			{
+				if (!q)
+					continue;
+
+				const basist::etc1_selector_palette_entry& cur = m_selector_palette[m_selector_remap_table_new_to_old[q]];
+				const basist::etc1_selector_palette_entry predictor(m_selector_palette[m_selector_remap_table_new_to_old[q - 1]]);
+
+				for (uint32_t j = 0; j < 4; j++)
+					delta_selector_pal_histogram.inc(cur.get_byte(j) ^ predictor.get_byte(j));
+			}
+
+			if (!delta_selector_pal_histogram.get_total())
+				delta_selector_pal_histogram.inc(0);
+
+			huffman_encoding_table delta_selector_pal_model;
+			if (!delta_selector_pal_model.init(delta_selector_pal_histogram, 16))
+			{
+				error_printf("delta_selector_pal_model.init() failed!");
+				return false;
+			}
+
+			bitwise_coder coder;
+			coder.init(1024 * 1024);
+
+			coder.put_bits(0, 1); // use global codebook
+			coder.put_bits(0, 1); // uses hybrid codebooks
+
+			coder.put_bits(0, 1); // raw bytes
+
+			coder.emit_huffman_table(delta_selector_pal_model);
+			// Entry 0 is written as 4 raw bytes; every later entry as 4 Huffman-coded XOR deltas.
+			for (uint32_t q = 0; q < r.get_total_selector_clusters(); q++)
+			{
+				if (!q)
+				{
+					for (uint32_t j = 0; j < 4; j++)
+						coder.put_bits(m_selector_palette[m_selector_remap_table_new_to_old[q]].get_byte(j), 8);
+					continue;
+				}
+
+				const basist::etc1_selector_palette_entry& cur = m_selector_palette[m_selector_remap_table_new_to_old[q]];
+				const basist::etc1_selector_palette_entry predictor(m_selector_palette[m_selector_remap_table_new_to_old[q - 1]]);
+
+				for (uint32_t j = 0; j < 4; j++)
+					coder.put_code(cur.get_byte(j) ^ predictor.get_byte(j), delta_selector_pal_model);
+			}
+
+			coder.flush();
+
+			m_output.m_selector_palette = coder.get_bytes();
+			// If the coded form is not smaller than 4 bytes per entry, discard it and store every entry as raw bytes instead.
+			if (m_output.m_selector_palette.size() >= r.get_total_selector_clusters() * 4)
+			{
+				coder.init(1024 * 1024);
+
+				coder.put_bits(0, 1); // use global codebook
+				coder.put_bits(0, 1); // uses hybrid codebooks
+
+				coder.put_bits(1, 1); // raw bytes
+
+				for (uint32_t q = 0; q < r.get_total_selector_clusters(); q++)
+				{
+					const uint32_t i = m_selector_remap_table_new_to_old[q];
+
+					for (uint32_t j = 0; j < 4; j++)
+						coder.put_bits(m_selector_palette[i].get_byte(j), 8);
+				}
+
+				coder.flush();
+
+				m_output.m_selector_palette = coder.get_bytes();
+			}
+
+		}  // if (m_params.m_use_global_sel_codebook)        
+
+		debug_printf("Selector codebook bits: %u bytes: %u, Bits per entry: %3.1f, Avg bits/texel: %3.3f\n",
+			(int)m_output.m_selector_palette.size() * 8, (int)m_output.m_selector_palette.size(),
+			m_output.m_selector_palette.size() * 8.0f / r.get_total_selector_clusters(), m_output.m_selector_palette.size() * 8.0f / get_total_input_texels());
+
+		return true;
+	}
+
+	uint32_t basisu_backend::encode() // Runs every backend stage; returns the total compressed size in bytes, or 0 if a stage fails.
+	{
+		//const bool is_video = m_pFront_end->get_params().m_tex_type == basist::cBASISTexTypeVideoFrames;
+		m_output.m_slice_desc = m_slices;
+		m_output.m_etc1s = m_params.m_etc1s;
+		m_output.m_uses_global_codebooks = m_params.m_used_global_codebooks;
+		m_output.m_srgb = m_pFront_end->get_params().m_perceptual;
+		// Build the palettes and the per-block records before the encode_*() stages run.
+		create_endpoint_palette();
+		create_selector_palette();
+
+		create_encoder_blocks();
+		// Each stage returns false on failure, which encode() reports to the caller as 0 bytes.
+		if (!encode_image())
+			return 0;
+
+		if (!encode_endpoint_palette())
+			return 0;
+
+		if (!encode_selector_palette())
+			return 0;
+		// Total = slice image tables + endpoint palette + selector palette + every slice's image data.
+		uint32_t total_compressed_bytes = (uint32_t)(m_output.m_slice_image_tables.size() + m_output.m_endpoint_palette.size() + m_output.m_selector_palette.size());
+		for (uint32_t i = 0; i < m_output.m_slice_image_data.size(); i++)
+			total_compressed_bytes += (uint32_t)m_output.m_slice_image_data[i].size();
+
+		debug_printf("Wrote %u bytes, %3.3f bits/texel\n", total_compressed_bytes, total_compressed_bytes * 8.0f / get_total_input_texels());
+
+		return total_compressed_bytes;
+	}
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_backend.h b/thirdparty/basis_universal/encoder/basisu_backend.h
new file mode 100644
index 0000000000..393dccd22f
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_backend.h
@@ -0,0 +1,342 @@
+// basisu_backend.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "../transcoder/basisu.h"
+#include "basisu_enc.h"
+#include "../transcoder/basisu_transcoder_internal.h"
+#include "../transcoder/basisu_global_selector_palette.h"
+#include "basisu_frontend.h"
+
+namespace basisu
+{
+	// Per-block encoder state. basisu_backend keeps these in 2D grids (encoder_block_vec2D).
+	// A newly constructed or cleared block has every field at 0 / false.
+	struct encoder_block
+	{
+		uint32_t m_endpoint_predictor;
+
+		// NOTE(review): presumably indices into the endpoint/selector codebooks - confirm against the encoder.
+		int m_endpoint_index;
+		int m_selector_index;
+
+		int m_selector_history_buf_index;
+
+		bool m_is_cr_target;
+
+		encoder_block() { clear(); }
+
+		// Puts every field back to its zero/false default.
+		void clear()
+		{
+			m_is_cr_target = false;
+			m_selector_history_buf_index = 0;
+			m_selector_index = 0;
+			m_endpoint_index = 0;
+			m_endpoint_predictor = 0;
+		}
+	};
+
+	typedef basisu::vector<encoder_block> encoder_block_vec;
+	typedef vector2D<encoder_block> encoder_block_vec2D;
+
+	// Element type of etc1_endpoint_palette_entry_vec (the backend's endpoint palette).
+	// Construction and clear() both reset the object through clear_obj(*this).
+	struct etc1_endpoint_palette_entry
+	{
+		color_rgba m_color5;
+		uint32_t m_inten5;
+		bool m_color5_valid;
+
+		etc1_endpoint_palette_entry() { clear(); }
+
+		// Delegates to clear_obj on the whole object instead of assigning the fields one by one.
+		void clear()
+		{
+			clear_obj(*this);
+		}
+	};
+
+	typedef basisu::vector<etc1_endpoint_palette_entry> etc1_endpoint_palette_entry_vec;
+
+	// Parameter block handed to basisu_backend::init(). Every field receives a default from
+	// clear(), which the constructor calls.
+	struct basisu_backend_params
+	{
+		bool m_etc1s;
+		bool m_debug, m_debug_images;
+		float m_endpoint_rdo_quality_thresh;
+		float m_selector_rdo_quality_thresh;
+		uint32_t m_compression_level;
+
+		// Global selector codebook options.
+		bool m_use_global_sel_codebook;
+		uint32_t m_global_sel_codebook_pal_bits;
+		uint32_t m_global_sel_codebook_mod_bits;
+		bool m_use_hybrid_sel_codebooks;
+
+		// Copied into basisu_backend_output::m_uses_global_codebooks by basisu_backend::encode().
+		bool m_used_global_codebooks;
+
+		basisu_backend_params() { clear(); }
+
+		// Restores the defaults: all flags off, thresholds and level zero, codebook bit counts from the library constants.
+		void clear()
+		{
+			m_etc1s = m_debug = m_debug_images = false;
+			m_endpoint_rdo_quality_thresh = m_selector_rdo_quality_thresh = 0.0f;
+			m_compression_level = 0;
+
+			m_use_global_sel_codebook = false;
+			m_use_hybrid_sel_codebooks = false;
+			m_used_global_codebooks = false;
+
+			m_global_sel_codebook_pal_bits = ETC1_GLOBAL_SELECTOR_CODEBOOK_MAX_PAL_BITS;
+			m_global_sel_codebook_mod_bits = basist::etc1_global_palette_entry_modifier::cTotalBits;
+		}
+	};
+
+	// Describes one slice given to the backend: where its blocks start in the global block stream,
+	// its dimensions in texels, blocks and macroblocks, and which source image / mip level it is.
+	struct basisu_backend_slice_desc
+	{
+		uint32_t m_first_block_index;
+
+		uint32_t m_orig_width;
+		uint32_t m_orig_height;
+
+		uint32_t m_width;
+		uint32_t m_height;
+
+		uint32_t m_num_blocks_x;
+		uint32_t m_num_blocks_y;
+
+		uint32_t m_num_macroblocks_x;
+		uint32_t m_num_macroblocks_y;
+
+		uint32_t m_source_file_index;		// also the basis image index
+		uint32_t m_mip_index;
+		bool m_alpha;
+		bool m_iframe;
+
+		basisu_backend_slice_desc() { clear(); }
+
+		// Resets the whole descriptor with a single clear_obj call.
+		void clear()
+		{
+			clear_obj(*this);
+		}
+	};
+
+	typedef basisu::vector<basisu_backend_slice_desc> basisu_backend_slice_desc_vec;
+
+	// Everything the backend hands to the file writer: format flags, codebook entry counts and
+	// bytes, the slice descriptors, and the per-slice compressed data together with their CRCs.
+	struct basisu_backend_output
+	{
+		basist::basis_tex_format m_tex_format;
+
+		bool m_etc1s;
+		bool m_uses_global_codebooks;
+		bool m_srgb;
+
+		uint32_t m_num_endpoints;
+		uint32_t m_num_selectors;
+
+		uint8_vec m_endpoint_palette;
+		uint8_vec m_selector_palette;
+
+		basisu_backend_slice_desc_vec m_slice_desc;
+
+		uint8_vec m_slice_image_tables;
+		basisu::vector<uint8_vec> m_slice_image_data;
+		uint16_vec m_slice_image_crcs;
+
+		basisu_backend_output() { clear(); }
+
+		// Back to the defaults (cETC1S format tag, sRGB on, other flags off, counts 0) with every buffer emptied.
+		void clear()
+		{
+			m_tex_format = basist::basis_tex_format::cETC1S;
+			m_etc1s = m_uses_global_codebooks = false;
+			m_srgb = true;
+
+			m_num_endpoints = m_num_selectors = 0;
+
+			m_endpoint_palette.clear();
+			m_selector_palette.clear();
+			m_slice_desc.clear();
+			m_slice_image_tables.clear();
+			m_slice_image_data.clear();
+			m_slice_image_crcs.clear();
+		}
+
+		// Sums the byte sizes of the tables, both palettes and every slice's data (as a uint32_t).
+		uint32_t get_output_size_estimate() const
+		{
+			uint32_t num_bytes = (uint32_t)(m_slice_image_tables.size() + m_endpoint_palette.size() + m_selector_palette.size());
+
+			for (uint32_t slice = 0; slice < m_slice_image_data.size(); ++slice)
+				num_bytes += (uint32_t)m_slice_image_data[slice].size();
+
+			return num_bytes;
+		}
+	};
+
+	class basisu_backend
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(basisu_backend);
+
+	public:
+		basisu_backend();
+
+		void clear();
+
+		void init(basisu_frontend *pFront_end, basisu_backend_params &params, const basisu_backend_slice_desc_vec &slice_desc, const basist::etc1_global_selector_codebook *pGlobal_sel_codebook);
+
+		// Runs the backend; returns the number of compressed bytes produced, or 0 on failure.
+		uint32_t encode();
+
+		const basisu_backend_output& get_output() const { return m_output; }
+		const basisu_backend_params& get_params() const { return m_params; }
+
+	private:
+		basisu_frontend *m_pFront_end;
+		basisu_backend_params m_params;
+		basisu_backend_slice_desc_vec m_slices;
+		basisu_backend_output m_output;
+		const basist::etc1_global_selector_codebook *m_pGlobal_sel_codebook;
+
+		// Backend-side palettes; m_output holds separate byte-vector members of the same names.
+		etc1_endpoint_palette_entry_vec m_endpoint_palette;
+		basist::etc1_selector_palette_entry_vec m_selector_palette;
+
+		struct etc1_global_selector_cb_entry_desc
+		{
+			uint32_t m_pal_index;
+			uint32_t m_mod_index;
+			bool m_was_used;
+		};
+
+		typedef basisu::vector<etc1_global_selector_cb_entry_desc> etc1_global_selector_cb_entry_desc_vec;
+
+		etc1_global_selector_cb_entry_desc_vec m_global_selector_palette_desc;
+
+		basisu::vector<encoder_block_vec2D> m_slice_encoder_blocks;
+
+		// Endpoint index remapping in both directions (old->new, new->old), plus usage flags per old/new index
+		uint_vec m_endpoint_remap_table_old_to_new;
+		uint_vec m_endpoint_remap_table_new_to_old;
+		bool_vec m_old_endpoint_was_used;
+		bool_vec m_new_endpoint_was_used;
+
+		// Selector index remapping in both directions (old->new, new->old)
+		uint_vec m_selector_remap_table_old_to_new;
+		uint_vec m_selector_remap_table_new_to_old;
+
+		// Number of slice descriptors held in m_slices.
+		uint32_t get_total_slices() const { return (uint32_t)m_slices.size(); }
+
+		// Total output block count, as reported by the front end.
+		uint32_t get_total_slice_blocks() const { return m_pFront_end->get_total_output_blocks(); }
+
+		// Turns a slice-local (block_x, block_y) into an index in the global block stream (row-major within the slice).
+		uint32_t get_block_index(uint32_t slice_index, uint32_t block_x, uint32_t block_y) const
+		{
+			const basisu_backend_slice_desc &desc = m_slices[slice_index];
+			assert((block_x < desc.m_num_blocks_x) && (block_y < desc.m_num_blocks_y));
+			const uint32_t row_start = desc.m_first_block_index + block_y * desc.m_num_blocks_x;
+			return row_start + block_x;
+		}
+
+		// Block count of a single slice.
+		uint32_t get_total_blocks(uint32_t slice_index) const
+		{
+			const basisu_backend_slice_desc &desc = m_slices[slice_index];
+			return desc.m_num_blocks_x * desc.m_num_blocks_y;
+		}
+
+		// Block count summed over all slices.
+		uint32_t get_total_blocks() const
+		{
+			uint32_t sum = 0;
+			for (uint32_t s = 0; s < m_slices.size(); ++s)
+				sum += get_total_blocks(s);
+			return sum;
+		}
+
+		// Texel count of one slice's original image, not counting padding up to blocks/macroblocks.
+		uint32_t get_total_input_texels(uint32_t slice_index) const
+		{
+			const basisu_backend_slice_desc &desc = m_slices[slice_index];
+			return desc.m_orig_width * desc.m_orig_height;
+		}
+
+		// Original (unpadded) texel count summed over all slices.
+		uint32_t get_total_input_texels() const
+		{
+			uint32_t sum = 0;
+			for (uint32_t s = 0; s < m_slices.size(); ++s)
+				sum += get_total_input_texels(s);
+			return sum;
+		}
+
+		// Finds the slice whose block range contains block_index. Returns its index and, through the
+		// optional out-pointers, the block's x/y inside that slice; returns -1 when no slice contains it.
+		int find_slice(uint32_t block_index, uint32_t *pBlock_x, uint32_t *pBlock_y) const
+		{
+			for (uint32_t s = 0; s < m_slices.size(); ++s)
+			{
+				const basisu_backend_slice_desc &desc = m_slices[s];
+				const uint32_t first = desc.m_first_block_index;
+				const uint32_t limit = first + desc.m_num_blocks_x * desc.m_num_blocks_y;
+				if ((block_index < first) || (block_index >= limit))
+					continue;
+				const uint32_t ofs = block_index - first;
+				if (pBlock_x) *pBlock_x = ofs % desc.m_num_blocks_x;
+				if (pBlock_y) *pBlock_y = ofs / desc.m_num_blocks_x;
+				return s;
+			}
+			return -1;
+		}
+
+		void create_endpoint_palette();
+		void create_selector_palette();
+
+		// endpoint palette
+		//   5:5:5 and predicted 4:4:4 colors, 1 or 2 3-bit intensity table indices
+		// selector palette
+		//   4x4 2-bit selectors
+
+		// per-macroblock:
+		//  4 diff bits
+		//  4 flip bits
+		//  Endpoint template index, 1-8 endpoint indices
+		//      Alternately, if no template applies, we can send 4 ETC1S bits followed by 4-8 endpoint indices
+		//  4 selector indices
+
+		void reoptimize_and_sort_endpoints_codebook(uint32_t total_block_endpoints_remapped, uint_vec &all_endpoint_indices);
+		void sort_selector_codebook();
+		void create_encoder_blocks();
+		void compute_slice_crcs();
+		bool encode_image();
+		bool encode_endpoint_palette();
+		bool encode_selector_palette();
+		int find_video_frame(int slice_index, int delta);
+		void check_for_valid_cr_blocks();
+	};
+
+} // namespace basisu
+
diff --git a/thirdparty/basis_universal/encoder/basisu_basis_file.cpp b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
new file mode 100644
index 0000000000..f4c77bef23
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_basis_file.cpp
@@ -0,0 +1,269 @@
+// basisu_basis_file.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_basis_file.h"
+#include "../transcoder/basisu_transcoder.h"
+
+// The output file version. Keep in sync with BASISD_SUPPORTED_BASIS_VERSION.
+#define BASIS_FILE_VERSION (0x13)
+
+namespace basisu
+{
+	// Fills in m_header from the encoder output and from the section offsets stored in this object.
+	// Signature, version and CRC fields are not written here; fixup_crcs() sets those.
+	void basisu_file::create_header(const basisu_backend_output &encoder_output, basist::basis_texture_type tex_type, uint32_t userdata0, uint32_t userdata1, bool y_flipped, uint32_t us_per_frame)
+	{
+		const basisu_backend_slice_desc_vec &slices = encoder_output.m_slice_desc;
+		const bool global_codebooks = encoder_output.m_uses_global_codebooks;
+
+		m_header.m_header_size = sizeof(basist::basis_file_header);
+		m_header.m_data_size = m_total_file_size - sizeof(basist::basis_file_header);
+		m_header.m_total_slices = (uint32_t)slices.size();
+
+		// The image count is one more than the largest source file index referenced by any slice.
+		m_header.m_total_images = 0;
+		for (uint32_t s = 0; s < slices.size(); s++)
+			m_header.m_total_images = maximum<uint32_t>(m_header.m_total_images, slices[s].m_source_file_index + 1);
+
+		m_header.m_tex_format = (int)encoder_output.m_tex_format;
+
+		// Collect the flag bits locally and store them into the header once, further down.
+		uint32_t flags = 0;
+		// The ETC1S flag and the texture format tag are expected to agree (asserted both ways).
+		if (encoder_output.m_etc1s)
+		{
+			assert(encoder_output.m_tex_format == basist::basis_tex_format::cETC1S);
+			flags |= basist::cBASISHeaderFlagETC1S;
+		}
+		else
+		{
+			assert(encoder_output.m_tex_format != basist::basis_tex_format::cETC1S);
+		}
+
+		if (y_flipped)
+			flags |= basist::cBASISHeaderFlagYFlipped;
+		if (global_codebooks)
+			flags |= basist::cBASISHeaderFlagUsesGlobalCodebook;
+		if (encoder_output.m_srgb)
+			flags |= basist::cBASISHeaderFlagSRGB;
+
+		// A single alpha slice is enough to flag the whole file; stop scanning at the first one.
+		bool any_alpha = false;
+		for (uint32_t s = 0; (s < slices.size()) && !any_alpha; s++)
+			any_alpha = slices[s].m_alpha;
+		if (any_alpha)
+			flags |= basist::cBASISHeaderFlagHasAlphaSlices;
+
+		m_header.m_flags = flags;
+
+		m_header.m_tex_type = static_cast<uint8_t>(tex_type);
+		m_header.m_us_per_frame = clamp<uint32_t>(us_per_frame, 0, basist::cBASISMaxUSPerFrame);
+
+		m_header.m_userdata0 = userdata0;
+		m_header.m_userdata1 = userdata1;
+
+		m_header.m_total_endpoints = encoder_output.m_num_endpoints;
+		m_header.m_total_selectors = encoder_output.m_num_selectors;
+
+		// Codebook offsets and sizes are recorded only when the codebooks are stored in this file.
+		if (global_codebooks)
+		{
+			assert(!m_endpoint_cb_file_ofs);
+			assert(!m_selector_cb_file_ofs);
+		}
+		else
+		{
+			m_header.m_endpoint_cb_file_ofs = m_endpoint_cb_file_ofs;
+			m_header.m_endpoint_cb_file_size = (uint32_t)encoder_output.m_endpoint_palette.size();
+			m_header.m_selector_cb_file_ofs = m_selector_cb_file_ofs;
+			m_header.m_selector_cb_file_size = (uint32_t)encoder_output.m_selector_palette.size();
+		}
+
+		m_header.m_tables_file_ofs = m_tables_file_ofs;
+		m_header.m_tables_file_size = (uint32_t)encoder_output.m_slice_image_tables.size();
+		m_header.m_slice_desc_file_ofs = m_slice_descs_file_ofs;
+	}
+
+	// Builds one basis_slice_desc per backend slice, giving the slices consecutive file ranges that
+	// start at m_first_image_file_ofs. Returns false if a slice size or running offset exceeds UINT32_MAX.
+	bool basisu_file::create_image_descs(const basisu_backend_output &encoder_output)
+	{
+		const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc;
+		m_images_descs.resize(slice_descs.size());
+
+		uint64_t next_ofs = m_first_image_file_ofs;
+		for (uint32_t s = 0; s < slice_descs.size(); s++)
+		{
+			const basisu_backend_slice_desc &src = slice_descs[s];
+			basist::basis_slice_desc &dst = m_images_descs[s];
+			clear_obj(dst);
+			dst.m_image_index = src.m_source_file_index;
+			dst.m_level_index = src.m_mip_index;
+
+			if (src.m_alpha)
+				dst.m_flags = dst.m_flags | basist::cSliceDescFlagsHasAlpha;
+			if (src.m_iframe)
+				dst.m_flags = dst.m_flags | basist::cSliceDescFlagsFrameIsIFrame;
+
+			dst.m_orig_width = src.m_orig_width;
+			dst.m_orig_height = src.m_orig_height;
+			dst.m_num_blocks_x = src.m_num_blocks_x;
+			dst.m_num_blocks_y = src.m_num_blocks_y;
+			dst.m_slice_data_crc16 = encoder_output.m_slice_image_crcs[s];
+
+			if (encoder_output.m_slice_image_data[s].size() > UINT32_MAX)
+			{
+				error_printf("basisu_file::create_image_descs: Basis file too large\n");
+				return false;
+			}
+
+			const uint32_t image_size = (uint32_t)encoder_output.m_slice_image_data[s].size();
+			dst.m_file_ofs = (uint32_t)next_ofs;
+			dst.m_file_size = image_size;
+			next_ofs += image_size;	// kept in 64 bits so the overflow test below is meaningful
+			if (next_ofs > UINT32_MAX)
+			{
+				error_printf("basisu_file::create_image_descs: Basis file too large\n");
+				return false;
+			}
+		}
+
+		assert(next_ofs == m_total_file_size);
+		return true;
+	}
+
+	// Appends every file section to m_comp_data in on-disk order, asserting that each one lands at its
+	// precomputed offset. Empty sections are skipped so element 0 of an empty vector is never indexed.
+	void basisu_file::create_comp_data(const basisu_backend_output &encoder_output)
+	{
+		const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc;
+		append_vector(m_comp_data, reinterpret_cast<const uint8_t *>(&m_header), sizeof(m_header));
+		assert(m_comp_data.size() == m_slice_descs_file_ofs);
+		if (m_images_descs.size())	// no slices -> nothing to append, and [0] would be out of range
+			append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&m_images_descs[0]), m_images_descs.size() * sizeof(m_images_descs[0]));
+
+		if (!encoder_output.m_uses_global_codebooks)
+		{
+			if (encoder_output.m_endpoint_palette.size())
+			{
+				assert(m_comp_data.size() == m_endpoint_cb_file_ofs);
+				append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_endpoint_palette[0]), encoder_output.m_endpoint_palette.size());
+			}
+			if (encoder_output.m_selector_palette.size())
+			{
+				assert(m_comp_data.size() == m_selector_cb_file_ofs);
+				append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_selector_palette[0]), encoder_output.m_selector_palette.size());
+			}
+		}
+
+		if (encoder_output.m_slice_image_tables.size())
+		{
+			assert(m_comp_data.size() == m_tables_file_ofs);
+			append_vector(m_comp_data, reinterpret_cast<const uint8_t*>(&encoder_output.m_slice_image_tables[0]), encoder_output.m_slice_image_tables.size());
+		}
+
+		assert(m_comp_data.size() == m_first_image_file_ofs);
+		for (uint32_t i = 0; i < slice_descs.size(); i++)
+			if (encoder_output.m_slice_image_data[i].size())	// an empty slice contributes no bytes
+				append_vector(m_comp_data, &encoder_output.m_slice_image_data[i][0], encoder_output.m_slice_image_data[i].size());
+		assert(m_comp_data.size() == m_total_file_size);
+	}
+
+	// Patches the header stored at the start of m_comp_data: data size and CRCs first, then signature and version.
+	void basisu_file::fixup_crcs()
+	{
+		const size_t hdr_size = sizeof(basist::basis_file_header);
+		const size_t payload_size = m_total_file_size - hdr_size;
+		basist::basis_file_header *pHeader = reinterpret_cast<basist::basis_file_header *>(&m_comp_data[0]);
+		pHeader->m_data_size = payload_size;
+		pHeader->m_data_crc16 = basist::crc16(&m_comp_data[0] + hdr_size, payload_size, 0);	// CRC of every byte after the header
+		pHeader->m_header_crc16 = basist::crc16(&pHeader->m_data_size, hdr_size - BASISU_OFFSETOF(basist::basis_file_header, m_data_size), 0);	// CRC of the header bytes from m_data_size to the end of the header
+		pHeader->m_sig = basist::basis_file_header::cBASISSigValue;
+		pHeader->m_ver = BASIS_FILE_VERSION;
+	}
+
+	// Builds the complete file image in m_comp_data from the backend output:
+	//   1. checks that the result stays below the size limit imposed by the format's 32-bit fields,
+	//   2. assigns the file offset of every section,
+	//   3. fills in the header and slice descriptors, concatenates the sections and fixes up the CRCs.
+	// Returns false if the file would be too large or the slice descriptors cannot be created.
+	bool basisu_file::init(const basisu_backend_output &encoder_output, basist::basis_texture_type tex_type, uint32_t userdata0, uint32_t userdata1, bool y_flipped, uint32_t us_per_frame)
+	{
+		clear();
+
+		const basisu_backend_slice_desc_vec &slice_descs = encoder_output.m_slice_desc;
+		const bool global_codebooks = encoder_output.m_uses_global_codebooks;
+
+		// The Basis file uses 32-bit fields for lots of stuff, so make sure it's not too large.
+		// The codebooks only count towards the size when they are actually stored in this file.
+		uint64_t check_size = (uint64_t)sizeof(basist::basis_file_header) + (uint64_t)sizeof(basist::basis_slice_desc) * slice_descs.size() +
+			(uint64_t)encoder_output.m_slice_image_tables.size();
+		if (!global_codebooks)
+			check_size += (uint64_t)encoder_output.m_endpoint_palette.size() + (uint64_t)encoder_output.m_selector_palette.size();
+		if (check_size >= 0xFFFF0000ULL)
+		{
+			error_printf("basisu_file::init: File is too large!\n");
+			return false;
+		}
+
+		// Sections follow each other in file order; a section that is not stored keeps offset 0.
+		m_header_file_ofs = 0;
+		m_slice_descs_file_ofs = sizeof(basist::basis_file_header);
+		const uint32_t slice_descs_end = (uint32_t)(m_slice_descs_file_ofs + sizeof(basist::basis_slice_desc) * (uint32_t)slice_descs.size());
+
+		m_endpoint_cb_file_ofs = 0;
+		m_selector_cb_file_ofs = 0;
+		m_tables_file_ofs = 0;
+		m_first_image_file_ofs = slice_descs_end;
+
+		// ETC1S places codebooks (unless global) and tables before the slice data; otherwise those offsets stay 0.
+		if (encoder_output.m_tex_format == basist::basis_tex_format::cETC1S)
+		{
+			m_tables_file_ofs = slice_descs_end;
+			if (!global_codebooks)
+			{
+				m_endpoint_cb_file_ofs = slice_descs_end;
+				m_selector_cb_file_ofs = m_endpoint_cb_file_ofs + (uint32_t)encoder_output.m_endpoint_palette.size();
+				m_tables_file_ofs = m_selector_cb_file_ofs + (uint32_t)encoder_output.m_selector_palette.size();
+			}
+			m_first_image_file_ofs = m_tables_file_ofs + (uint32_t)encoder_output.m_slice_image_tables.size();
+		}
+
+		// Add up the slice payloads in 64 bits so the size limit can be checked before narrowing to 32 bits.
+		uint64_t total_file_size = m_first_image_file_ofs;
+		for (uint32_t i = 0; i < encoder_output.m_slice_image_data.size(); i++)
+			total_file_size += encoder_output.m_slice_image_data[i].size();
+		if (total_file_size >= 0xFFFF0000ULL)
+		{
+			error_printf("basisu_file::init: File is too large!\n");
+			return false;
+		}
+
+		m_total_file_size = (uint32_t)total_file_size;
+
+		// Offsets and total size are now final: build the header and descriptors, then assemble the bytes.
+		create_header(encoder_output, tex_type, userdata0, userdata1, y_flipped, us_per_frame);
+
+		if (!create_image_descs(encoder_output))
+			return false;
+
+		create_comp_data(encoder_output);
+
+		// Runs last: it patches the header bytes that create_comp_data() just copied into m_comp_data.
+		fixup_crcs();
+
+		return true;
+	}
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_basis_file.h b/thirdparty/basis_universal/encoder/basisu_basis_file.h
new file mode 100644
index 0000000000..98498a0121
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_basis_file.h
@@ -0,0 +1,70 @@
+// basisu_basis_file.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu_file_headers.h"
+#include "basisu_backend.h"
+
+namespace basisu
+{
+	// Builds the byte image of a .basis file from a basisu_backend_output: call init(), then get_compressed_data().
+	class basisu_file
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(basisu_file);
+
+	public:
+		// Start from a fully cleared state so no offset or header field is left indeterminate before init().
+		basisu_file() { clear(); }
+
+		// Empties the output buffer and the slice descriptors, clears the header and zeroes every offset.
+		void clear()
+		{
+			m_comp_data.clear();
+			clear_obj(m_header);
+			m_images_descs.clear();
+
+			m_header_file_ofs = 0;
+			m_slice_descs_file_ofs = 0;
+			m_endpoint_cb_file_ofs = 0;
+			m_selector_cb_file_ofs = 0;
+			m_tables_file_ofs = 0;
+			m_first_image_file_ofs = 0;
+			m_total_file_size = 0;
+		}
+
+		// Rebuilds the file image from scratch; returns false if the result would be too large.
+		bool init(const basisu_backend_output& encoder_output, basist::basis_texture_type tex_type, uint32_t userdata0, uint32_t userdata1, bool y_flipped, uint32_t us_per_frame);
+		const uint8_vec &get_compressed_data() const { return m_comp_data; }
+
+	private:
+		basist::basis_file_header m_header;
+		basisu::vector<basist::basis_slice_desc> m_images_descs;
+		uint8_vec m_comp_data;
+
+		// Byte offset of each section within the file, followed by the total file size.
+		uint32_t m_header_file_ofs;
+		uint32_t m_slice_descs_file_ofs;
+		uint32_t m_endpoint_cb_file_ofs;
+		uint32_t m_selector_cb_file_ofs;
+		uint32_t m_tables_file_ofs;
+		uint32_t m_first_image_file_ofs;
+		uint32_t m_total_file_size;
+
+		void create_header(const basisu_backend_output& encoder_output,  basist::basis_texture_type tex_type, uint32_t userdata0, uint32_t userdata1, bool y_flipped, uint32_t us_per_frame);
+		bool create_image_descs(const basisu_backend_output& encoder_output);
+		void create_comp_data(const basisu_backend_output& encoder_output);
+		void fixup_crcs();
+	};
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
new file mode 100644
index 0000000000..06aa7eb8b1
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.cpp
@@ -0,0 +1,1984 @@
+// File: basisu_bc7enc.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_bc7enc.h"
+
+#ifdef _DEBUG
+#define BC7ENC_CHECK_OVERALL_ERROR 1
+#else
+#define BC7ENC_CHECK_OVERALL_ERROR 0
+#endif
+
+using namespace basist;
+
+namespace basisu
+{
+
+// Helpers
+// Writes (r, g, b, a) into *pRes, passing each component through clampi(v, 0, 255) first. Returns pRes.
+static inline color_quad_u8 *color_quad_u8_set_clamped(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a)
+{
+	const int32_t comps[4] = { r, g, b, a };
+	for (uint32_t i = 0; i < 4; i++)
+		pRes->m_c[i] = (uint8_t)clampi(comps[i], 0, 255);
+	return pRes;
+}
+
+// Writes (r, g, b, a) into *pRes without clamping; asserts that every component is already in [0, 255]. Returns pRes.
+static inline color_quad_u8 *color_quad_u8_set(color_quad_u8 *pRes, int32_t r, int32_t g, int32_t b, int32_t a)
+{
+	// OR-ing the components together catches any negative or >255 value with a single comparison.
+	assert((uint32_t)(r | g | b | a) <= 255);
+	const int32_t comps[4] = { r, g, b, a };
+	for (uint32_t i = 0; i < 4; i++)
+		pRes->m_c[i] = (uint8_t)comps[i];
+	return pRes;
+}
+
+// Returns true when any of the four components of *pLHS differs from the matching component of *pRHS.
+static inline bc7enc_bool color_quad_u8_notequals(const color_quad_u8 *pLHS, const color_quad_u8 *pRHS)
+{
+	return !((pLHS->m_c[0] == pRHS->m_c[0]) &&
+		(pLHS->m_c[1] == pRHS->m_c[1]) &&
+		(pLHS->m_c[2] == pRHS->m_c[2]) &&
+		(pLHS->m_c[3] == pRHS->m_c[3]));
+}
+// Sets all four components of *pV to x. Returns pV.
+static inline bc7enc_vec4F*vec4F_set_scalar(bc7enc_vec4F*pV, float x)
+{
+	for (uint32_t i = 0; i < 4; i++)
+		pV->m_c[i] = x;
+	return pV;
+}
+
+// Sets *pV to (x, y, z, w). Returns pV.
+static inline bc7enc_vec4F*vec4F_set(bc7enc_vec4F*pV, float x, float y, float z, float w)
+{
+	const float comps[4] = { x, y, z, w };
+	for (uint32_t i = 0; i < 4; i++)
+		pV->m_c[i] = comps[i];
+	return pV;
+}
+
+// Applies saturate() to every component of *pV in place. Returns pV.
+static inline bc7enc_vec4F*vec4F_saturate_in_place(bc7enc_vec4F*pV)
+{
+	for (uint32_t i = 0; i < 4; i++)
+		pV->m_c[i] = saturate(pV->m_c[i]);
+	return pV;
+}
+
+// Returns a copy of *pV with saturate() applied to every component.
+static inline bc7enc_vec4F vec4F_saturate(const bc7enc_vec4F*pV)
+{
+	bc7enc_vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = saturate(pV->m_c[i]);
+	return res;
+}
+
+// Converts the four 8-bit components of *pC to floats (no scaling).
+static inline bc7enc_vec4F vec4F_from_color(const color_quad_u8 *pC)
+{
+	bc7enc_vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = (float)pC->m_c[i];
+	return res;
+}
+
+// Component-wise *pLHS + *pRHS.
+static inline bc7enc_vec4F vec4F_add(const bc7enc_vec4F*pLHS, const bc7enc_vec4F*pRHS)
+{
+	bc7enc_vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = pLHS->m_c[i] + pRHS->m_c[i];
+	return res;
+}
+
+// Component-wise *pLHS - *pRHS.
+static inline bc7enc_vec4F vec4F_sub(const bc7enc_vec4F*pLHS, const bc7enc_vec4F*pRHS)
+{
+	bc7enc_vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = pLHS->m_c[i] - pRHS->m_c[i];
+	return res;
+}
+
+// Four-component dot product, accumulated left to right starting from the component-0 product.
+static inline float vec4F_dot(const bc7enc_vec4F*pLHS, const bc7enc_vec4F*pRHS)
+{
+	float sum = pLHS->m_c[0] * pRHS->m_c[0];
+	for (uint32_t i = 1; i < 4; i++)
+		sum += pLHS->m_c[i] * pRHS->m_c[i];
+	return sum;
+}
+
+// Returns *pLHS with every component multiplied by s.
+static inline bc7enc_vec4F vec4F_mul(const bc7enc_vec4F*pLHS, float s)
+{
+	bc7enc_vec4F res;
+	for (uint32_t i = 0; i < 4; i++)
+		res.m_c[i] = pLHS->m_c[i] * s;
+	return res;
+}
+
+// Scales *pV to unit length in place; a vector whose squared length is exactly 0 is left untouched. Returns pV.
+static inline bc7enc_vec4F* vec4F_normalize_in_place(bc7enc_vec4F*pV)
+{
+	float len_sq = pV->m_c[0] * pV->m_c[0];
+	for (uint32_t i = 1; i < 4; i++)
+		len_sq += pV->m_c[i] * pV->m_c[i];
+
+	if (len_sq != 0.0f)
+	{
+		const float inv_len = 1.0f / sqrtf(len_sq);
+		for (uint32_t i = 0; i < 4; i++)
+			pV->m_c[i] *= inv_len;
+	}
+	return pV;
+}
+
+// Precomputed weight constants used during the least squares fit of endpoints.
+// Each table holds one group of four floats per interpolation weight w (w expressed as a fraction in [0, 1]):
+//   { w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w }
+// The array size is written as <number of weights> * 4.
+// NOTE(review): these look like the data handed to the compute_least_squares_endpoints_*() functions
+// as pSelector_weights (which read components 0..3 in exactly this order) - confirm at the call sites.
+
+// 2 weight levels (w = 0, 1).
+const float g_bc7_weights1x[2 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
+
+// 4 weight levels.
+const float g_bc7_weights2x[4 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.107666f, 0.220459f, 0.451416f, 0.328125f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
+
+// 8 weight levels.
+const float g_bc7_weights3x[8 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.177979f, 0.243896f, 0.334229f, 0.421875f, 0.334229f, 0.243896f, 0.177979f, 0.578125f, 0.516602f, 0.202148f,
+	0.079102f, 0.718750f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
+
+// 16 weight levels (BC7 spacing).
+const float g_bc7_weights4x[16 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.019775f, 0.120850f, 0.738525f, 0.140625f, 0.041260f, 0.161865f, 0.635010f, 0.203125f, 0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f,
+	0.451416f, 0.328125f, 0.165039f, 0.241211f, 0.352539f, 0.406250f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.352539f, 0.241211f, 0.165039f, 0.593750f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f,
+	0.635010f, 0.161865f, 0.041260f, 0.796875f, 0.738525f, 0.120850f, 0.019775f, 0.859375f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
+
+// 16 weight levels (ASTC spacing; differs from the BC7 table in several entries).
+const float g_astc_weights4x[16 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.015625f, 0.109375f, 0.765625f, 0.125000f, 0.035156f, 0.152344f, 0.660156f, 0.187500f, 0.070557f, 0.195068f, 0.539307f, 0.265625f, 0.107666f, 0.220459f,
+	0.451416f, 0.328125f, 0.152588f, 0.238037f, 0.371338f, 0.390625f, 0.205322f, 0.247803f, 0.299072f, 0.453125f, 0.299072f, 0.247803f, 0.205322f, 0.546875f, 0.371338f, 0.238037f, 0.152588f, 0.609375f, 0.451416f, 0.220459f, 0.107666f, 0.671875f, 0.539307f, 0.195068f, 0.070557f, 0.734375f,
+	0.660156f, 0.152344f, 0.035156f, 0.812500f, 0.765625f, 0.109375f, 0.015625f, 0.875000f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
+
+// 32 weight levels (ASTC spacing).
+const float g_astc_weights5x[32 * 4] = { 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000977f, 0.030273f, 0.938477f, 0.031250f, 0.003906f, 0.058594f, 0.878906f, 0.062500f, 0.008789f, 0.084961f, 0.821289f,
+	0.093750f, 0.015625f, 0.109375f, 0.765625f, 0.125000f, 0.024414f, 0.131836f, 0.711914f, 0.156250f, 0.035156f, 0.152344f, 0.660156f, 0.187500f, 0.047852f, 0.170898f, 0.610352f, 0.218750f, 0.062500f, 0.187500f,
+	0.562500f, 0.250000f, 0.079102f, 0.202148f, 0.516602f, 0.281250f, 0.097656f, 0.214844f, 0.472656f, 0.312500f, 0.118164f, 0.225586f, 0.430664f, 0.343750f, 0.140625f, 0.234375f, 0.390625f, 0.375000f, 0.165039f,
+	0.241211f, 0.352539f, 0.406250f, 0.191406f, 0.246094f, 0.316406f, 0.437500f, 0.219727f, 0.249023f, 0.282227f, 0.468750f, 0.282227f, 0.249023f, 0.219727f, 0.531250f, 0.316406f, 0.246094f, 0.191406f, 0.562500f,
+	0.352539f, 0.241211f, 0.165039f, 0.593750f, 0.390625f, 0.234375f, 0.140625f, 0.625000f, 0.430664f, 0.225586f, 0.118164f, 0.656250f, 0.472656f, 0.214844f, 0.097656f, 0.687500f, 0.516602f, 0.202148f, 0.079102f,
+	0.718750f, 0.562500f, 0.187500f, 0.062500f, 0.750000f, 0.610352f, 0.170898f, 0.047852f, 0.781250f, 0.660156f, 0.152344f, 0.035156f, 0.812500f, 0.711914f, 0.131836f, 0.024414f, 0.843750f, 0.765625f, 0.109375f,
+	0.015625f, 0.875000f, 0.821289f, 0.084961f, 0.008789f, 0.906250f, 0.878906f, 0.058594f, 0.003906f, 0.937500f, 0.938477f, 0.030273f, 0.000977f, 0.968750f, 1.000000f, 0.000000f, 0.000000f, 1.000000f };
+
+// 3 weight levels (w = 0, 0.5, 1); the middle row spells out the formula for w = 0.5.
+const float g_astc_weights_3levelsx[3 * 4] = {
+	0.000000f, 0.000000f, 1.000000f, 0.000000f,
+	.5f * .5f, (1.0f - .5f) * .5f, (1.0f - .5f) * (1.0f - .5f), .5f,
+	1.000000f, 0.000000f, 0.000000f, 1.000000f };
+
+// Single-color lookup tables. All of them are filled by bc7enc_compress_block_init(): for each 8-bit
+// value c, an entry stores the (m_lo, m_hi) endpoint pair whose interpolation at one fixed selector
+// index comes closest to c, plus the squared error (m_error). The *_OPTIMAL_INDEX constant next to
+// each table is that fixed selector index; the pack_*_to_one_color() functions memset the selectors to it.
+
+// BC7 mode 1: 6-bit endpoints plus a shared p-bit, interpolated with g_bc7_weights3.
+static endpoint_err g_bc7_mode_1_optimal_endpoints[256][2]; // [c][pbit]
+static const uint32_t BC7ENC_MODE_1_OPTIMAL_INDEX = 2;
+
+// ASTC 16-level endpoints (4-bit value replicated to 8 bits), interpolated with g_bc7_weights3.
+static endpoint_err g_astc_4bit_3bit_optimal_endpoints[256]; // [c]
+static const uint32_t BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX = 2;
+
+// ASTC 16-level endpoints (4-bit value replicated to 8 bits), interpolated with g_bc7_weights2.
+static endpoint_err g_astc_4bit_2bit_optimal_endpoints[256]; // [c]
+static const uint32_t BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX = 1;
+
+// ASTC endpoint range 7 (12 levels), interpolated with g_bc7_weights2. m_lo/m_hi are sorted-order positions.
+static endpoint_err g_astc_range7_2bit_optimal_endpoints[256]; // [c]
+static const uint32_t BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX = 1;
+
+// ASTC endpoint range 13 (48 levels), interpolated with g_astc_weights4. m_lo/m_hi are sorted-order positions.
+static endpoint_err g_astc_range13_4bit_optimal_endpoints[256]; // [c]
+static const uint32_t BC7ENC_ASTC_RANGE13_4BIT_OPTIMAL_INDEX = 2;
+
+// ASTC endpoint range 13 (48 levels), interpolated with g_bc7_weights2. m_lo/m_hi are sorted-order positions.
+static endpoint_err g_astc_range13_2bit_optimal_endpoints[256]; // [c]
+static const uint32_t BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX = 1;
+
+// ASTC endpoint range 11 (32 levels), interpolated with g_astc_weights5. m_lo/m_hi are sorted-order positions.
+static endpoint_err g_astc_range11_5bit_optimal_endpoints[256]; // [c]
+static const uint32_t BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX = 13; // not 1, which is optimal, because 26 losslessly maps to BC7 4-bit weights
+
+// Filled by astc_init(). For each valid endpoint range, entry i describes the i-th smallest unquantized
+// endpoint value: m_unquant is that 8-bit value, m_index is the quantized index that produces it.
+// Only the first astc_get_levels(range) entries of a row are written.
+astc_quant_bin g_astc_sorted_order_unquant[BC7ENC_TOTAL_ASTC_RANGES][256]; // [sorted unquantized order]
+
+// Filled by astc_init(). For each valid endpoint range and each 8-bit value v, the sorted-order position
+// (an index into g_astc_sorted_order_unquant[range]) whose m_unquant is closest to v; the lowest position wins ties.
+static uint8_t g_astc_nearest_sorted_index[BC7ENC_TOTAL_ASTC_RANGES][256];
+
+// Builds the per-range ASTC endpoint tables used by the rest of this file.
+// For every range accepted by astc_is_valid_endpoint_range():
+//  - g_astc_sorted_order_unquant[range][i] receives the i-th smallest unquantized endpoint value
+//    (m_unquant) together with the quantized index that produces it (m_index).
+//  - g_astc_nearest_sorted_index[range][v] receives, for each 8-bit value v, the sorted position
+//    whose unquantized value is closest to v (the lowest position wins ties).
+// Rows belonging to invalid ranges are left untouched.
+static void astc_init()
+{
+	for (uint32_t range = 0; range < BC7ENC_TOTAL_ASTC_RANGES; range++)
+	{
+		if (!astc_is_valid_endpoint_range(range))
+			continue;
+
+		const uint32_t levels = astc_get_levels(range);
+
+		// The scratch array and the global tables all have 256 entries per range.
+		assert(levels <= 256);
+
+		// Pack (unquantized value << 8) | quantized index into a single integer, so that one plain
+		// sort orders the entries by unquantized value and breaks ties by quantized index.
+		uint32_t vals[256];
+		for (uint32_t i = 0; i < levels; i++)
+			vals[i] = (unquant_astc_endpoint_val(i, range) << 8) | i;
+
+		std::sort(vals, vals + levels);
+
+		for (uint32_t i = 0; i < levels; i++)
+		{
+			const uint32_t order = vals[i] & 0xFF;
+			const uint32_t unq = vals[i] >> 8;
+
+			g_astc_sorted_order_unquant[range][i].m_unquant = (uint8_t)unq;
+			g_astc_sorted_order_unquant[range][i].m_index = (uint8_t)order;
+		} // i
+
+		// For each possible 8-bit value, find the sorted entry with the smallest absolute difference.
+		for (uint32_t i = 0; i < 256; i++)
+		{
+			uint32_t best_index = 0;
+			int best_err = INT32_MAX;
+
+			for (uint32_t j = 0; j < levels; j++)
+			{
+				// Subtract as signed ints so the difference never depends on unsigned wrap-around.
+				int err = (int)g_astc_sorted_order_unquant[range][j].m_unquant - (int)i;
+				if (err < 0)
+					err = -err;
+				if (err < best_err)
+				{
+					best_err = err;
+					best_index = j;
+				}
+			}
+
+			g_astc_nearest_sorted_index[range][i] = (uint8_t)best_index;
+		} // i
+	} // range
+}
+
+// Blends two 8-bit endpoint values l and h using a weight w in [0, 64] (0 selects l, 64 selects h).
+// Each endpoint is first widened to 16 bits by replicating its byte, the blend is rounded at
+// 6 fractional bits, and the top 8 bits of the 16-bit result are returned.
+// This is for linear values, not sRGB.
+static inline uint32_t astc_interpolate(uint32_t l, uint32_t h, uint32_t w)
+{
+	const uint32_t lo16 = l | (l << 8);
+	const uint32_t hi16 = h | (h << 8);
+
+	// (x >> 6) >> 8 collapsed into one shift; identical for unsigned values.
+	return (lo16 * (64 - w) + hi16 * w + 32) >> 14;
+}
+
+// Exhaustively tries every (lo, hi) pair of ASTC endpoint levels and returns the pair whose value,
+// interpolated by astc_interpolate() with the given weight, has the smallest squared error to c.
+//  c               - 8-bit target value (0..255).
+//  num_levels      - number of endpoint levels to try for both lo and hi.
+//  pSorted_unquant - row of g_astc_sorted_order_unquant used to unquantize a level, or NULL to
+//                    unquantize a 4-bit level by bit replication ((v << 4) | v).
+//  weight          - interpolation weight passed to astc_interpolate().
+// lo is the outer loop and only a strictly smaller error replaces the current best, so the first
+// pair found wins ties. For c and the interpolated value both in 0..255 the squared error is at
+// most 255 * 255 = 65025 < UINT16_MAX, so the first pair is always accepted and m_lo/m_hi are always set.
+static endpoint_err init_find_best_astc_endpoint_pair(int c, uint32_t num_levels, const astc_quant_bin *pSorted_unquant, uint32_t weight)
+{
+	endpoint_err best;
+	best.m_error = (uint16_t)UINT16_MAX;
+
+	for (uint32_t l = 0; l < num_levels; l++)
+	{
+		const uint32_t low = pSorted_unquant ? pSorted_unquant[l].m_unquant : ((l << 4) | l);
+
+		for (uint32_t h = 0; h < num_levels; h++)
+		{
+			const uint32_t high = pSorted_unquant ? pSorted_unquant[h].m_unquant : ((h << 4) | h);
+
+			const int k = astc_interpolate(low, high, weight);
+			const int err = (k - c) * (k - c);
+
+			if (err < best.m_error)
+			{
+				best.m_error = (uint16_t)err;
+				best.m_lo = (uint8_t)l;
+				best.m_hi = (uint8_t)h;
+			}
+		} // h
+	} // l
+
+	return best;
+}
+
+// Initializes every lookup table used for optimal single color compression: the ASTC tables built
+// by astc_init(), the BC7 mode 1 table, and the ASTC single-color tables. Must be called before encoding.
+void bc7enc_compress_block_init()
+{
+	// Must run first: the range 7/11/13 searches below read g_astc_sorted_order_unquant.
+	astc_init();
+
+	// BC7 666.1: 6-bit endpoints plus one p-bit shared by both endpoints.
+	for (int c = 0; c < 256; c++)
+	{
+		for (uint32_t lp = 0; lp < 2; lp++)
+		{
+			endpoint_err best;
+			best.m_error = (uint16_t)UINT16_MAX;
+			for (uint32_t l = 0; l < 64; l++)
+			{
+				// Append the p-bit, then expand the 7-bit value to 8 bits by replicating its top bit.
+				uint32_t low = ((l << 1) | lp) << 1;
+				low |= (low >> 7);
+				for (uint32_t h = 0; h < 64; h++)
+				{
+					uint32_t high = ((h << 1) | lp) << 1;
+					high |= (high >> 7);
+					const int k = (low * (64 - g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX]) + high * g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX] + 32) >> 6;
+					const int err = (k - c) * (k - c);
+					if (err < best.m_error)
+					{
+						best.m_error = (uint16_t)err;
+						best.m_lo = (uint8_t)l;
+						best.m_hi = (uint8_t)h;
+					}
+				} // h
+			} // l
+			g_bc7_mode_1_optimal_endpoints[c][lp] = best;
+		} // lp
+	} // c
+
+	// ASTC tables: each entry is an independent exhaustive search, so all tables are filled per c.
+	for (int c = 0; c < 256; c++)
+	{
+		// ASTC [0,15] 3-bit
+		g_astc_4bit_3bit_optimal_endpoints[c] = init_find_best_astc_endpoint_pair(c, 16, NULL, g_bc7_weights3[BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX]);
+
+		// ASTC [0,15] 2-bit
+		g_astc_4bit_2bit_optimal_endpoints[c] = init_find_best_astc_endpoint_pair(c, 16, NULL, g_bc7_weights2[BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX]);
+
+		// ASTC range 7 [0,11] 2-bit
+		g_astc_range7_2bit_optimal_endpoints[c] = init_find_best_astc_endpoint_pair(c, 12, &g_astc_sorted_order_unquant[7][0], g_bc7_weights2[BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX]);
+
+		// ASTC range 13 [0,47] 4-bit
+		g_astc_range13_4bit_optimal_endpoints[c] = init_find_best_astc_endpoint_pair(c, 48, &g_astc_sorted_order_unquant[13][0], g_astc_weights4[BC7ENC_ASTC_RANGE13_4BIT_OPTIMAL_INDEX]);
+
+		// ASTC range 13 [0,47] 2-bit
+		g_astc_range13_2bit_optimal_endpoints[c] = init_find_best_astc_endpoint_pair(c, 48, &g_astc_sorted_order_unquant[13][0], g_bc7_weights2[BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX]);
+
+		// ASTC range 11 [0,31] 5-bit
+		g_astc_range11_5bit_optimal_endpoints[c] = init_find_best_astc_endpoint_pair(c, 32, &g_astc_sorted_order_unquant[11][0], g_astc_weights5[BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX]);
+	} // c
+}
+
+// Given N pixels and the selector chosen for each, solves per channel (R, G, B, A) for the low (*pXl)
+// and high (*pXh) endpoints that minimise the squared error, using the 2x2 normal equations:
+// http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf
+//  pSelector_weights[sel].m_c[0..2] feed the three distinct entries of the symmetric 2x2 matrix and
+//  m_c[3] is the weight applied to each pixel.
+// A singular system (determinant exactly 0) yields all-zero endpoints. Afterwards, any channel whose
+// low result is < 0 or high result is > 255 and whose pixels all share one value is set to that value.
+static void compute_least_squares_endpoints_rgba(uint32_t N, const uint8_t *pSelectors, const bc7enc_vec4F* pSelector_weights, bc7enc_vec4F* pXl, bc7enc_vec4F* pXh, const color_quad_u8 *pColors)
+{
+	const uint32_t cNumComps = 4;
+
+	double z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+	double weighted_sum[4] = { 0.0f, 0.0f, 0.0f, 0.0f }; // sum of w * color, per channel
+	double plain_sum[4] = { 0.0f, 0.0f, 0.0f, 0.0f };    // sum of color, per channel
+
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const bc7enc_vec4F &sel_weights = pSelector_weights[pSelectors[i]];
+		z00 += sel_weights.m_c[0];
+		z10 += sel_weights.m_c[1];
+		z11 += sel_weights.m_c[2];
+
+		const float w = sel_weights.m_c[3];
+		for (uint32_t c = 0; c < cNumComps; c++)
+		{
+			weighted_sum[c] += w * pColors[i].m_c[c];
+			plain_sum[c] += pColors[i].m_c[c];
+		}
+	}
+
+	// The matrix is symmetric, so the off-diagonal entries are equal.
+	const double z01 = z10;
+
+	double det = z00 * z11 - z01 * z10;
+	if (det != 0.0f)
+		det = 1.0f / det;
+
+	// Inverse of the 2x2 matrix (all zeros when the determinant was 0).
+	const double iz00 = z11 * det;
+	const double iz01 = -z01 * det;
+	const double iz10 = -z10 * det;
+	const double iz11 = z00 * det;
+
+	for (uint32_t c = 0; c < cNumComps; c++)
+	{
+		const double q00 = weighted_sum[c];
+		const double q10 = plain_sum[c] - q00;
+
+		pXl->m_c[c] = (float)(iz00 * q00 + iz01 * q10);
+		pXh->m_c[c] = (float)(iz10 * q00 + iz11 * q10);
+	}
+
+	for (uint32_t c = 0; c < cNumComps; c++)
+	{
+		const bool out_of_range = (pXl->m_c[c] < 0.0f) || (pXh->m_c[c] > 255.0f);
+		if (!out_of_range)
+			continue;
+
+		uint32_t lo_v = UINT32_MAX, hi_v = 0;
+		for (uint32_t i = 0; i < N; i++)
+		{
+			lo_v = minimumu(lo_v, pColors[i].m_c[c]);
+			hi_v = maximumu(hi_v, pColors[i].m_c[c]);
+		}
+
+		// Only a constant channel is corrected; otherwise the solved values are kept as is.
+		if (lo_v == hi_v)
+		{
+			pXl->m_c[c] = (float)lo_v;
+			pXh->m_c[c] = (float)hi_v;
+		}
+	}
+}
+
+// RGB-only least squares endpoint solve: for N pixels and their selectors, computes per channel
+// (R, G, B) the low (*pXl) and high (*pXh) endpoints from the 2x2 normal equations. The alpha
+// component of both outputs is set to 255.
+//  pSelector_weights[sel].m_c[0..2] feed the three distinct entries of the symmetric 2x2 matrix and
+//  m_c[3] is the weight applied to each pixel.
+// A singular system (determinant exactly 0) yields all-zero RGB endpoints. Afterwards, any RGB channel
+// whose low result is < 0 or high result is > 255 and whose pixels all share one value is set to that value.
+static void compute_least_squares_endpoints_rgb(uint32_t N, const uint8_t *pSelectors, const bc7enc_vec4F*pSelector_weights, bc7enc_vec4F*pXl, bc7enc_vec4F*pXh, const color_quad_u8 *pColors)
+{
+	const uint32_t cNumComps = 3;
+
+	double z00 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+	double weighted_sum[3] = { 0.0f, 0.0f, 0.0f }; // sum of w * color, per channel
+	double plain_sum[3] = { 0.0f, 0.0f, 0.0f };    // sum of color, per channel
+
+	for (uint32_t i = 0; i < N; i++)
+	{
+		const bc7enc_vec4F &sel_weights = pSelector_weights[pSelectors[i]];
+		z00 += sel_weights.m_c[0];
+		z10 += sel_weights.m_c[1];
+		z11 += sel_weights.m_c[2];
+
+		const float w = sel_weights.m_c[3];
+		for (uint32_t c = 0; c < cNumComps; c++)
+		{
+			weighted_sum[c] += w * pColors[i].m_c[c];
+			plain_sum[c] += pColors[i].m_c[c];
+		}
+	}
+
+	// The matrix is symmetric, so the off-diagonal entries are equal.
+	const double z01 = z10;
+
+	double det = z00 * z11 - z01 * z10;
+	if (det != 0.0f)
+		det = 1.0f / det;
+
+	// Inverse of the 2x2 matrix (all zeros when the determinant was 0).
+	const double iz00 = z11 * det;
+	const double iz01 = -z01 * det;
+	const double iz10 = -z10 * det;
+	const double iz11 = z00 * det;
+
+	for (uint32_t c = 0; c < cNumComps; c++)
+	{
+		const double q00 = weighted_sum[c];
+		const double q10 = plain_sum[c] - q00;
+
+		pXl->m_c[c] = (float)(iz00 * q00 + iz01 * q10);
+		pXh->m_c[c] = (float)(iz10 * q00 + iz11 * q10);
+	}
+	pXl->m_c[3] = 255.0f;
+	pXh->m_c[3] = 255.0f;
+
+	for (uint32_t c = 0; c < cNumComps; c++)
+	{
+		const bool out_of_range = (pXl->m_c[c] < 0.0f) || (pXh->m_c[c] > 255.0f);
+		if (!out_of_range)
+			continue;
+
+		uint32_t lo_v = UINT32_MAX, hi_v = 0;
+		for (uint32_t i = 0; i < N; i++)
+		{
+			lo_v = minimumu(lo_v, pColors[i].m_c[c]);
+			hi_v = maximumu(hi_v, pColors[i].m_c[c]);
+		}
+
+		// Only a constant channel is corrected; otherwise the solved values are kept as is.
+		if (lo_v == hi_v)
+		{
+			pXl->m_c[c] = (float)lo_v;
+			pXh->m_c[c] = (float)hi_v;
+		}
+	}
+}
+
+// Expands a quantized endpoint color *pC to 8 bits per component.
+//  - When pParams->m_astc_endpoint_range is non-zero, each component is treated as a position in
+//    g_astc_sorted_order_unquant[range] and replaced by that entry's m_unquant value.
+//  - Otherwise each component holds n = m_comp_bits (+1 if m_has_pbits) significant bits; it is shifted
+//    to the top of the byte and its high bits are replicated into the vacated low bits.
+static inline color_quad_u8 scale_color(const color_quad_u8* pC, const color_cell_compressor_params* pParams)
+{
+	color_quad_u8 results;
+
+	if (pParams->m_astc_endpoint_range)
+	{
+		for (uint32_t comp = 0; comp < 4; comp++)
+			results.m_c[comp] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pC->m_c[comp]].m_unquant;
+
+		return results;
+	}
+
+	const uint32_t n = pParams->m_comp_bits + (pParams->m_has_pbits ? 1 : 0);
+	assert((n >= 4) && (n <= 8));
+
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		const uint32_t shifted = pC->m_c[comp] << (8 - n);
+		const uint32_t v = shifted | (shifted >> n);
+		assert(v <= 255);
+		results.m_c[comp] = (uint8_t)(v);
+	}
+
+	return results;
+}
+
+// Weighted squared distance between the RGB components of *pE1 and *pE2 (alpha is ignored).
+//  perceptual - when set, both colors are first converted to a luma value (109/366/37 weighting of
+//               R/G/B) and two chroma differences ((R << 9) - luma, (B << 9) - luma), scaled back
+//               down by 8 bits; weights[0..2] then apply to luma, the R-based chroma and the
+//               B-based chroma. Otherwise weights[0..2] apply directly to the R, G, B differences.
+//  weights    - per-term multipliers; weights[3] is not read here.
+// Each term is widened to 64 bits before multiplying so that large weights cannot wrap around in
+// 32-bit arithmetic before the value reaches the 64-bit return type.
+static inline uint64_t compute_color_distance_rgb(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc_bool perceptual, const uint32_t weights[4])
+{
+	int dr, dg, db;
+
+	if (perceptual)
+	{
+		const int l1 = pE1->m_c[0] * 109 + pE1->m_c[1] * 366 + pE1->m_c[2] * 37;
+		const int cr1 = ((int)pE1->m_c[0] << 9) - l1;
+		const int cb1 = ((int)pE1->m_c[2] << 9) - l1;
+		const int l2 = pE2->m_c[0] * 109 + pE2->m_c[1] * 366 + pE2->m_c[2] * 37;
+		const int cr2 = ((int)pE2->m_c[0] << 9) - l2;
+		const int cb2 = ((int)pE2->m_c[2] << 9) - l2;
+		dr = (l1 - l2) >> 8;
+		dg = (cr1 - cr2) >> 8;
+		db = (cb1 - cb2) >> 8;
+	}
+	else
+	{
+		dr = (int)pE1->m_c[0] - (int)pE2->m_c[0];
+		dg = (int)pE1->m_c[1] - (int)pE2->m_c[1];
+		db = (int)pE1->m_c[2] - (int)pE2->m_c[2];
+	}
+
+	return (uint64_t)weights[0] * (uint32_t)(dr * dr) + (uint64_t)weights[1] * (uint32_t)(dg * dg) + (uint64_t)weights[2] * (uint32_t)(db * db);
+}
+
+// Weighted squared distance between *pE1 and *pE2 over all four components: the RGB part comes from
+// compute_color_distance_rgb() and the squared alpha difference is added, scaled by weights[3].
+// The alpha term is widened to 64 bits before multiplying so a large weights[3] cannot wrap in 32 bits.
+static inline uint64_t compute_color_distance_rgba(const color_quad_u8 *pE1, const color_quad_u8 *pE2, bc7enc_bool perceptual, const uint32_t weights[4])
+{
+	const int da = (int)pE1->m_c[3] - (int)pE2->m_c[3];
+	return compute_color_distance_rgb(pE1, pE2, perceptual, weights) + ((uint64_t)weights[3] * (uint32_t)(da * da));
+}
+
+// Encodes a block as the single color (r, g, b) using the BC7 mode 1 lookup table.
+// Chooses the shared p-bit with the lowest summed table error (p-bit 0 wins ties), stores the table's
+// endpoints (alpha 0) and p-bits in *pResults, sets every selector to BC7ENC_MODE_1_OPTIMAL_INDEX,
+// and returns the total RGB error of the resulting color against pParams->m_pPixels
+// (also stored in pResults->m_best_overall_err).
+static uint64_t pack_mode1_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors)
+{
+	uint32_t lowest_err = UINT_MAX;
+	uint32_t chosen_pbit = 0;
+
+	for (uint32_t pbit = 0; pbit < 2; pbit++)
+	{
+		const uint32_t table_err = g_bc7_mode_1_optimal_endpoints[r][pbit].m_error + g_bc7_mode_1_optimal_endpoints[g][pbit].m_error + g_bc7_mode_1_optimal_endpoints[b][pbit].m_error;
+		if (table_err >= lowest_err)
+			continue;
+
+		lowest_err = table_err;
+		chosen_pbit = pbit;
+	}
+
+	const endpoint_err *pOpt[3] =
+	{
+		&g_bc7_mode_1_optimal_endpoints[r][chosen_pbit],
+		&g_bc7_mode_1_optimal_endpoints[g][chosen_pbit],
+		&g_bc7_mode_1_optimal_endpoints[b][chosen_pbit]
+	};
+
+	color_quad_u8_set(&pResults->m_low_endpoint, pOpt[0]->m_lo, pOpt[1]->m_lo, pOpt[2]->m_lo, 0);
+	color_quad_u8_set(&pResults->m_high_endpoint, pOpt[0]->m_hi, pOpt[1]->m_hi, pOpt[2]->m_hi, 0);
+	pResults->m_pbits[0] = chosen_pbit;
+	pResults->m_pbits[1] = 0;
+
+	memset(pSelectors, BC7ENC_MODE_1_OPTIMAL_INDEX, pParams->m_num_pixels);
+
+	// Reconstruct the color the stored endpoints decode to at the fixed selector.
+	color_quad_u8 decoded;
+	for (uint32_t comp = 0; comp < 3; comp++)
+	{
+		// Append the p-bit, then expand the 7-bit value to 8 bits by replicating its top bit.
+		uint32_t lo8 = ((pResults->m_low_endpoint.m_c[comp] << 1) | pResults->m_pbits[0]) << 1;
+		lo8 |= (lo8 >> 7);
+
+		uint32_t hi8 = ((pResults->m_high_endpoint.m_c[comp] << 1) | pResults->m_pbits[0]) << 1;
+		hi8 |= (hi8 >> 7);
+
+		decoded.m_c[comp] = (uint8_t)((lo8 * (64 - g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX]) + hi8 * g_bc7_weights3[BC7ENC_MODE_1_OPTIMAL_INDEX] + 32) >> 6);
+	}
+	decoded.m_c[3] = 255;
+
+	uint64_t err_sum = 0;
+	for (uint32_t px = 0; px < pParams->m_num_pixels; px++)
+		err_sum += compute_color_distance_rgb(&decoded, &pParams->m_pPixels[px], pParams->m_perceptual, pParams->m_weights);
+
+	pResults->m_best_overall_err = err_sum;
+
+	return err_sum;
+}
+
+// Encodes a block as the single color (r, g, b) using the ASTC 4-bit-endpoint / 3-bit-weight table.
+// Stores the table's endpoints (alpha 0) in *pResults, zeroes the p-bits, maps every endpoint
+// component to its ASTC index through g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range],
+// sets every selector to BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX, and returns the total RGB error against
+// pParams->m_pPixels (also stored in pResults->m_best_overall_err).
+static uint64_t pack_astc_4bit_3bit_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors)
+{
+	const endpoint_err *pOpt[3] =
+	{
+		&g_astc_4bit_3bit_optimal_endpoints[r],
+		&g_astc_4bit_3bit_optimal_endpoints[g],
+		&g_astc_4bit_3bit_optimal_endpoints[b]
+	};
+
+	color_quad_u8_set(&pResults->m_low_endpoint, pOpt[0]->m_lo, pOpt[1]->m_lo, pOpt[2]->m_lo, 0);
+	color_quad_u8_set(&pResults->m_high_endpoint, pOpt[0]->m_hi, pOpt[1]->m_hi, pOpt[2]->m_hi, 0);
+	pResults->m_pbits[0] = 0;
+	pResults->m_pbits[1] = 0;
+
+	const astc_quant_bin *pBins = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range];
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		pResults->m_astc_low_endpoint.m_c[comp] = pBins[pResults->m_low_endpoint.m_c[comp]].m_index;
+		pResults->m_astc_high_endpoint.m_c[comp] = pBins[pResults->m_high_endpoint.m_c[comp]].m_index;
+	}
+
+	memset(pSelectors, BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX, pParams->m_num_pixels);
+
+	// Reconstruct the color the stored endpoints decode to at the fixed selector.
+	color_quad_u8 decoded;
+	for (uint32_t comp = 0; comp < 3; comp++)
+	{
+		const uint32_t lo4 = pResults->m_low_endpoint.m_c[comp];
+		const uint32_t hi4 = pResults->m_high_endpoint.m_c[comp];
+
+		// Replicate the 4-bit value into both nibbles to get the 8-bit endpoint.
+		decoded.m_c[comp] = (uint8_t)astc_interpolate((lo4 << 4) | lo4, (hi4 << 4) | hi4, g_bc7_weights3[BC7ENC_ASTC_4BIT_3BIT_OPTIMAL_INDEX]);
+	}
+	decoded.m_c[3] = 255;
+
+	uint64_t err_sum = 0;
+	for (uint32_t px = 0; px < pParams->m_num_pixels; px++)
+		err_sum += compute_color_distance_rgb(&decoded, &pParams->m_pPixels[px], pParams->m_perceptual, pParams->m_weights);
+
+	pResults->m_best_overall_err = err_sum;
+
+	return err_sum;
+}
+
+// Encodes a block as the single color (r, g, b, a) using the ASTC 4-bit-endpoint / 2-bit-weight table.
+// Stores the table's endpoints for all four channels in *pResults, zeroes the p-bits, maps every
+// endpoint component to its ASTC index through g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range],
+// sets every selector to BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX, and returns the total RGBA error against
+// pParams->m_pPixels (also stored in pResults->m_best_overall_err).
+static uint64_t pack_astc_4bit_2bit_to_one_color_rgba(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint32_t a, uint8_t *pSelectors)
+{
+	const endpoint_err *pOpt[4] =
+	{
+		&g_astc_4bit_2bit_optimal_endpoints[r],
+		&g_astc_4bit_2bit_optimal_endpoints[g],
+		&g_astc_4bit_2bit_optimal_endpoints[b],
+		&g_astc_4bit_2bit_optimal_endpoints[a]
+	};
+
+	color_quad_u8_set(&pResults->m_low_endpoint, pOpt[0]->m_lo, pOpt[1]->m_lo, pOpt[2]->m_lo, pOpt[3]->m_lo);
+	color_quad_u8_set(&pResults->m_high_endpoint, pOpt[0]->m_hi, pOpt[1]->m_hi, pOpt[2]->m_hi, pOpt[3]->m_hi);
+	pResults->m_pbits[0] = 0;
+	pResults->m_pbits[1] = 0;
+
+	const astc_quant_bin *pBins = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range];
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		pResults->m_astc_low_endpoint.m_c[comp] = pBins[pResults->m_low_endpoint.m_c[comp]].m_index;
+		pResults->m_astc_high_endpoint.m_c[comp] = pBins[pResults->m_high_endpoint.m_c[comp]].m_index;
+	}
+
+	memset(pSelectors, BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX, pParams->m_num_pixels);
+
+	// Reconstruct the color the stored endpoints decode to at the fixed selector.
+	color_quad_u8 decoded;
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		const uint32_t lo4 = pResults->m_low_endpoint.m_c[comp];
+		const uint32_t hi4 = pResults->m_high_endpoint.m_c[comp];
+
+		// Replicate the 4-bit value into both nibbles to get the 8-bit endpoint.
+		decoded.m_c[comp] = (uint8_t)astc_interpolate((lo4 << 4) | lo4, (hi4 << 4) | hi4, g_bc7_weights2[BC7ENC_ASTC_4BIT_2BIT_OPTIMAL_INDEX]);
+	}
+
+	uint64_t err_sum = 0;
+	for (uint32_t px = 0; px < pParams->m_num_pixels; px++)
+		err_sum += compute_color_distance_rgba(&decoded, &pParams->m_pPixels[px], pParams->m_perceptual, pParams->m_weights);
+
+	pResults->m_best_overall_err = err_sum;
+
+	return err_sum;
+}
+
+// Encodes a block as the single color (r, g, b) using the ASTC range 7 / 2-bit-weight table.
+// Requires (asserted) m_astc_endpoint_range == 7 and m_num_selector_weights == 4.
+// Stores the table's endpoints (alpha 0) in *pResults as sorted-order positions, zeroes the p-bits,
+// maps every endpoint component to its ASTC index, sets every selector to
+// BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX, and returns the total RGB error against pParams->m_pPixels
+// (also stored in pResults->m_best_overall_err).
+static uint64_t pack_astc_range7_2bit_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors)
+{
+	assert(pParams->m_astc_endpoint_range == 7 && pParams->m_num_selector_weights == 4);
+
+	const endpoint_err *pOpt[3] =
+	{
+		&g_astc_range7_2bit_optimal_endpoints[r],
+		&g_astc_range7_2bit_optimal_endpoints[g],
+		&g_astc_range7_2bit_optimal_endpoints[b]
+	};
+
+	color_quad_u8_set(&pResults->m_low_endpoint, pOpt[0]->m_lo, pOpt[1]->m_lo, pOpt[2]->m_lo, 0);
+	color_quad_u8_set(&pResults->m_high_endpoint, pOpt[0]->m_hi, pOpt[1]->m_hi, pOpt[2]->m_hi, 0);
+	pResults->m_pbits[0] = 0;
+	pResults->m_pbits[1] = 0;
+
+	const astc_quant_bin *pBins = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range];
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		pResults->m_astc_low_endpoint.m_c[comp] = pBins[pResults->m_low_endpoint.m_c[comp]].m_index;
+		pResults->m_astc_high_endpoint.m_c[comp] = pBins[pResults->m_high_endpoint.m_c[comp]].m_index;
+	}
+
+	memset(pSelectors, BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX, pParams->m_num_pixels);
+
+	// Reconstruct the color the stored endpoints decode to at the fixed selector.
+	color_quad_u8 decoded;
+	for (uint32_t comp = 0; comp < 3; comp++)
+	{
+		const uint32_t lo8 = g_astc_sorted_order_unquant[7][pResults->m_low_endpoint.m_c[comp]].m_unquant;
+		const uint32_t hi8 = g_astc_sorted_order_unquant[7][pResults->m_high_endpoint.m_c[comp]].m_unquant;
+
+		decoded.m_c[comp] = (uint8_t)astc_interpolate(lo8, hi8, g_bc7_weights2[BC7ENC_ASTC_RANGE7_2BIT_OPTIMAL_INDEX]);
+	}
+	decoded.m_c[3] = 255;
+
+	uint64_t err_sum = 0;
+	for (uint32_t px = 0; px < pParams->m_num_pixels; px++)
+		err_sum += compute_color_distance_rgb(&decoded, &pParams->m_pPixels[px], pParams->m_perceptual, pParams->m_weights);
+
+	pResults->m_best_overall_err = err_sum;
+
+	return err_sum;
+}
+
+// Encodes a block as the single color (r, g, b) using the ASTC range 13 / 2-bit-weight table.
+// Requires (asserted) m_astc_endpoint_range == 13, m_num_selector_weights == 4 and no alpha.
+// Stores the table's RGB endpoints in *pResults as sorted-order positions with alpha set to 47 (the
+// last of the 48 positions), zeroes the p-bits, maps every endpoint component to its ASTC index, sets
+// every selector to BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX, and returns the total RGB error against
+// pParams->m_pPixels (also stored in pResults->m_best_overall_err).
+static uint64_t pack_astc_range13_2bit_to_one_color(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t *pSelectors)
+{
+	assert(pParams->m_astc_endpoint_range == 13 && pParams->m_num_selector_weights == 4 && !pParams->m_has_alpha);
+
+	const endpoint_err *pOpt[3] =
+	{
+		&g_astc_range13_2bit_optimal_endpoints[r],
+		&g_astc_range13_2bit_optimal_endpoints[g],
+		&g_astc_range13_2bit_optimal_endpoints[b]
+	};
+
+	color_quad_u8_set(&pResults->m_low_endpoint, pOpt[0]->m_lo, pOpt[1]->m_lo, pOpt[2]->m_lo, 47);
+	color_quad_u8_set(&pResults->m_high_endpoint, pOpt[0]->m_hi, pOpt[1]->m_hi, pOpt[2]->m_hi, 47);
+	pResults->m_pbits[0] = 0;
+	pResults->m_pbits[1] = 0;
+
+	const astc_quant_bin *pBins = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range];
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		pResults->m_astc_low_endpoint.m_c[comp] = pBins[pResults->m_low_endpoint.m_c[comp]].m_index;
+		pResults->m_astc_high_endpoint.m_c[comp] = pBins[pResults->m_high_endpoint.m_c[comp]].m_index;
+	}
+
+	memset(pSelectors, BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX, pParams->m_num_pixels);
+
+	// Reconstruct the color (all four components) the stored endpoints decode to at the fixed selector.
+	color_quad_u8 decoded;
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		const uint32_t lo8 = g_astc_sorted_order_unquant[13][pResults->m_low_endpoint.m_c[comp]].m_unquant;
+		const uint32_t hi8 = g_astc_sorted_order_unquant[13][pResults->m_high_endpoint.m_c[comp]].m_unquant;
+
+		decoded.m_c[comp] = (uint8_t)astc_interpolate(lo8, hi8, g_bc7_weights2[BC7ENC_ASTC_RANGE13_2BIT_OPTIMAL_INDEX]);
+	}
+
+	uint64_t err_sum = 0;
+	for (uint32_t px = 0; px < pParams->m_num_pixels; px++)
+		err_sum += compute_color_distance_rgb(&decoded, &pParams->m_pPixels[px], pParams->m_perceptual, pParams->m_weights);
+
+	pResults->m_best_overall_err = err_sum;
+
+	return err_sum;
+}
+
+// Encodes a block as the single color (r, g, b) using the ASTC range 11 / 5-bit-weight table.
+// Requires (asserted) m_astc_endpoint_range == 11, m_num_selector_weights == 32 and no alpha.
+// Stores the table's RGB endpoints in *pResults as sorted-order positions with alpha set to 31 (the
+// last of the 32 positions), zeroes the p-bits, maps every endpoint component to its ASTC index, sets
+// every selector to BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX, and returns the total RGB error against
+// pParams->m_pPixels (also stored in pResults->m_best_overall_err).
+static uint64_t pack_astc_range11_5bit_to_one_color(const color_cell_compressor_params* pParams, color_cell_compressor_results* pResults, uint32_t r, uint32_t g, uint32_t b, uint8_t* pSelectors)
+{
+	assert(pParams->m_astc_endpoint_range == 11 && pParams->m_num_selector_weights == 32 && !pParams->m_has_alpha);
+
+	const endpoint_err* pOpt[3] =
+	{
+		&g_astc_range11_5bit_optimal_endpoints[r],
+		&g_astc_range11_5bit_optimal_endpoints[g],
+		&g_astc_range11_5bit_optimal_endpoints[b]
+	};
+
+	color_quad_u8_set(&pResults->m_low_endpoint, pOpt[0]->m_lo, pOpt[1]->m_lo, pOpt[2]->m_lo, 31);
+	color_quad_u8_set(&pResults->m_high_endpoint, pOpt[0]->m_hi, pOpt[1]->m_hi, pOpt[2]->m_hi, 31);
+	pResults->m_pbits[0] = 0;
+	pResults->m_pbits[1] = 0;
+
+	const astc_quant_bin* pBins = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range];
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		pResults->m_astc_low_endpoint.m_c[comp] = pBins[pResults->m_low_endpoint.m_c[comp]].m_index;
+		pResults->m_astc_high_endpoint.m_c[comp] = pBins[pResults->m_high_endpoint.m_c[comp]].m_index;
+	}
+
+	memset(pSelectors, BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX, pParams->m_num_pixels);
+
+	// Reconstruct the color (all four components) the stored endpoints decode to at the fixed selector.
+	color_quad_u8 decoded;
+	for (uint32_t comp = 0; comp < 4; comp++)
+	{
+		const uint32_t lo8 = g_astc_sorted_order_unquant[11][pResults->m_low_endpoint.m_c[comp]].m_unquant;
+		const uint32_t hi8 = g_astc_sorted_order_unquant[11][pResults->m_high_endpoint.m_c[comp]].m_unquant;
+
+		decoded.m_c[comp] = (uint8_t)astc_interpolate(lo8, hi8, g_astc_weights5[BC7ENC_ASTC_RANGE11_5BIT_OPTIMAL_INDEX]);
+	}
+
+	uint64_t err_sum = 0;
+	for (uint32_t px = 0; px < pParams->m_num_pixels; px++)
+		err_sum += compute_color_distance_rgb(&decoded, &pParams->m_pPixels[px], pParams->m_perceptual, pParams->m_weights);
+
+	pResults->m_best_overall_err = err_sum;
+
+	return err_sum;
+}
+
+static uint64_t evaluate_solution(const color_quad_u8 *pLow, const color_quad_u8 *pHigh, const uint32_t pbits[2], const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
+{ /* Scores one candidate endpoint pair against the block's pixels.
+   *
+   * pLow/pHigh : candidate endpoints at the format's quantized precision (without p-bits).
+   * pbits      : candidate p-bits; only read when pParams->m_has_pbits is set
+   *              (pbits[1] is ignored when m_endpoints_share_pbit is set).
+   * pParams    : block description (pixels, selector weights, flags).
+   * pResults   : best-so-far record. m_pSelectors_temp is always overwritten; the endpoints,
+   *              p-bits, selectors and m_best_overall_err are only replaced when this
+   *              candidate's total error is strictly lower than m_best_overall_err.
+   *
+   * Returns the candidate's total error, whether or not it became the new best.
+   */
+	color_quad_u8 quantMinColor = *pLow;
+	color_quad_u8 quantMaxColor = *pHigh;
+
+	if (pParams->m_has_pbits)
+	{
+		uint32_t minPBit, maxPBit;
+
+		if (pParams->m_endpoints_share_pbit)
+			maxPBit = minPBit = pbits[0];
+		else
+		{
+			minPBit = pbits[0];
+			maxPBit = pbits[1];
+		}
+
+		quantMinColor.m_c[0] = (uint8_t)((pLow->m_c[0] << 1) | minPBit); // append the p-bit as the new LSB of every channel
+		quantMinColor.m_c[1] = (uint8_t)((pLow->m_c[1] << 1) | minPBit);
+		quantMinColor.m_c[2] = (uint8_t)((pLow->m_c[2] << 1) | minPBit);
+		quantMinColor.m_c[3] = (uint8_t)((pLow->m_c[3] << 1) | minPBit);
+
+		quantMaxColor.m_c[0] = (uint8_t)((pHigh->m_c[0] << 1) | maxPBit);
+		quantMaxColor.m_c[1] = (uint8_t)((pHigh->m_c[1] << 1) | maxPBit);
+		quantMaxColor.m_c[2] = (uint8_t)((pHigh->m_c[2] << 1) | maxPBit);
+		quantMaxColor.m_c[3] = (uint8_t)((pHigh->m_c[3] << 1) | maxPBit);
+	}
+
+	color_quad_u8 actualMinColor = scale_color(&quantMinColor, pParams); // quantized endpoint -> color used for interpolation and error
+	color_quad_u8 actualMaxColor = scale_color(&quantMaxColor, pParams);
+
+	const uint32_t N = pParams->m_num_selector_weights;
+	assert(N >= 1 && N <= 32);
+
+	color_quad_u8 weightedColors[32]; // palette: entry k is the color selector k decodes to
+	weightedColors[0] = actualMinColor;
+	weightedColors[N - 1] = actualMaxColor;
+
+	const uint32_t nc = pParams->m_has_alpha ? 4 : 3; // interior palette entries only get their first nc channels written
+	if (pParams->m_astc_endpoint_range)
+	{
+		for (uint32_t i = 1; i < (N - 1); i++)
+		{
+			for (uint32_t j = 0; j < nc; j++)
+				weightedColors[i].m_c[j] = (uint8_t)(astc_interpolate(actualMinColor.m_c[j], actualMaxColor.m_c[j], pParams->m_pSelector_weights[i]));
+		}
+	}
+	else
+	{
+		for (uint32_t i = 1; i < (N - 1); i++)
+			for (uint32_t j = 0; j < nc; j++)
+				weightedColors[i].m_c[j] = (uint8_t)((actualMinColor.m_c[j] * (64 - pParams->m_pSelector_weights[i]) + actualMaxColor.m_c[j] * pParams->m_pSelector_weights[i] + 32) >> 6); // 6-bit weighted blend, rounded
+	}
+
+	const int lr = actualMinColor.m_c[0]; // low endpoint (l*) and endpoint delta (d*), used by the projection-based search below
+	const int lg = actualMinColor.m_c[1];
+	const int lb = actualMinColor.m_c[2];
+	const int dr = actualMaxColor.m_c[0] - lr;
+	const int dg = actualMaxColor.m_c[1] - lg;
+	const int db = actualMaxColor.m_c[2] - lb;
+	
+	uint64_t total_err = 0;
+	
+	if (pParams->m_pForce_selectors) // caller dictates the selectors: only measure the resulting error
+	{
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			const color_quad_u8* pC = &pParams->m_pPixels[i];
+			
+			const uint8_t sel = pParams->m_pForce_selectors[i];
+			assert(sel < N);
+			
+			total_err += (pParams->m_has_alpha ? compute_color_distance_rgba : compute_color_distance_rgb)(&weightedColors[sel], pC, pParams->m_perceptual, pParams->m_weights);
+
+			pResults->m_pSelectors_temp[i] = sel;
+		}
+	}
+	else if (!pParams->m_perceptual) // non-perceptual: project each pixel onto the endpoint axis, then test only the two nearest palette entries
+	{
+		if (pParams->m_has_alpha)
+		{
+			const int la = actualMinColor.m_c[3];
+			const int da = actualMaxColor.m_c[3] - la;
+
+			const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + squarei(da) + .00000125f); // N / |delta|^2; the tiny constant avoids dividing by zero when both endpoints coincide
+
+			for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+			{
+				const color_quad_u8 *pC = &pParams->m_pPixels[i];
+				int r = pC->m_c[0];
+				int g = pC->m_c[1];
+				int b = pC->m_c[2];
+				int a = pC->m_c[3];
+
+				int best_sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db + (a - la) * da) * f + .5f);
+				best_sel = clampi(best_sel, 1, N - 1); // keep >= 1 so best_sel - 1 is a valid palette index
+
+				uint64_t err0 = compute_color_distance_rgba(&weightedColors[best_sel - 1], pC, BC7ENC_FALSE, pParams->m_weights);
+				uint64_t err1 = compute_color_distance_rgba(&weightedColors[best_sel], pC, BC7ENC_FALSE, pParams->m_weights);
+
+				if (err0 == err1)
+				{
+					// Prefer non-interpolation
+					if ((best_sel - 1) == 0)
+						best_sel = 0;
+				}
+				else if (err1 > err0)
+				{
+					err1 = err0; // err1 doubles as the chosen error from here on
+					--best_sel;
+				}
+				total_err += err1;
+								
+				pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
+			}
+		}
+		else
+		{
+			const float f = N / (float)(squarei(dr) + squarei(dg) + squarei(db) + .00000125f); // same projection scale as above, RGB only
+
+			for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+			{
+				const color_quad_u8 *pC = &pParams->m_pPixels[i];
+				int r = pC->m_c[0];
+				int g = pC->m_c[1];
+				int b = pC->m_c[2];
+
+				int sel = (int)((float)((r - lr) * dr + (g - lg) * dg + (b - lb) * db) * f + .5f);
+				sel = clampi(sel, 1, N - 1); // keep >= 1 so sel - 1 is a valid palette index
+
+				uint64_t err0 = compute_color_distance_rgb(&weightedColors[sel - 1], pC, BC7ENC_FALSE, pParams->m_weights);
+				uint64_t err1 = compute_color_distance_rgb(&weightedColors[sel], pC, BC7ENC_FALSE, pParams->m_weights);
+
+				int best_sel = sel;
+				uint64_t best_err = err1;
+				if (err0 == err1)
+				{
+					// Prefer non-interpolation
+					if ((best_sel - 1) == 0)
+						best_sel = 0;
+				}
+				else if (err0 < best_err)
+				{
+					best_err = err0;
+					best_sel = sel - 1;
+				}
+
+				total_err += best_err;
+
+				pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
+			}
+		}
+	}
+	else // perceptual: exhaustive search over all N palette entries for every pixel
+	{
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint64_t best_err = UINT64_MAX;
+			uint32_t best_sel = 0;
+
+			if (pParams->m_has_alpha)
+			{
+				for (uint32_t j = 0; j < N; j++)
+				{
+					uint64_t err = compute_color_distance_rgba(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC_TRUE, pParams->m_weights);
+					if (err < best_err)
+					{
+						best_err = err;
+						best_sel = j;
+					}
+					// Prefer non-interpolation
+					else if ((err == best_err) && (j == (N - 1)))
+						best_sel = j;
+				}
+			}
+			else
+			{
+				for (uint32_t j = 0; j < N; j++)
+				{
+					uint64_t err = compute_color_distance_rgb(&weightedColors[j], &pParams->m_pPixels[i], BC7ENC_TRUE, pParams->m_weights);
+					if (err < best_err)
+					{
+						best_err = err;
+						best_sel = j;
+					}
+					// Prefer non-interpolation
+					else if ((err == best_err) && (j == (N - 1)))
+						best_sel = j;
+				}
+			}
+
+			total_err += best_err;
+
+			pResults->m_pSelectors_temp[i] = (uint8_t)best_sel;
+		}
+	}
+
+	if (total_err < pResults->m_best_overall_err) // strictly better only: ties keep the previously recorded solution
+	{
+		pResults->m_best_overall_err = total_err;
+
+		pResults->m_low_endpoint = *pLow;
+		pResults->m_high_endpoint = *pHigh;
+
+		pResults->m_pbits[0] = pbits[0];
+		pResults->m_pbits[1] = pbits[1];
+
+		memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+	}
+				
+	return total_err;
+}
+
+// Reports whether quantization collapsed an RGB channel: returns true as soon as a channel
+// has identical values in both trial endpoints while the unquantized endpoints (pXl, pXh)
+// differ in that channel. Only channels 0..2 are examined; alpha is ignored.
+static bool areDegenerateEndpoints(color_quad_u8* pTrialMinColor, color_quad_u8* pTrialMaxColor, const bc7enc_vec4F* pXl, const bc7enc_vec4F* pXh)
+{
+	uint32_t chan = 0;
+	while (chan < 3)
+	{
+		const bool collapsed = (pTrialMinColor->m_c[chan] == pTrialMaxColor->m_c[chan]);
+		if (collapsed && (fabs(pXl->m_c[chan] - pXh->m_c[chan]) > 0.0f))
+			return true;
+
+		chan++;
+	}
+
+	return false;
+}
+
+// Pulls apart RGB endpoint channels that quantized to the same value even though the
+// unquantized endpoints differ by more than 0.000125 in that channel.
+//
+// mode            : 255 -> adjustment is steered by flags; 1 -> adjustment is steered by where the
+//                   value sits in the range; any other mode leaves the endpoints untouched.
+// pTrialMinColor,
+// pTrialMaxColor  : quantized endpoints, modified in place (alpha is never touched).
+// pXl, pXh        : the unquantized endpoints the trial colors were derived from.
+// iscale          : largest quantized value a channel may take.
+// flags           : mode 255 only. Bit 0 allows decrementing the min endpoint, bit 1 allows
+//                   incrementing the max endpoint; 0 means no change.
+static void fixDegenerateEndpoints(uint32_t mode, color_quad_u8 *pTrialMinColor, color_quad_u8 *pTrialMaxColor, const bc7enc_vec4F*pXl, const bc7enc_vec4F*pXh, uint32_t iscale, int flags)
+{
+	if ((mode != 255) && (mode != 1))
+		return;
+
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		// A channel is only degenerate when the quantized values coincide while the real ones differ.
+		if (pTrialMinColor->m_c[i] != pTrialMaxColor->m_c[i])
+			continue;
+		if (!(fabs(pXl->m_c[i] - pXh->m_c[i]) > 0.000125f))
+			continue;
+
+		if (mode == 255)
+		{
+			if ((flags & 1) && (pTrialMinColor->m_c[i] > 0))
+				pTrialMinColor->m_c[i]--;
+
+			if ((flags & 2) && (pTrialMaxColor->m_c[i] < iscale))
+				pTrialMaxColor->m_c[i]++;
+		}
+		else
+		{
+			// fix degenerate case where the input collapses to a single colorspace voxel, and we lose all freedom (test with grayscale ramps)
+			if (pTrialMinColor->m_c[i] > (iscale >> 1))
+			{
+				// Upper half of the range: move the min endpoint down. The value is greater than
+				// iscale / 2 and therefore nonzero, so the decrement can never underflow.
+				pTrialMinColor->m_c[i]--;
+			}
+			else
+			{
+				// Lower half of the range: prefer moving the max endpoint up.
+				if (pTrialMaxColor->m_c[i] < iscale)
+					pTrialMaxColor->m_c[i]++;
+				else if (pTrialMinColor->m_c[i] > 0)
+					pTrialMinColor->m_c[i]--;
+			}
+		}
+	}
+}
+
+static uint64_t find_optimal_solution(uint32_t mode, bc7enc_vec4F xl, bc7enc_vec4F xh, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
+{ /* Quantizes the floating-point endpoints xl/xh to the precision of the current format and
+   * hands the resulting candidate(s) to evaluate_solution(), which records them in pResults
+   * when they beat the best error so far.
+   *
+   * Three paths, chosen from pParams:
+   *   - m_astc_endpoint_range != 0 : round to 8 bits, then map through g_astc_nearest_sorted_index;
+   *                                  if a channel collapsed, several adjusted variants are tried.
+   *   - m_has_pbits                : try p-bit 0 and 1 and keep whichever quantization lands closest
+   *                                  to xl/xh (per endpoint, or jointly when the p-bit is shared).
+   *   - otherwise                  : round directly to m_comp_bits bits per channel.
+   *
+   * A candidate equal to the solution already stored in pResults is not re-evaluated, except
+   * when m_best_overall_err is still UINT64_MAX (nothing evaluated yet).
+   *
+   * mode is only forwarded to fixDegenerateEndpoints(). xl and xh are passed by value and
+   * saturated locally. Returns pResults->m_best_overall_err as it stands on exit.
+   */
+	vec4F_saturate_in_place(&xl); vec4F_saturate_in_place(&xh);
+
+	if (pParams->m_astc_endpoint_range)
+	{
+		const uint32_t levels = astc_get_levels(pParams->m_astc_endpoint_range);
+
+		const float scale = 255.0f;
+
+		color_quad_u8 trialMinColor8Bit, trialMaxColor8Bit;
+		color_quad_u8_set_clamped(&trialMinColor8Bit, (int)(xl.m_c[0] * scale + .5f), (int)(xl.m_c[1] * scale + .5f), (int)(xl.m_c[2] * scale + .5f), (int)(xl.m_c[3] * scale + .5f));
+		color_quad_u8_set_clamped(&trialMaxColor8Bit, (int)(xh.m_c[0] * scale + .5f), (int)(xh.m_c[1] * scale + .5f), (int)(xh.m_c[2] * scale + .5f), (int)(xh.m_c[3] * scale + .5f));
+
+		color_quad_u8 trialMinColor, trialMaxColor;
+		for (uint32_t i = 0; i < 4; i++)
+		{
+			trialMinColor.m_c[i] = g_astc_nearest_sorted_index[pParams->m_astc_endpoint_range][trialMinColor8Bit.m_c[i]]; // 8-bit value -> index in this range's sorted order
+			trialMaxColor.m_c[i] = g_astc_nearest_sorted_index[pParams->m_astc_endpoint_range][trialMaxColor8Bit.m_c[i]];
+		}
+
+		if (areDegenerateEndpoints(&trialMinColor, &trialMaxColor, &xl, &xh))
+		{
+			color_quad_u8 trialMinColorOrig(trialMinColor), trialMaxColorOrig(trialMaxColor); // saved so every variant below starts from the same quantized pair
+
+			fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, levels - 1, 1); // variant with flags = 1
+			if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
+				evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
+
+			trialMinColor = trialMinColorOrig;
+			trialMaxColor = trialMaxColorOrig;
+			fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, levels - 1, 0); // variant with flags = 0
+			if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
+				evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
+
+			trialMinColor = trialMinColorOrig;
+			trialMaxColor = trialMaxColorOrig;
+			fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, levels - 1, 2); // variant with flags = 2
+			if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
+				evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
+
+			trialMinColor = trialMinColorOrig;
+			trialMaxColor = trialMaxColorOrig;
+			fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, levels - 1, 3); // variant with flags = 3
+			if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
+				evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
+		}
+		else
+		{
+			if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
+			{
+				evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
+			}
+		}
+
+		for (uint32_t i = 0; i < 4; i++) // refresh the ASTC endpoint indices from whatever solution is now the best
+		{
+			pResults->m_astc_low_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[i]].m_index;
+			pResults->m_astc_high_endpoint.m_c[i] = g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[i]].m_index;
+		}
+	}
+	else if (pParams->m_has_pbits)
+	{
+		const int iscalep = (1 << (pParams->m_comp_bits + 1)) - 1; // max channel value once the p-bit is appended
+		const float scalep = (float)iscalep;
+
+		const int32_t totalComps = pParams->m_has_alpha ? 4 : 3;
+
+		uint32_t best_pbits[2];
+		color_quad_u8 bestMinColor, bestMaxColor;
+
+		if (!pParams->m_endpoints_share_pbit)
+		{
+			float best_err0 = 1e+9; // each endpoint picks its own p-bit independently
+			float best_err1 = 1e+9;
+
+			for (int p = 0; p < 2; p++)
+			{
+				color_quad_u8 xMinColor, xMaxColor;
+
+				// Notes: The pbit controls which quantization intervals are selected.
+				// total_levels=2^(comp_bits+1), where comp_bits=4 for mode 0, etc.
+				// pbit 0: v=(b*2)/(total_levels-1), pbit 1: v=(b*2+1)/(total_levels-1) where b is the component bin from [0,total_levels/2-1] and v is the [0,1] component value
+				// rearranging you get for pbit 0: b=floor(v*(total_levels-1)/2+.5)
+				// rearranging you get for pbit 1: b=floor((v*(total_levels-1)-1)/2+.5)
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
+					xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
+				}
+
+				color_quad_u8 scaledLow = scale_color(&xMinColor, pParams);
+				color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams);
+
+				float err0 = 0, err1 = 0;
+				for (int i = 0; i < totalComps; i++)
+				{
+					err0 += squaref(scaledLow.m_c[i] - xl.m_c[i] * 255.0f); // squared distance measured on the 0..255 scale
+					err1 += squaref(scaledHigh.m_c[i] - xh.m_c[i] * 255.0f);
+				}
+
+				if (err0 < best_err0)
+				{
+					best_err0 = err0;
+					best_pbits[0] = p;
+
+					bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1; // strip the p-bit again; it is carried separately in best_pbits
+					bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1;
+					bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1;
+					bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1;
+				}
+
+				if (err1 < best_err1)
+				{
+					best_err1 = err1;
+					best_pbits[1] = p;
+
+					bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1;
+					bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1;
+					bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1;
+					bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1;
+				}
+			}
+		}
+		else
+		{
+			// Endpoints share pbits
+			float best_err = 1e+9;
+
+			for (int p = 0; p < 2; p++)
+			{
+				color_quad_u8 xMinColor, xMaxColor;
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
+					xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh.m_c[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p));
+				}
+
+				color_quad_u8 scaledLow = scale_color(&xMinColor, pParams);
+				color_quad_u8 scaledHigh = scale_color(&xMaxColor, pParams);
+
+				float err = 0;
+				for (int i = 0; i < totalComps; i++)
+					err += squaref((scaledLow.m_c[i] / 255.0f) - xl.m_c[i]) + squaref((scaledHigh.m_c[i] / 255.0f) - xh.m_c[i]); // combined error of both endpoints, on the 0..1 scale
+
+				if (err < best_err)
+				{
+					best_err = err;
+					best_pbits[0] = p;
+					best_pbits[1] = p;
+					for (uint32_t j = 0; j < 4; j++)
+					{
+						bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1;
+						bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1;
+					}
+				}
+			}
+		}
+						
+		fixDegenerateEndpoints(mode, &bestMinColor, &bestMaxColor, &xl, &xh, iscalep >> 1, 0);
+
+		if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&bestMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&bestMaxColor, &pResults->m_high_endpoint) || (best_pbits[0] != pResults->m_pbits[0]) || (best_pbits[1] != pResults->m_pbits[1]))
+			evaluate_solution(&bestMinColor, &bestMaxColor, best_pbits, pParams, pResults);
+	}
+	else
+	{
+		const int iscale = (1 << pParams->m_comp_bits) - 1; // max channel value at m_comp_bits bits
+		const float scale = (float)iscale;
+
+		color_quad_u8 trialMinColor, trialMaxColor;
+		color_quad_u8_set_clamped(&trialMinColor, (int)(xl.m_c[0] * scale + .5f), (int)(xl.m_c[1] * scale + .5f), (int)(xl.m_c[2] * scale + .5f), (int)(xl.m_c[3] * scale + .5f));
+		color_quad_u8_set_clamped(&trialMaxColor, (int)(xh.m_c[0] * scale + .5f), (int)(xh.m_c[1] * scale + .5f), (int)(xh.m_c[2] * scale + .5f), (int)(xh.m_c[3] * scale + .5f));
+
+		fixDegenerateEndpoints(mode, &trialMinColor, &trialMaxColor, &xl, &xh, iscale, 0);
+
+		if ((pResults->m_best_overall_err == UINT64_MAX) || color_quad_u8_notequals(&trialMinColor, &pResults->m_low_endpoint) || color_quad_u8_notequals(&trialMaxColor, &pResults->m_high_endpoint))
+			evaluate_solution(&trialMinColor, &trialMaxColor, pResults->m_pbits, pParams, pResults);
+	}
+
+	return pResults->m_best_overall_err;
+}
+
+// Debug consistency check for an ASTC result: rebuilds the selector palette from the ASTC
+// endpoint indices stored in pResults, recomputes the block's total error from
+// pResults->m_pSelectors, and asserts that it matches pResults->m_best_overall_err.
+// It also asserts that m_astc_*_endpoint and m_*_endpoint unquantize to the same values.
+// All checks are asserts, so the function verifies nothing when NDEBUG is defined.
+void check_best_overall_error(const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults)
+{
+	const uint32_t n = pParams->m_num_selector_weights;
+
+	// n must be at least 1: with n == 0 the unsigned n - 1 used below would wrap around
+	// and index far outside colors[].
+	assert(n >= 1 && n <= 32);
+
+	color_quad_u8 colors[32];
+	for (uint32_t c = 0; c < 4; c++)
+	{
+		colors[0].m_c[c] = g_astc_unquant[pParams->m_astc_endpoint_range][pResults->m_astc_low_endpoint.m_c[c]].m_unquant;
+		assert(colors[0].m_c[c] == g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_low_endpoint.m_c[c]].m_unquant);
+
+		colors[n - 1].m_c[c] = g_astc_unquant[pParams->m_astc_endpoint_range][pResults->m_astc_high_endpoint.m_c[c]].m_unquant;
+		assert(colors[n - 1].m_c[c] == g_astc_sorted_order_unquant[pParams->m_astc_endpoint_range][pResults->m_high_endpoint.m_c[c]].m_unquant);
+	}
+
+	// Interior palette entries are interpolated between the two unquantized endpoints.
+	for (uint32_t i = 1; i < n - 1; i++)
+		for (uint32_t c = 0; c < 4; c++)
+			colors[i].m_c[c] = (uint8_t)astc_interpolate(colors[0].m_c[c], colors[n - 1].m_c[c], pParams->m_pSelector_weights[i]);
+
+	uint64_t total_err = 0;
+	for (uint32_t p = 0; p < pParams->m_num_pixels; p++)
+	{
+		// A selector outside the palette would read an uninitialized colors[] entry.
+		assert(pResults->m_pSelectors[p] < n);
+
+		const color_quad_u8 &orig = pParams->m_pPixels[p];
+		const color_quad_u8 &packed = colors[pResults->m_pSelectors[p]];
+
+		if (pParams->m_has_alpha)
+			total_err += compute_color_distance_rgba(&orig, &packed, pParams->m_perceptual, pParams->m_weights);
+		else
+			total_err += compute_color_distance_rgb(&orig, &packed, pParams->m_perceptual, pParams->m_weights);
+	}
+	assert(total_err == pResults->m_best_overall_err);
+
+	// Keeps builds with NDEBUG free of an unused-variable warning.
+	(void)total_err;
+}
+
+// Tests whether every pixel of the block shares one RGB value (alpha is not compared).
+// r, g and b always receive the components of pixel 0, whatever the result.
+static bool is_solid_rgb(const color_cell_compressor_params *pParams, uint32_t &r, uint32_t &g, uint32_t &b)
+{
+	const color_quad_u8 *pFirst = &pParams->m_pPixels[0];
+	r = pFirst->m_c[0];
+	g = pFirst->m_c[1];
+	b = pFirst->m_c[2];
+
+	// Stop at the first pixel that deviates from pixel 0.
+	for (uint32_t idx = 1; idx < pParams->m_num_pixels; idx++)
+	{
+		const color_quad_u8 *pCur = &pParams->m_pPixels[idx];
+		if ((pCur->m_c[0] != r) || (pCur->m_c[1] != g) || (pCur->m_c[2] != b))
+			return false;
+	}
+
+	return true;
+}
+
+// Tests whether every pixel of the block shares one RGBA value.
+// r, g, b and a always receive the components of pixel 0, whatever the result.
+static bool is_solid_rgba(const color_cell_compressor_params *pParams, uint32_t &r, uint32_t &g, uint32_t &b, uint32_t &a)
+{
+	const color_quad_u8 *pFirst = &pParams->m_pPixels[0];
+	r = pFirst->m_c[0];
+	g = pFirst->m_c[1];
+	b = pFirst->m_c[2];
+	a = pFirst->m_c[3];
+
+	// Stop at the first pixel that deviates from pixel 0.
+	for (uint32_t idx = 1; idx < pParams->m_num_pixels; idx++)
+	{
+		const color_quad_u8 *pCur = &pParams->m_pPixels[idx];
+		if ((pCur->m_c[0] != r) || (pCur->m_c[1] != g) || (pCur->m_c[2] != b) || (pCur->m_c[3] != a))
+			return false;
+	}
+
+	return true;
+}
+
+uint64_t color_cell_compression(uint32_t mode, const color_cell_compressor_params *pParams, color_cell_compressor_results *pResults, const bc7enc_compress_block_params *pComp_params)
+{
+	if (!pParams->m_astc_endpoint_range)
+	{
+		assert((mode == 6) || (!pParams->m_has_alpha));
+	}
+	assert(pParams->m_num_selector_weights >= 1 && pParams->m_num_selector_weights <= 32);
+	assert(pParams->m_pSelector_weights[0] == 0);
+	assert(pParams->m_pSelector_weights[pParams->m_num_selector_weights - 1] == 64);
+
+	pResults->m_best_overall_err = UINT64_MAX;
+
+	uint32_t cr, cg, cb, ca;
+
+	// If the partition's colors are all the same, then just pack them as a single color.
+	if (!pParams->m_pForce_selectors)
+	{
+		if (mode == 1)
+		{
+			if (is_solid_rgb(pParams, cr, cg, cb))
+				return pack_mode1_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors);
+		}
+		else if ((pParams->m_astc_endpoint_range == 8) && (pParams->m_num_selector_weights == 8) && (!pParams->m_has_alpha))
+		{
+			if (is_solid_rgb(pParams, cr, cg, cb))
+				return pack_astc_4bit_3bit_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors);
+		}
+		else if ((pParams->m_astc_endpoint_range == 7) && (pParams->m_num_selector_weights == 4) && (!pParams->m_has_alpha))
+		{
+			if (is_solid_rgb(pParams, cr, cg, cb))
+				return pack_astc_range7_2bit_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors);
+		}
+		else if ((pParams->m_astc_endpoint_range == 8) && (pParams->m_num_selector_weights == 4) && (pParams->m_has_alpha))
+		{
+			if (is_solid_rgba(pParams, cr, cg, cb, ca))
+				return pack_astc_4bit_2bit_to_one_color_rgba(pParams, pResults, cr, cg, cb, ca, pResults->m_pSelectors);
+		}
+		else if ((pParams->m_astc_endpoint_range == 13) && (pParams->m_num_selector_weights == 4) && (!pParams->m_has_alpha))
+		{
+			if (is_solid_rgb(pParams, cr, cg, cb))
+				return pack_astc_range13_2bit_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors);
+		}
+		else if ((pParams->m_astc_endpoint_range == 11) && (pParams->m_num_selector_weights == 32) && (!pParams->m_has_alpha))
+		{
+			if (is_solid_rgb(pParams, cr, cg, cb))
+				return pack_astc_range11_5bit_to_one_color(pParams, pResults, cr, cg, cb, pResults->m_pSelectors);
+		}
+	}
+
+	// Compute partition's mean color and principal axis.
+	bc7enc_vec4F meanColor, axis;
+	vec4F_set_scalar(&meanColor, 0.0f);
+
+	for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+	{
+		bc7enc_vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);
+		meanColor = vec4F_add(&meanColor, &color);
+	}
+				
+	bc7enc_vec4F meanColorScaled = vec4F_mul(&meanColor, 1.0f / (float)(pParams->m_num_pixels));
+
+	meanColor = vec4F_mul(&meanColor, 1.0f / (float)(pParams->m_num_pixels * 255.0f));
+	vec4F_saturate_in_place(&meanColor);
+	
+	if (pParams->m_has_alpha)
+	{
+		// Use incremental PCA for RGBA PCA, because it's simple.
+		vec4F_set_scalar(&axis, 0.0f);
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			bc7enc_vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);
+			color = vec4F_sub(&color, &meanColorScaled);
+			bc7enc_vec4F a = vec4F_mul(&color, color.m_c[0]);
+			bc7enc_vec4F b = vec4F_mul(&color, color.m_c[1]);
+			bc7enc_vec4F c = vec4F_mul(&color, color.m_c[2]);
+			bc7enc_vec4F d = vec4F_mul(&color, color.m_c[3]);
+			bc7enc_vec4F n = i ? axis : color;
+			vec4F_normalize_in_place(&n);
+			axis.m_c[0] += vec4F_dot(&a, &n);
+			axis.m_c[1] += vec4F_dot(&b, &n);
+			axis.m_c[2] += vec4F_dot(&c, &n);
+			axis.m_c[3] += vec4F_dot(&d, &n);
+		}
+		vec4F_normalize_in_place(&axis);
+	}
+	else
+	{
+		// Use covar technique for RGB PCA, because it doesn't require per-pixel normalization.
+		float cov[6] = { 0, 0, 0, 0, 0, 0 };
+
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			const color_quad_u8 *pV = &pParams->m_pPixels[i];
+			float r = pV->m_c[0] - meanColorScaled.m_c[0];
+			float g = pV->m_c[1] - meanColorScaled.m_c[1];
+			float b = pV->m_c[2] - meanColorScaled.m_c[2];
+			cov[0] += r*r; cov[1] += r*g; cov[2] += r*b; cov[3] += g*g; cov[4] += g*b; cov[5] += b*b;
+		}
+
+		float xr = .9f, xg = 1.0f, xb = .7f;
+		for (uint32_t iter = 0; iter < 3; iter++)
+		{
+			float r = xr * cov[0] + xg * cov[1] + xb * cov[2];
+			float g = xr * cov[1] + xg * cov[3] + xb * cov[4];
+			float b = xr * cov[2] + xg * cov[4] + xb * cov[5];
+
+			float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
+			if (m > 1e-10f)
+			{
+				m = 1.0f / m;
+				r *= m; g *= m; b *= m;
+			}
+
+			xr = r; xg = g; xb = b;
+		}
+
+		float len = xr * xr + xg * xg + xb * xb;
+		if (len < 1e-10f)
+			vec4F_set_scalar(&axis, 0.0f);
+		else
+		{
+			len = 1.0f / sqrtf(len);
+			xr *= len; xg *= len; xb *= len;
+			vec4F_set(&axis, xr, xg, xb, 0);
+		}
+	}
+				
+	if (vec4F_dot(&axis, &axis) < .5f)
+	{
+		if (pParams->m_perceptual)
+			vec4F_set(&axis, .213f, .715f, .072f, pParams->m_has_alpha ? .715f : 0);
+		else
+			vec4F_set(&axis, 1.0f, 1.0f, 1.0f, pParams->m_has_alpha ? 1.0f : 0);
+		vec4F_normalize_in_place(&axis);
+	}
+			
+	bc7enc_vec4F minColor, maxColor;
+
+	float l = 1e+9f, h = -1e+9f;
+
+	for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+	{
+		bc7enc_vec4F color = vec4F_from_color(&pParams->m_pPixels[i]);
+
+		bc7enc_vec4F q = vec4F_sub(&color, &meanColorScaled);
+		float d = vec4F_dot(&q, &axis);
+
+		l = minimumf(l, d);
+		h = maximumf(h, d);
+	}
+
+	l *= (1.0f / 255.0f);
+	h *= (1.0f / 255.0f);
+
+	bc7enc_vec4F b0 = vec4F_mul(&axis, l);
+	bc7enc_vec4F b1 = vec4F_mul(&axis, h);
+	bc7enc_vec4F c0 = vec4F_add(&meanColor, &b0);
+	bc7enc_vec4F c1 = vec4F_add(&meanColor, &b1);
+	minColor = vec4F_saturate(&c0);
+	maxColor = vec4F_saturate(&c1);
+				
+	bc7enc_vec4F whiteVec;
+	vec4F_set_scalar(&whiteVec, 1.0f);
+	if (vec4F_dot(&minColor, &whiteVec) > vec4F_dot(&maxColor, &whiteVec))
+	{
+#if 1
+		std::swap(minColor.m_c[0], maxColor.m_c[0]);
+		std::swap(minColor.m_c[1], maxColor.m_c[1]);
+		std::swap(minColor.m_c[2], maxColor.m_c[2]);
+		std::swap(minColor.m_c[3], maxColor.m_c[3]);
+#elif 0
+		// Fails to compile correctly with MSVC 2019 (code generation bug)
+		std::swap(minColor, maxColor);
+#else
+		// Fails with MSVC 2019
+		bc7enc_vec4F temp = minColor;
+		minColor = maxColor;
+		maxColor = temp;
+#endif
+	}
+
+	// First find a solution using the block's PCA.
+	if (!find_optimal_solution(mode, minColor, maxColor, pParams, pResults))
+		return 0;
+	
+	for (uint32_t i = 0; i < pComp_params->m_least_squares_passes; i++)
+	{
+		// Now try to refine the solution using least squares by computing the optimal endpoints from the current selectors.
+		bc7enc_vec4F xl, xh;
+		vec4F_set_scalar(&xl, 0.0f);
+		vec4F_set_scalar(&xh, 0.0f);
+		if (pParams->m_has_alpha)
+			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+		else
+			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, pResults->m_pSelectors, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+		xl = vec4F_mul(&xl, (1.0f / 255.0f));
+		xh = vec4F_mul(&xh, (1.0f / 255.0f));
+				
+		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+			return 0;
+	}
+	
+	if ((!pParams->m_pForce_selectors) && (pComp_params->m_uber_level > 0))
+	{
+		// In uber level 1, try varying the selectors a little, somewhat like cluster fit would. First try incrementing the minimum selectors,
+		// then try decrementing the maximum selectors, then try both.
+		uint8_t selectors_temp[16], selectors_temp1[16];
+		memcpy(selectors_temp, pResults->m_pSelectors, pParams->m_num_pixels);
+
+		const int max_selector = pParams->m_num_selector_weights - 1;
+
+		uint32_t min_sel = 256;
+		uint32_t max_sel = 0;
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint32_t sel = selectors_temp[i];
+			min_sel = minimumu(min_sel, sel);
+			max_sel = maximumu(max_sel, sel);
+		}
+
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint32_t sel = selectors_temp[i];
+			if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1)))
+				sel++;
+			selectors_temp1[i] = (uint8_t)sel;
+		}
+
+		bc7enc_vec4F xl, xh;
+		vec4F_set_scalar(&xl, 0.0f);
+		vec4F_set_scalar(&xh, 0.0f);
+		if (pParams->m_has_alpha)
+			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+		else
+			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+		xl = vec4F_mul(&xl, (1.0f / 255.0f));
+		xh = vec4F_mul(&xh, (1.0f / 255.0f));
+				
+		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+			return 0;
+
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint32_t sel = selectors_temp[i];
+			if ((sel == max_sel) && (sel > 0))
+				sel--;
+			selectors_temp1[i] = (uint8_t)sel;
+		}
+
+		if (pParams->m_has_alpha)
+			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+		else
+			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+		xl = vec4F_mul(&xl, (1.0f / 255.0f));
+		xh = vec4F_mul(&xh, (1.0f / 255.0f));
+				
+		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+			return 0;
+
+		for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+		{
+			uint32_t sel = selectors_temp[i];
+			if ((sel == min_sel) && (sel < (pParams->m_num_selector_weights - 1)))
+				sel++;
+			else if ((sel == max_sel) && (sel > 0))
+				sel--;
+			selectors_temp1[i] = (uint8_t)sel;
+		}
+
+		if (pParams->m_has_alpha)
+			compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+		else
+			compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+		xl = vec4F_mul(&xl, (1.0f / 255.0f));
+		xh = vec4F_mul(&xh, (1.0f / 255.0f));
+
+		if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+			return 0;
+
+		// In uber levels 2+, try taking more advantage of endpoint extrapolation by scaling the selectors in one direction or another.
+		const uint32_t uber_err_thresh = (pParams->m_num_pixels * 56) >> 4;
+		if ((pComp_params->m_uber_level >= 2) && (pResults->m_best_overall_err > uber_err_thresh))
+		{
+			const int Q = (pComp_params->m_uber_level >= 4) ? (pComp_params->m_uber_level - 2) : 1;
+			for (int ly = -Q; ly <= 1; ly++)
+			{
+				for (int hy = max_selector - 1; hy <= (max_selector + Q); hy++)
+				{
+					if ((ly == 0) && (hy == max_selector))
+						continue;
+
+					for (uint32_t i = 0; i < pParams->m_num_pixels; i++)
+						selectors_temp1[i] = (uint8_t)clampf(floorf((float)max_selector * ((float)selectors_temp[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_selector);
+
+					//bc7enc_vec4F xl, xh;
+					vec4F_set_scalar(&xl, 0.0f);
+					vec4F_set_scalar(&xh, 0.0f);
+					if (pParams->m_has_alpha)
+						compute_least_squares_endpoints_rgba(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+					else
+						compute_least_squares_endpoints_rgb(pParams->m_num_pixels, selectors_temp1, pParams->m_pSelector_weightsx, &xl, &xh, pParams->m_pPixels);
+
+					xl = vec4F_mul(&xl, (1.0f / 255.0f));
+					xh = vec4F_mul(&xh, (1.0f / 255.0f));
+
+					if (!find_optimal_solution(mode, xl, xh, pParams, pResults))
+						return 0;
+				}
+			}
+		}
+	}
+	
+	if (!pParams->m_pForce_selectors)
+	{
+		// Try encoding the partition as a single color by using the optimal single colors tables to encode the block to its mean.
+		if (mode == 1)
+		{
+			color_cell_compressor_results avg_results = *pResults;
+			const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f);
+			uint64_t avg_err = pack_mode1_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp);
+			if (avg_err < pResults->m_best_overall_err)
+			{
+				*pResults = avg_results;
+				memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+				pResults->m_best_overall_err = avg_err;
+			}
+		}
+		else if ((pParams->m_astc_endpoint_range == 8) && (pParams->m_num_selector_weights == 8) && (!pParams->m_has_alpha))
+		{
+			color_cell_compressor_results avg_results = *pResults;
+			const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f);
+			uint64_t avg_err = pack_astc_4bit_3bit_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp);
+			if (avg_err < pResults->m_best_overall_err)
+			{
+				*pResults = avg_results;
+				memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+				pResults->m_best_overall_err = avg_err;
+			}
+		}
+		else if ((pParams->m_astc_endpoint_range == 7) && (pParams->m_num_selector_weights == 4) && (!pParams->m_has_alpha))
+		{
+			color_cell_compressor_results avg_results = *pResults;
+			const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f);
+			uint64_t avg_err = pack_astc_range7_2bit_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp);
+			if (avg_err < pResults->m_best_overall_err)
+			{
+				*pResults = avg_results;
+				memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+				pResults->m_best_overall_err = avg_err;
+			}
+		}
+		else if ((pParams->m_astc_endpoint_range == 8) && (pParams->m_num_selector_weights == 4) && (pParams->m_has_alpha))
+		{
+			color_cell_compressor_results avg_results = *pResults;
+			const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f), a = (int)(.5f + meanColor.m_c[3] * 255.0f);
+			uint64_t avg_err = pack_astc_4bit_2bit_to_one_color_rgba(pParams, &avg_results, r, g, b, a, pResults->m_pSelectors_temp);
+			if (avg_err < pResults->m_best_overall_err)
+			{
+				*pResults = avg_results;
+				memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+				pResults->m_best_overall_err = avg_err;
+			}
+		}
+		else if ((pParams->m_astc_endpoint_range == 13) && (pParams->m_num_selector_weights == 4) && (!pParams->m_has_alpha))
+		{
+			color_cell_compressor_results avg_results = *pResults;
+			const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f);
+			uint64_t avg_err = pack_astc_range13_2bit_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp);
+			if (avg_err < pResults->m_best_overall_err)
+			{
+				*pResults = avg_results;
+				memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+				pResults->m_best_overall_err = avg_err;
+			}
+		}
+		else if ((pParams->m_astc_endpoint_range == 11) && (pParams->m_num_selector_weights == 32) && (!pParams->m_has_alpha))
+		{
+			color_cell_compressor_results avg_results = *pResults;
+			const uint32_t r = (int)(.5f + meanColor.m_c[0] * 255.0f), g = (int)(.5f + meanColor.m_c[1] * 255.0f), b = (int)(.5f + meanColor.m_c[2] * 255.0f);
+			uint64_t avg_err = pack_astc_range11_5bit_to_one_color(pParams, &avg_results, r, g, b, pResults->m_pSelectors_temp);
+			if (avg_err < pResults->m_best_overall_err)
+			{
+				*pResults = avg_results;
+				memcpy(pResults->m_pSelectors, pResults->m_pSelectors_temp, sizeof(pResults->m_pSelectors[0]) * pParams->m_num_pixels);
+				pResults->m_best_overall_err = avg_err;
+			}
+		}
+	}
+
+#if BC7ENC_CHECK_OVERALL_ERROR
+	check_best_overall_error(pParams, pResults);
+#endif
+		
+	return pResults->m_best_overall_err;
+}
+
+uint64_t color_cell_compression_est_astc(
+	uint32_t num_weights, uint32_t num_comps, const uint32_t *pWeight_table,
+	uint32_t num_pixels, const color_quad_u8* pPixels, 
+	uint64_t best_err_so_far, const uint32_t weights[4])
+{
+	assert(num_comps == 3 || num_comps == 4);
+	assert(num_weights >= 1 && num_weights <= 32);
+	assert(pWeight_table[0] == 0 && pWeight_table[num_weights - 1] == 64);
+
+	// Find RGB bounds as an approximation of the block's principle axis
+	uint32_t lr = 255, lg = 255, lb = 255, la = 255;
+	uint32_t hr = 0, hg = 0, hb = 0, ha = 0;
+	if (num_comps == 4)
+	{
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			const color_quad_u8* pC = &pPixels[i];
+			if (pC->m_c[0] < lr) lr = pC->m_c[0];
+			if (pC->m_c[1] < lg) lg = pC->m_c[1];
+			if (pC->m_c[2] < lb) lb = pC->m_c[2];
+			if (pC->m_c[3] < la) la = pC->m_c[3];
+
+			if (pC->m_c[0] > hr) hr = pC->m_c[0];
+			if (pC->m_c[1] > hg) hg = pC->m_c[1];
+			if (pC->m_c[2] > hb) hb = pC->m_c[2];
+			if (pC->m_c[3] > ha) ha = pC->m_c[3];
+		}
+	}
+	else
+	{
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			const color_quad_u8* pC = &pPixels[i];
+			if (pC->m_c[0] < lr) lr = pC->m_c[0];
+			if (pC->m_c[1] < lg) lg = pC->m_c[1];
+			if (pC->m_c[2] < lb) lb = pC->m_c[2];
+
+			if (pC->m_c[0] > hr) hr = pC->m_c[0];
+			if (pC->m_c[1] > hg) hg = pC->m_c[1];
+			if (pC->m_c[2] > hb) hb = pC->m_c[2];
+		}
+		la = 255;
+		ha = 255;
+	}
+
+	color_quad_u8 lowColor, highColor;
+	color_quad_u8_set(&lowColor, lr, lg, lb, la);
+	color_quad_u8_set(&highColor, hr, hg, hb, ha);
+
+	// Place endpoints at bbox diagonals and compute interpolated colors 
+	color_quad_u8 weightedColors[32];
+
+	weightedColors[0] = lowColor;
+	weightedColors[num_weights - 1] = highColor;
+	for (uint32_t i = 1; i < (num_weights - 1); i++)
+	{
+		weightedColors[i].m_c[0] = (uint8_t)astc_interpolate(lowColor.m_c[0], highColor.m_c[0], pWeight_table[i]);
+		weightedColors[i].m_c[1] = (uint8_t)astc_interpolate(lowColor.m_c[1], highColor.m_c[1], pWeight_table[i]);
+		weightedColors[i].m_c[2] = (uint8_t)astc_interpolate(lowColor.m_c[2], highColor.m_c[2], pWeight_table[i]);
+		weightedColors[i].m_c[3] = (num_comps == 4) ? (uint8_t)astc_interpolate(lowColor.m_c[3], highColor.m_c[3], pWeight_table[i]) : 255;
+	}
+
+	// Compute dots and thresholds
+	const int ar = highColor.m_c[0] - lowColor.m_c[0];
+	const int ag = highColor.m_c[1] - lowColor.m_c[1];
+	const int ab = highColor.m_c[2] - lowColor.m_c[2];
+	const int aa = highColor.m_c[3] - lowColor.m_c[3];
+
+	int dots[32];
+	if (num_comps == 4)
+	{
+		for (uint32_t i = 0; i < num_weights; i++)
+			dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab + weightedColors[i].m_c[3] * aa;
+	}
+	else
+	{
+		assert(aa == 0);
+		for (uint32_t i = 0; i < num_weights; i++)
+			dots[i] = weightedColors[i].m_c[0] * ar + weightedColors[i].m_c[1] * ag + weightedColors[i].m_c[2] * ab;
+	}
+
+	int thresh[32 - 1];
+	for (uint32_t i = 0; i < (num_weights - 1); i++)
+		thresh[i] = (dots[i] + dots[i + 1] + 1) >> 1;
+
+	uint64_t total_err = 0;
+	if ((weights[0] | weights[1] | weights[2] | weights[3]) == 1)
+	{
+		if (num_comps == 4)
+		{
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				const color_quad_u8* pC = &pPixels[i];
+
+				int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2] + aa * pC->m_c[3];
+
+				// Find approximate selector
+				uint32_t s = 0;
+				for (int j = num_weights - 2; j >= 0; j--)
+				{
+					if (d >= thresh[j])
+					{
+						s = j + 1;
+						break;
+					}
+				}
+
+				// Compute error
+				const color_quad_u8* pE1 = &weightedColors[s];
+
+				int dr = (int)pE1->m_c[0] - (int)pC->m_c[0];
+				int dg = (int)pE1->m_c[1] - (int)pC->m_c[1];
+				int db = (int)pE1->m_c[2] - (int)pC->m_c[2];
+				int da = (int)pE1->m_c[3] - (int)pC->m_c[3];
+
+				total_err += (dr * dr) + (dg * dg) + (db * db) + (da * da);
+				if (total_err > best_err_so_far)
+					break;
+			}
+		}
+		else
+		{
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				const color_quad_u8* pC = &pPixels[i];
+
+				int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2];
+
+				// Find approximate selector
+				uint32_t s = 0;
+				for (int j = num_weights - 2; j >= 0; j--)
+				{
+					if (d >= thresh[j])
+					{
+						s = j + 1;
+						break;
+					}
+				}
+
+				// Compute error
+				const color_quad_u8* pE1 = &weightedColors[s];
+
+				int dr = (int)pE1->m_c[0] - (int)pC->m_c[0];
+				int dg = (int)pE1->m_c[1] - (int)pC->m_c[1];
+				int db = (int)pE1->m_c[2] - (int)pC->m_c[2];
+
+				total_err += (dr * dr) + (dg * dg) + (db * db);
+				if (total_err > best_err_so_far)
+					break;
+			}
+		}
+	}
+	else
+	{
+		if (num_comps == 4)
+		{
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				const color_quad_u8* pC = &pPixels[i];
+
+				int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2] + aa * pC->m_c[3];
+
+				// Find approximate selector
+				uint32_t s = 0;
+				for (int j = num_weights - 2; j >= 0; j--)
+				{
+					if (d >= thresh[j])
+					{
+						s = j + 1;
+						break;
+					}
+				}
+
+				// Compute error
+				const color_quad_u8* pE1 = &weightedColors[s];
+
+				int dr = (int)pE1->m_c[0] - (int)pC->m_c[0];
+				int dg = (int)pE1->m_c[1] - (int)pC->m_c[1];
+				int db = (int)pE1->m_c[2] - (int)pC->m_c[2];
+				int da = (int)pE1->m_c[3] - (int)pC->m_c[3];
+
+				total_err += weights[0] * (dr * dr) + weights[1] * (dg * dg) + weights[2] * (db * db) + weights[3] * (da * da);
+				if (total_err > best_err_so_far)
+					break;
+			}
+		}
+		else
+		{
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				const color_quad_u8* pC = &pPixels[i];
+
+				int d = ar * pC->m_c[0] + ag * pC->m_c[1] + ab * pC->m_c[2];
+
+				// Find approximate selector
+				uint32_t s = 0;
+				for (int j = num_weights - 2; j >= 0; j--)
+				{
+					if (d >= thresh[j])
+					{
+						s = j + 1;
+						break;
+					}
+				}
+
+				// Compute error
+				const color_quad_u8* pE1 = &weightedColors[s];
+
+				int dr = (int)pE1->m_c[0] - (int)pC->m_c[0];
+				int dg = (int)pE1->m_c[1] - (int)pC->m_c[1];
+				int db = (int)pE1->m_c[2] - (int)pC->m_c[2];
+
+				total_err += weights[0] * (dr * dr) + weights[1] * (dg * dg) + weights[2] * (db * db);
+				if (total_err > best_err_so_far)
+					break;
+			}
+		}
+	}
+
+	return total_err;
+}
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_bc7enc.h b/thirdparty/basis_universal/encoder/basisu_bc7enc.h
new file mode 100644
index 0000000000..23469912e2
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_bc7enc.h
@@ -0,0 +1,131 @@
+// File: basisu_bc7enc.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_enc.h"
+#include "../transcoder/basisu_transcoder_uastc.h"
+
+namespace basisu
+{
+
+// Upper bounds for bc7enc_compress_block_params::m_max_partitions_mode1 and ::m_uber_level respectively.
+#define BC7ENC_MAX_PARTITIONS1 (64)
+#define BC7ENC_MAX_UBER_LEVEL (4)
+
+	// Byte-sized boolean used by the parameter structs below; use BC7ENC_TRUE/BC7ENC_FALSE.
+	typedef uint8_t bc7enc_bool;
+
+#define BC7ENC_TRUE (1)
+#define BC7ENC_FALSE (0)
+		
+	// Four-component float vector.
+	typedef struct { float m_c[4]; } bc7enc_vec4F;
+
+	// Float weight tables, defined elsewhere. Each is sized (number of weight levels) * 4.
+	// NOTE(review): presumably one 4-float entry per weight level, suitable for m_pSelector_weightsx - confirm at the definitions.
+	extern const float g_bc7_weights1x[2 * 4];
+	extern const float g_bc7_weights2x[4 * 4];
+	extern const float g_bc7_weights3x[8 * 4];
+	extern const float g_bc7_weights4x[16 * 4];
+	extern const float g_astc_weights4x[16 * 4];
+	extern const float g_astc_weights5x[32 * 4];
+	extern const float g_astc_weights_3levelsx[3 * 4];
+			
+	extern basist::astc_quant_bin g_astc_sorted_order_unquant[basist::BC7ENC_TOTAL_ASTC_RANGES][256]; // [sorted unquantized order]
+	
+	// Inputs to color_cell_compression(): the pixels of one cell plus the description of the encoding being tried.
+	struct color_cell_compressor_params
+	{
+		// The pixels to encode.
+		uint32_t m_num_pixels;
+		const basist::color_quad_u8* m_pPixels;
+
+		// Number of selector (weight) levels and the integer weight table.
+		uint32_t m_num_selector_weights;
+		const uint32_t* m_pSelector_weights;
+
+		// Float weight table handed to the least squares endpoint solvers.
+		const bc7enc_vec4F* m_pSelector_weightsx;
+		uint32_t m_comp_bits;
+
+		// When non-null, color_cell_compression() does not try its single-color (mean) encodings.
+		// NOTE(review): presumably the selectors are then taken from this array instead of being searched - confirm in the implementation.
+		const uint8_t *m_pForce_selectors;
+
+		// Non-zero m_astc_endpoint_range enables ASTC mode. m_comp_bits and m_has_pbits are always false. We only support 2, 3, or 4 bit weight encodings.
+		uint32_t m_astc_endpoint_range;
+
+		// Per-channel error weights.
+		uint32_t m_weights[4];
+		bc7enc_bool m_has_alpha;
+		bc7enc_bool m_has_pbits;
+		bc7enc_bool m_endpoints_share_pbit;
+		bc7enc_bool m_perceptual;
+	};
+
+	// Output of color_cell_compression(): the best solution found and its error.
+	struct color_cell_compressor_results
+	{
+		uint64_t m_best_overall_err;
+		basist::color_quad_u8 m_low_endpoint;
+		basist::color_quad_u8 m_high_endpoint;
+		uint32_t m_pbits[2];
+
+		// Caller-provided buffers with room for one selector per pixel. m_pSelectors receives the selectors of the
+		// best solution; m_pSelectors_temp is scratch space.
+		uint8_t* m_pSelectors;
+		uint8_t* m_pSelectors_temp;
+
+		// Encoded ASTC indices, if ASTC mode is enabled
+		basist::color_quad_u8 m_astc_low_endpoint;
+		basist::color_quad_u8 m_astc_high_endpoint;
+	};
+
+	// Quality/speed settings shared by every cell of a block.
+	struct bc7enc_compress_block_params
+	{
+		// m_max_partitions_mode1 may range from 0 (disables mode 1) to BC7ENC_MAX_PARTITIONS1. The higher this value, the slower the compressor, but the higher the quality.
+		uint32_t m_max_partitions_mode1;
+
+		// Relative RGBA or YCbCrA weights.
+		uint32_t m_weights[4];
+
+		// m_uber_level may range from 0 to BC7ENC_MAX_UBER_LEVEL. The higher this value, the slower the compressor, but the higher the quality.
+		uint32_t m_uber_level;
+
+		// If m_perceptual is true, colorspace error is computed in YCbCr space, otherwise RGB.
+		bc7enc_bool m_perceptual;
+
+		uint32_t m_least_squares_passes;
+	};
+
+	// Finds endpoints and selectors for the cell described by pParams, storing the best solution in pResults.
+	// Returns pResults->m_best_overall_err.
+	uint64_t color_cell_compression(uint32_t mode, const color_cell_compressor_params* pParams, color_cell_compressor_results* pResults, const bc7enc_compress_block_params* pComp_params);
+		
+	// Quick error estimate for encoding pPixels with the given ASTC weight table, using the pixels' per-channel
+	// min/max as endpoints. Stops accumulating once the error exceeds best_err_so_far.
+	uint64_t color_cell_compression_est_astc(
+		uint32_t num_weights, uint32_t num_comps, const uint32_t* pWeight_table,
+		uint32_t num_pixels, const basist::color_quad_u8* pPixels,
+		uint64_t best_err_so_far, const uint32_t weights[4]);
+		
+	// Selects non-perceptual error with all four channel weights set to 1. Only touches m_perceptual and m_weights.
+	inline void bc7enc_compress_block_params_init_linear_weights(bc7enc_compress_block_params* p)
+	{
+		p->m_perceptual = BC7ENC_FALSE;
+		p->m_weights[0] = 1;
+		p->m_weights[1] = 1;
+		p->m_weights[2] = 1;
+		p->m_weights[3] = 1;
+	}
+
+	// Selects perceptual error with channel weights 128, 64, 16, 32. Only touches m_perceptual and m_weights.
+	inline void bc7enc_compress_block_params_init_perceptual_weights(bc7enc_compress_block_params* p)
+	{
+		p->m_perceptual = BC7ENC_TRUE;
+		p->m_weights[0] = 128;
+		p->m_weights[1] = 64;
+		p->m_weights[2] = 16;
+		p->m_weights[3] = 32;
+	}
+
+	// Sets every field of *p to its default: maximum mode 1 partitions, one least squares pass, uber level 0,
+	// and perceptual weights.
+	inline void bc7enc_compress_block_params_init(bc7enc_compress_block_params* p)
+	{
+		p->m_max_partitions_mode1 = BC7ENC_MAX_PARTITIONS1;
+		p->m_least_squares_passes = 1;
+		p->m_uber_level = 0;
+		bc7enc_compress_block_params_init_perceptual_weights(p);
+	}
+
+	// bc7enc_compress_block_init() MUST be called before calling bc7enc_compress_block() (or you'll get artifacts).
+	void bc7enc_compress_block_init();
+				
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_comp.cpp b/thirdparty/basis_universal/encoder/basisu_comp.cpp
new file mode 100644
index 0000000000..dc4ae11539
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_comp.cpp
@@ -0,0 +1,2113 @@
+// basisu_comp.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_comp.h"
+#include "basisu_enc.h"
+#include <unordered_set>
+#include <atomic>
+
+// basisu_transcoder.cpp is where basisu_miniz lives now, we just need the declarations here.
+#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+#include "basisu_miniz.h"
+
+#if !BASISD_SUPPORT_KTX2
+#error BASISD_SUPPORT_KTX2 must be enabled (set to 1).
+#endif
+
+#if BASISD_SUPPORT_KTX2_ZSTD
+#include "../zstd/zstd.h"
+#endif
+
+// Set to 1 to disable the mipPadding alignment workaround (which only seems to be needed when no key-values are written at all)
+#define BASISU_DISABLE_KTX2_ALIGNMENT_WORKAROUND (0)
+
+// Set to 1 to disable writing all KTX2 key values, triggering the validator bug.
+#define BASISU_DISABLE_KTX2_KEY_VALUES (0)
+
+using namespace buminiz;
+
+// Compile-time switches for this file.
+// BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN: when non-zero, basis_compressor::generate_mipmaps() resizes with
+// stbir_resize_uint8_generic() (requires stb_image_resize); when zero it uses image_resample().
+// NOTE(review): the DEBUG_* switches are presumably consumed further down in this file - confirm before changing them.
+#define BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN 0
+#define DEBUG_CROP_TEXTURE_TO_64x64 (0)
+#define DEBUG_RESIZE_TEXTURE (0)
+#define DEBUG_EXTRACT_SINGLE_BLOCK (0)
+
+namespace basisu
+{
+   // Constructs an idle compressor: output size/statistics are zeroed and the alpha/global-palette flags are cleared.
+   // Emits a debug trace line; no parameters are set here (see init()).
+   basis_compressor::basis_compressor() :
+		m_basis_file_size(0),
+		m_basis_bits_per_texel(0.0f),
+		m_total_blocks(0),
+		m_auto_global_sel_pal(false),
+		m_any_source_image_has_alpha(false)
+	{
+		debug_printf("basis_compressor::basis_compressor\n");
+	}
+
+	// Stores a copy of params for later use by process(). When m_debug is set, every parameter is dumped via
+	// debug_printf() (value followed by its was_changed() flag for the wrapped parameter types).
+	// Returns false (after asserting) if source images are to be read from disk but no filenames were given;
+	// otherwise returns true.
+	bool basis_compressor::init(const basis_compressor_params &params)
+	{
+		debug_printf("basis_compressor::init\n");
+
+		m_params = params;
+
+		if (m_params.m_debug)
+		{
+			debug_printf("basis_compressor::init:\n");
+
+			// Helpers that print "<name>: <value> <was_changed>" for one m_params member.
+#define PRINT_BOOL_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast<int>(m_params.v), m_params.v.was_changed());
+#define PRINT_INT_VALUE(v) debug_printf("%s: %i %u\n", BASISU_STRINGIZE2(v), static_cast<int>(m_params.v), m_params.v.was_changed());
+#define PRINT_UINT_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast<uint32_t>(m_params.v), m_params.v.was_changed());
+#define PRINT_FLOAT_VALUE(v) debug_printf("%s: %f %u\n", BASISU_STRINGIZE2(v), static_cast<float>(m_params.v), m_params.v.was_changed());
+
+			debug_printf("Has global selector codebook: %i\n", m_params.m_pSel_codebook != nullptr);
+
+			// All four counts come from size(), so they share the same unsigned conversion.
+			debug_printf("Source images: %u, source filenames: %u, source alpha filenames: %u, Source mipmap images: %u\n",
+				m_params.m_source_images.size(), m_params.m_source_filenames.size(), m_params.m_source_alpha_filenames.size(), m_params.m_source_mipmap_images.size());
+
+			if (m_params.m_source_mipmap_images.size())
+			{
+				debug_printf("m_source_mipmap_images array sizes:\n");
+				for (uint32_t i = 0; i < m_params.m_source_mipmap_images.size(); i++)
+					debug_printf("%u ", m_params.m_source_mipmap_images[i].size());
+				debug_printf("\n");
+			}
+
+			PRINT_BOOL_VALUE(m_uastc);
+			PRINT_BOOL_VALUE(m_y_flip);
+			PRINT_BOOL_VALUE(m_debug);
+			PRINT_BOOL_VALUE(m_validate);
+			PRINT_BOOL_VALUE(m_debug_images);
+			PRINT_BOOL_VALUE(m_global_sel_pal);
+			PRINT_BOOL_VALUE(m_auto_global_sel_pal);
+			PRINT_INT_VALUE(m_compression_level);
+			PRINT_BOOL_VALUE(m_no_hybrid_sel_cb);
+			PRINT_BOOL_VALUE(m_perceptual);
+			PRINT_BOOL_VALUE(m_no_endpoint_rdo);
+			PRINT_BOOL_VALUE(m_no_selector_rdo);
+			PRINT_BOOL_VALUE(m_read_source_images);
+			PRINT_BOOL_VALUE(m_write_output_basis_files);
+			PRINT_BOOL_VALUE(m_compute_stats);
+			PRINT_BOOL_VALUE(m_check_for_alpha);
+			PRINT_BOOL_VALUE(m_force_alpha);
+			debug_printf("swizzle: %d,%d,%d,%d\n",
+				m_params.m_swizzle[0],
+				m_params.m_swizzle[1],
+				m_params.m_swizzle[2],
+				m_params.m_swizzle[3]);
+			PRINT_BOOL_VALUE(m_renormalize);
+			PRINT_BOOL_VALUE(m_multithreading);
+			PRINT_BOOL_VALUE(m_disable_hierarchical_endpoint_codebooks);
+			
+			PRINT_FLOAT_VALUE(m_hybrid_sel_cb_quality_thresh);
+			
+			PRINT_INT_VALUE(m_global_pal_bits);
+			PRINT_INT_VALUE(m_global_mod_bits);
+
+			PRINT_FLOAT_VALUE(m_endpoint_rdo_thresh);
+			PRINT_FLOAT_VALUE(m_selector_rdo_thresh);
+			
+			PRINT_BOOL_VALUE(m_mip_gen);
+			PRINT_BOOL_VALUE(m_mip_renormalize);
+			PRINT_BOOL_VALUE(m_mip_wrapping);
+			PRINT_BOOL_VALUE(m_mip_fast);
+			PRINT_BOOL_VALUE(m_mip_srgb);
+			// m_mip_premultiplied is a flag (it is tested as a boolean when choosing resize flags), so print it as one.
+			PRINT_BOOL_VALUE(m_mip_premultiplied);
+			PRINT_FLOAT_VALUE(m_mip_scale);
+			PRINT_INT_VALUE(m_mip_smallest_dimension);
+			debug_printf("m_mip_filter: %s\n", m_params.m_mip_filter.c_str());
+
+			debug_printf("m_max_endpoint_clusters: %u\n", m_params.m_max_endpoint_clusters);
+			debug_printf("m_max_selector_clusters: %u\n", m_params.m_max_selector_clusters);
+			debug_printf("m_quality_level: %i\n", m_params.m_quality_level);
+
+			debug_printf("m_tex_type: %u\n", m_params.m_tex_type);
+			debug_printf("m_userdata0: 0x%X, m_userdata1: 0x%X\n", m_params.m_userdata0, m_params.m_userdata1);
+			// Frames per second is derived from microseconds per frame; 0 is printed when no frame time is set.
+			debug_printf("m_us_per_frame: %i (%f fps)\n", m_params.m_us_per_frame, m_params.m_us_per_frame ? 1.0f / (m_params.m_us_per_frame / 1000000.0f) : 0);
+			debug_printf("m_pack_uastc_flags: 0x%X\n", m_params.m_pack_uastc_flags);
+			
+			PRINT_BOOL_VALUE(m_rdo_uastc);
+			PRINT_FLOAT_VALUE(m_rdo_uastc_quality_scalar);
+			PRINT_INT_VALUE(m_rdo_uastc_dict_size);
+			PRINT_FLOAT_VALUE(m_rdo_uastc_max_allowed_rms_increase_ratio);
+			PRINT_FLOAT_VALUE(m_rdo_uastc_skip_block_rms_thresh);
+			PRINT_FLOAT_VALUE(m_rdo_uastc_max_smooth_block_error_scale);
+			PRINT_FLOAT_VALUE(m_rdo_uastc_smooth_block_max_std_dev);
+			PRINT_BOOL_VALUE(m_rdo_uastc_favor_simpler_modes_in_rdo_mode);
+			PRINT_BOOL_VALUE(m_rdo_uastc_multithreading);
+
+			PRINT_INT_VALUE(m_resample_width);
+			PRINT_INT_VALUE(m_resample_height);
+			PRINT_FLOAT_VALUE(m_resample_factor);
+			debug_printf("Has global codebooks: %u\n", m_params.m_pGlobal_codebooks ? 1 : 0);
+			if (m_params.m_pGlobal_codebooks)
+			{
+				debug_printf("Global codebook endpoints: %u selectors: %u\n", m_params.m_pGlobal_codebooks->get_endpoints().size(), m_params.m_pGlobal_codebooks->get_selectors().size());
+			}
+
+			PRINT_BOOL_VALUE(m_create_ktx2_file);
+
+			debug_printf("KTX2 UASTC supercompression: %u\n", m_params.m_ktx2_uastc_supercompression);
+			debug_printf("KTX2 Zstd supercompression level: %i\n", (int)m_params.m_ktx2_zstd_supercompression_level);
+			debug_printf("KTX2 sRGB transfer func: %u\n", (int)m_params.m_ktx2_srgb_transfer_func);
+			debug_printf("Total KTX2 key values: %u\n", m_params.m_ktx2_key_values.size());
+			for (uint32_t i = 0; i < m_params.m_ktx2_key_values.size(); i++)
+			{
+				debug_printf("Key: \"%s\"\n", m_params.m_ktx2_key_values[i].m_key.data());
+				debug_printf("Value size: %u\n", m_params.m_ktx2_key_values[i].m_value.size());
+			}
+						
+#undef PRINT_BOOL_VALUE
+#undef PRINT_INT_VALUE
+#undef PRINT_UINT_VALUE
+#undef PRINT_FLOAT_VALUE
+		}
+
+		// Reading from disk was requested but there is nothing to read: this is a caller error.
+		if ((m_params.m_read_source_images) && (!m_params.m_source_filenames.size()))
+		{
+			assert(0);
+			return false;
+		}
+
+		return true;
+	}
+		
+	basis_compressor::error_code basis_compressor::process()
+	{
+		debug_printf("basis_compressor::process\n");
+
+		if (!read_source_images())
+			return cECFailedReadingSourceImages;
+
+		if (!validate_texture_type_constraints())
+			return cECFailedValidating;
+
+		if (m_params.m_create_ktx2_file)
+		{
+			if (!validate_ktx2_constraints())
+				return cECFailedValidating;
+		}
+
+		if (!extract_source_blocks())
+			return cECFailedFrontEnd;
+
+		if (m_params.m_uastc)
+		{
+			error_code ec = encode_slices_to_uastc();
+			if (ec != cECSuccess)
+				return ec;
+		}
+		else
+		{
+			if (!process_frontend())
+				return cECFailedFrontEnd;
+
+			if (!extract_frontend_texture_data())
+				return cECFailedFontendExtract;
+
+			if (!process_backend())
+				return cECFailedBackend;
+		}
+
+		if (!create_basis_file_and_transcode())
+			return cECFailedCreateBasisFile;
+		
+		if (m_params.m_create_ktx2_file)
+		{
+			if (!create_ktx2_file())
+				return cECFailedCreateKTX2File;
+		}
+
+		if (!write_output_files_and_compute_stats())
+			return cECFailedWritingOutput;
+
+		return cECSuccess;
+	}
+
+	// Encodes every slice image to UASTC 4x4 blocks and fills m_uastc_backend_output with the per-slice block
+	// data and CRC-16s.
+	// Blocks are encoded in batches of 256 submitted to m_params.m_pJob_pool; under __EMSCRIPTEN__ the same code
+	// runs inline on the calling thread instead. When m_rdo_uastc is set, each slice is then post-processed with
+	// uastc_rdo().
+	// Returns cECFailedUASTCRDOPostProcess if the RDO pass fails, otherwise cECSuccess.
+	basis_compressor::error_code basis_compressor::encode_slices_to_uastc()
+	{
+		debug_printf("basis_compressor::encode_slices_to_uastc\n");
+
+		m_uastc_slice_textures.resize(m_slice_descs.size());
+		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+			m_uastc_slice_textures[slice_index].init(texture_format::cUASTC4x4, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height);
+
+		m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC4x4;
+		m_uastc_backend_output.m_etc1s = false;
+		m_uastc_backend_output.m_slice_desc = m_slice_descs;
+		m_uastc_backend_output.m_slice_image_data.resize(m_slice_descs.size());
+		m_uastc_backend_output.m_slice_image_crcs.resize(m_slice_descs.size());
+				
+		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+		{
+			gpu_image& tex = m_uastc_slice_textures[slice_index];
+			basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index];
+			(void)slice_desc;
+
+			const uint32_t num_blocks_x = tex.get_blocks_x();
+			const uint32_t num_blocks_y = tex.get_blocks_y();
+			const uint32_t total_blocks = tex.get_total_blocks();
+			const image& source_image = m_slice_images[slice_index];
+			
+			// Shared progress counter, incremented once per encoded block by whichever thread encoded it.
+			std::atomic<uint32_t> total_blocks_processed;
+			total_blocks_processed = 0;
+
+			// Number of blocks handled by one job.
+			const uint32_t N = 256;
+			for (uint32_t block_index_iter = 0; block_index_iter < total_blocks; block_index_iter += N)
+			{
+				const uint32_t first_index = block_index_iter;
+				const uint32_t last_index = minimum<uint32_t>(total_blocks, block_index_iter + N);
+
+				// FIXME: This sucks, but we're having a stack size related problem with std::function with emscripten.
+#ifndef __EMSCRIPTEN__
+				m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, &tex, &total_blocks_processed]
+					{
+#endif
+						BASISU_NOTE_UNUSED(num_blocks_y);
+						
+						uint32_t uastc_flags = m_params.m_pack_uastc_flags;
+						if ((m_params.m_rdo_uastc) && (m_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode))
+							uastc_flags |= cPackUASTCFavorSimplerModes;
+
+						for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+						{
+							// Blocks are numbered in raster order.
+							const uint32_t block_x = block_index % num_blocks_x;
+							const uint32_t block_y = block_index / num_blocks_x;
+
+							color_rgba block_pixels[4][4];
+
+							source_image.extract_block_clamped((color_rgba*)block_pixels, block_x * 4, block_y * 4, 4, 4);
+
+							basist::uastc_block& dest_block = *(basist::uastc_block*)tex.get_block_ptr(block_x, block_y);
+
+							encode_uastc(&block_pixels[0][0].r, dest_block, uastc_flags);
+
+							// Use the value returned by the atomic increment itself. Incrementing and then re-reading the
+							// counter are two separate operations, so another thread could bump it in between and a
+							// progress message could be skipped or printed twice.
+							const uint32_t val = ++total_blocks_processed;
+							if ((val & 16383) == 16383)
+							{
+								debug_printf("basis_compressor::encode_slices_to_uastc: %3.1f%% done\n", static_cast<float>(val) * 100.0f / total_blocks);
+							}
+
+						}
+
+#ifndef __EMSCRIPTEN__
+					});
+#endif
+
+			} // block_index_iter
+
+#ifndef __EMSCRIPTEN__
+			// All blocks of this slice must be finished before RDO and before the data is copied out.
+			m_params.m_pJob_pool->wait_for_all();
+#endif
+
+			if (m_params.m_rdo_uastc)
+			{
+				uastc_rdo_params rdo_params;
+				rdo_params.m_lambda = m_params.m_rdo_uastc_quality_scalar;
+				rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_max_allowed_rms_increase_ratio;
+				rdo_params.m_skip_block_rms_thresh = m_params.m_rdo_uastc_skip_block_rms_thresh;
+				rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_dict_size;
+				rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_max_smooth_block_error_scale;
+				rdo_params.m_max_smooth_block_std_dev = m_params.m_rdo_uastc_smooth_block_max_std_dev;
+								
+				// The job pool is only handed to the RDO pass when RDO multithreading is enabled; the final argument
+				// is then min(4, pool thread count), otherwise 0.
+				bool status = uastc_rdo(tex.get_total_blocks(), (basist::uastc_block*)tex.get_ptr(),
+					(const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_rdo_uastc_multithreading ? m_params.m_pJob_pool : nullptr,
+					(m_params.m_rdo_uastc_multithreading && m_params.m_pJob_pool) ? basisu::minimum<uint32_t>(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0);
+				if (!status)
+				{
+					return cECFailedUASTCRDOPostProcess;
+				}
+			}
+
+			// Publish the encoded blocks and their CRC for this slice.
+			m_uastc_backend_output.m_slice_image_data[slice_index].resize(tex.get_size_in_bytes());
+			memcpy(&m_uastc_backend_output.m_slice_image_data[slice_index][0], tex.get_ptr(), tex.get_size_in_bytes());
+			
+			m_uastc_backend_output.m_slice_image_crcs[slice_index] = basist::crc16(tex.get_ptr(), tex.get_size_in_bytes(), 0);
+						
+		} // slice_index
+				
+		return cECSuccess;
+	}
+
+	// Generates the mipmap levels below img and appends them, largest first, to mips.
+	// Levels are produced until the larger dimension no longer exceeds m_mip_smallest_dimension; each level is the
+	// base size shifted right by the level number, clamped to at least 1 texel per side.
+	// has_alpha selects whether 4 or 3 components are resampled.
+	// Returns false if a resample fails, otherwise true.
+	// NOTE(review): in fast mode levels above 1 are resampled from mips[level - 1]; this presumably relies on the
+	// caller having placed the base image at mips[0] before the call - confirm against the caller.
+	bool basis_compressor::generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha)
+	{
+		debug_printf("basis_compressor::generate_mipmaps\n");
+
+		interval_timer tm;
+		tm.start();
+
+		const uint32_t base_width = img.get_width();
+		const uint32_t base_height = img.get_height();
+
+		// Count the levels (including the base) by halving until the size limit is reached.
+		const uint32_t smallest_dimension = (uint32_t)m_params.m_mip_smallest_dimension;
+		uint32_t total_levels = 1;
+		for (uint32_t cur_w = base_width, cur_h = base_height; maximum<uint32_t>(cur_w, cur_h) > smallest_dimension; total_levels++)
+		{
+			cur_w = maximum(cur_w >> 1U, 1U);
+			cur_h = maximum(cur_h >> 1U, 1U);
+		}
+
+#if BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN
+		// Requires stb_image_resize
+		// Map the filter name onto the stb filter enum; unknown names keep the default.
+		stbir_filter filter = STBIR_FILTER_DEFAULT;
+		if (m_params.m_mip_filter == "box")
+			filter = STBIR_FILTER_BOX;
+		else if (m_params.m_mip_filter == "triangle")
+			filter = STBIR_FILTER_TRIANGLE;
+		else if (m_params.m_mip_filter == "cubic")
+			filter = STBIR_FILTER_CUBICBSPLINE;
+		else if (m_params.m_mip_filter == "catmull")
+			filter = STBIR_FILTER_CATMULLROM;
+		else if (m_params.m_mip_filter == "mitchell")
+			filter = STBIR_FILTER_MITCHELL;
+
+		for (uint32_t level = 1; level < total_levels; level++)
+		{
+			image &dst_img = *enlarge_vector(mips, 1);
+			dst_img.resize(maximum<uint32_t>(1, base_width >> level), maximum<uint32_t>(1, base_height >> level));
+
+			// Every level is resized directly from the full-size source image.
+			const int resize_ok = stbir_resize_uint8_generic(
+				(const uint8_t *)img.get_ptr(), img.get_width(), img.get_height(), img.get_pitch() * sizeof(color_rgba),
+				(uint8_t *)dst_img.get_ptr(), dst_img.get_width(), dst_img.get_height(), dst_img.get_pitch() * sizeof(color_rgba),
+				has_alpha ? 4 : 3, has_alpha ? 3 : STBIR_ALPHA_CHANNEL_NONE, m_params.m_mip_premultiplied ? STBIR_FLAG_ALPHA_PREMULTIPLIED : 0,
+				m_params.m_mip_wrapping ? STBIR_EDGE_WRAP : STBIR_EDGE_CLAMP, filter, m_params.m_mip_srgb ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR,
+				nullptr);
+
+			if (resize_ok == 0)
+			{
+				error_printf("basis_compressor::generate_mipmaps: stbir_resize_uint8_generic() failed!\n");
+				return false;
+			}
+
+			if (m_params.m_mip_renormalize)
+				dst_img.renormalize_normal_map();
+		}
+#else
+		for (uint32_t level = 1; level < total_levels; level++)
+		{
+			// Append the destination first; the source pointer is taken afterwards because growing the vector
+			// may move its elements.
+			image& dst_img = *enlarge_vector(mips, 1);
+			dst_img.resize(maximum<uint32_t>(1, base_width >> level), maximum<uint32_t>(1, base_height >> level));
+
+			// Normally each level is resampled from the full-size image; fast mode switches to an image already
+			// stored in mips for levels above 1.
+			const image* pSrc_img = ((m_params.m_mip_fast) && (level > 1)) ? &mips[level - 1] : &img;
+
+			if (!image_resample(*pSrc_img, dst_img, m_params.m_mip_srgb, m_params.m_mip_filter.c_str(), m_params.m_mip_scale, m_params.m_mip_wrapping, 0, has_alpha ? 4 : 3))
+			{
+				error_printf("basis_compressor::generate_mipmaps: image_resample() failed!\n");
+				return false;
+			}
+
+			if (m_params.m_mip_renormalize)
+				dst_img.renormalize_normal_map();
+		}
+#endif
+
+		if (m_params.m_debug)
+			debug_printf("Total mipmap generation time: %f secs\n", tm.get_elapsed_secs());
+
+		return true;
+	}
+
+	// Gathers every source image (loaded from disk or copied from m_params.m_source_images), applies the
+	// optional per-image preprocessing (alpha-from-file, renormalize, swizzle, Y flip, resampling), then
+	// expands each image into its slices: one per mip level, and for ETC1S with alpha a separate RGB slice
+	// followed by an alpha slice per mip level.
+	//
+	// On success m_stats, m_slice_images and m_slice_descs hold one entry per slice, m_total_blocks is the
+	// total number of 4x4 blocks over all slices, and m_any_source_image_has_alpha is set.
+	//
+	// Returns false if there are no source images, an image fails to load or resample, an image has a
+	// zero or too-large dimension, the user-supplied mipmap chains don't match the source images, or the
+	// resulting slice list fails the sanity checks at the end of the function.
+	bool basis_compressor::read_source_images()
+	{
+		debug_printf("basis_compressor::read_source_images\n");
+
+		const uint32_t total_source_files = m_params.m_read_source_images ? (uint32_t)m_params.m_source_filenames.size() : (uint32_t)m_params.m_source_images.size();
+		if (!total_source_files)
+			return false;
+
+		// Start from a clean slate: this function owns these per-slice arrays.
+		m_stats.resize(0);
+		m_slice_descs.resize(0);
+		m_slice_images.resize(0);
+
+		m_total_blocks = 0;
+		uint32_t total_macroblocks = 0;
+
+		m_any_source_image_has_alpha = false;
+
+		basisu::vector<image> source_images;
+		basisu::vector<std::string> source_filenames;
+
+		// First load all source images, and determine if any have an alpha channel.
+		for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++)
+		{
+			// Stays empty when the images were supplied in memory rather than read from files.
+			const char *pSource_filename = "";
+
+			image file_image;
+
+			if (m_params.m_read_source_images)
+			{
+				pSource_filename = m_params.m_source_filenames[source_file_index].c_str();
+
+				// Load the source image
+				if (!load_image(pSource_filename, file_image))
+				{
+					error_printf("Failed reading source image: %s\n", pSource_filename);
+					return false;
+				}
+
+				printf("Read source image \"%s\", %ux%u\n", pSource_filename, file_image.get_width(), file_image.get_height());
+
+				// Optionally load another image and put a grayscale version of it into the alpha channel.
+				if ((source_file_index < m_params.m_source_alpha_filenames.size()) && (m_params.m_source_alpha_filenames[source_file_index].size()))
+				{
+					const char *pSource_alpha_image = m_params.m_source_alpha_filenames[source_file_index].c_str();
+
+					image alpha_data;
+
+					if (!load_image(pSource_alpha_image, alpha_data))
+					{
+						error_printf("Failed reading source image: %s\n", pSource_alpha_image);
+						return false;
+					}
+
+					printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height());
+
+					// Make the alpha image the same size as the color image so the per-pixel loop below stays in bounds.
+					alpha_data.crop(file_image.get_width(), file_image.get_height());
+
+					for (uint32_t y = 0; y < file_image.get_height(); y++)
+						for (uint32_t x = 0; x < file_image.get_width(); x++)
+							file_image(x, y).a = (uint8_t)alpha_data(x, y).get_709_luma();
+				}
+			}
+			else
+			{
+				file_image = m_params.m_source_images[source_file_index];
+			}
+
+			if (m_params.m_renormalize)
+				file_image.renormalize_normal_map();
+
+			// Apply the channel swizzle only when it differs from the identity mapping (0,1,2,3).
+			bool alpha_swizzled = false;
+			if (m_params.m_swizzle[0] != 0 ||
+				m_params.m_swizzle[1] != 1 ||
+				m_params.m_swizzle[2] != 2 ||
+				m_params.m_swizzle[3] != 3)
+			{
+				// Used for XY normal maps in RG - puts X in color, Y in alpha
+				for (uint32_t y = 0; y < file_image.get_height(); y++)
+					for (uint32_t x = 0; x < file_image.get_width(); x++)
+					{
+						const color_rgba &c = file_image(x, y);
+						file_image(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]);
+					}
+				alpha_swizzled = m_params.m_swizzle[3] != 3;
+			}
+
+			// Decide whether this image counts as having alpha. When alpha checking is disabled
+			// (and alpha isn't forced or swizzled in) the alpha channel is overwritten with 255.
+			bool has_alpha = false;
+			if (m_params.m_force_alpha || alpha_swizzled)
+				has_alpha = true;
+			else if (!m_params.m_check_for_alpha)
+				file_image.set_alpha(255);
+			else if (file_image.has_alpha())
+				has_alpha = true;
+
+			if (has_alpha)
+				m_any_source_image_has_alpha = true;
+
+			debug_printf("Source image index %u filename %s %ux%u has alpha: %u\n", source_file_index, pSource_filename, file_image.get_width(), file_image.get_height(), has_alpha);
+
+			if (m_params.m_y_flip)
+				file_image.flip_y();
+
+#if DEBUG_EXTRACT_SINGLE_BLOCK
+			image block_image(4, 4);
+			const uint32_t block_x = 0;
+			const uint32_t block_y = 0;
+			block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image, 0);
+			file_image = block_image;
+#endif
+
+#if DEBUG_CROP_TEXTURE_TO_64x64
+			file_image.resize(64, 64);
+#endif
+
+			// Optional resampling: an explicit target size takes precedence over a scale factor.
+			// Either way the target is clamped to BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION.
+			if (m_params.m_resample_width > 0 && m_params.m_resample_height > 0)
+			{
+				int new_width = basisu::minimum<int>(m_params.m_resample_width, BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+				int new_height = basisu::minimum<int>(m_params.m_resample_height, BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+
+				debug_printf("Resampling to %ix%i\n", new_width, new_height);
+
+				// TODO: A box filter - kaiser looks too sharp on video. Let the caller control this.
+				image temp_img(new_width, new_height);
+				// Don't silently continue with an unfilled destination image if the resampler fails.
+				if (!image_resample(file_image, temp_img, m_params.m_perceptual, "box")) // "kaiser");
+				{
+					error_printf("basis_compressor::read_source_images: image_resample() failed!\n");
+					return false;
+				}
+				temp_img.swap(file_image);
+			}
+			else if (m_params.m_resample_factor > 0.0f)
+			{
+				int new_width = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image.get_width() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+				int new_height = basisu::minimum<int>(basisu::maximum(1, (int)ceilf(file_image.get_height() * m_params.m_resample_factor)), BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION);
+
+				debug_printf("Resampling to %ix%i\n", new_width, new_height);
+
+				// TODO: A box filter - kaiser looks too sharp on video. Let the caller control this.
+				image temp_img(new_width, new_height);
+				// Don't silently continue with an unfilled destination image if the resampler fails.
+				if (!image_resample(file_image, temp_img, m_params.m_perceptual, "box")) // "kaiser");
+				{
+					error_printf("basis_compressor::read_source_images: image_resample() failed!\n");
+					return false;
+				}
+				temp_img.swap(file_image);
+			}
+
+			if ((!file_image.get_width()) || (!file_image.get_height()))
+			{
+				error_printf("basis_compressor::read_source_images: Source image has a zero width and/or height!\n");
+				return false;
+			}
+
+			if ((file_image.get_width() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION) || (file_image.get_height() > BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION))
+			{
+				error_printf("basis_compressor::read_source_images: Source image is too large!\n");
+				return false;
+			}
+
+			source_images.push_back(file_image);
+			source_filenames.push_back(pSource_filename);
+		}
+
+		// Check if the caller has generated their own mipmaps. 
+		if (m_params.m_source_mipmap_images.size())
+		{
+			// Make sure they've passed us enough mipmap chains.
+			if ((m_params.m_source_images.size() != m_params.m_source_mipmap_images.size()) || (total_source_files != m_params.m_source_images.size()))
+			{
+				error_printf("basis_compressor::read_source_images(): m_params.m_source_mipmap_images.size() must equal m_params.m_source_images.size()!\n");
+				return false;
+			}
+
+			// Check if any of the user-supplied mipmap levels has alpha.
+			// We're assuming the user has already preswizzled their mipmap source images.
+			if (!m_any_source_image_has_alpha)
+			{
+				for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++)
+				{
+					for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++)
+					{
+						const image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index];
+
+						if (mip_img.has_alpha())
+						{
+							m_any_source_image_has_alpha = true;
+							break;
+						}
+					}
+
+					if (m_any_source_image_has_alpha)
+						break;
+				}
+			}
+		}
+
+		debug_printf("Any source image has alpha: %u\n", m_any_source_image_has_alpha);
+
+		// Second pass: now that the global alpha decision is known, build the slices of every image.
+		for (uint32_t source_file_index = 0; source_file_index < total_source_files; source_file_index++)
+		{
+			image &file_image = source_images[source_file_index];
+			const std::string &source_filename = source_filenames[source_file_index];
+
+			// Now, for each source image, create the slices corresponding to that image.
+			basisu::vector<image> slices;
+
+			slices.reserve(32);
+
+			// The first (largest) mipmap level.
+			slices.push_back(file_image);
+
+			if (m_params.m_source_mipmap_images.size())
+			{
+				// User-provided mipmaps for each layer or image in the texture array.
+				for (uint32_t mip_index = 0; mip_index < m_params.m_source_mipmap_images[source_file_index].size(); mip_index++)
+				{
+					// NOTE: the swizzle below is applied in place to the caller's mip image stored in m_params.
+					image& mip_img = m_params.m_source_mipmap_images[source_file_index][mip_index];
+
+					if (m_params.m_swizzle[0] != 0 ||
+						m_params.m_swizzle[1] != 1 ||
+						m_params.m_swizzle[2] != 2 ||
+						m_params.m_swizzle[3] != 3)
+					{
+						// Used for XY normal maps in RG - puts X in color, Y in alpha
+						for (uint32_t y = 0; y < mip_img.get_height(); y++)
+							for (uint32_t x = 0; x < mip_img.get_width(); x++)
+							{
+								const color_rgba &c = mip_img(x, y);
+								mip_img(x, y).set_noclamp_rgba(c[m_params.m_swizzle[0]], c[m_params.m_swizzle[1]], c[m_params.m_swizzle[2]], c[m_params.m_swizzle[3]]);
+							}
+					}
+
+					slices.push_back(mip_img);
+				}
+			}
+			else if (m_params.m_mip_gen)
+			{
+				// Automatically generate mipmaps.
+				if (!generate_mipmaps(file_image, slices, m_any_source_image_has_alpha))
+					return false;
+			}
+
+			// mip_indices[i] is the mip level that slices[i] belongs to (identity until alpha slices are interleaved).
+			uint_vec mip_indices(slices.size());
+			for (uint32_t i = 0; i < slices.size(); i++)
+				mip_indices[i] = i;
+
+			if ((m_any_source_image_has_alpha) && (!m_params.m_uastc))
+			{
+				// For ETC1S, if source has alpha, then even mips will have RGB, and odd mips will have alpha in RGB. 
+				basisu::vector<image> alpha_slices;
+				uint_vec new_mip_indices;
+
+				alpha_slices.reserve(slices.size() * 2);
+
+				for (uint32_t i = 0; i < slices.size(); i++)
+				{
+					image lvl_rgb(slices[i]);
+					image lvl_a(lvl_rgb);
+
+					// Replicate alpha into RGB of the alpha slice and make it opaque.
+					for (uint32_t y = 0; y < lvl_a.get_height(); y++)
+					{
+						for (uint32_t x = 0; x < lvl_a.get_width(); x++)
+						{
+							uint8_t a = lvl_a(x, y).a;
+							lvl_a(x, y).set_noclamp_rgba(a, a, a, 255);
+						}
+					}
+
+					lvl_rgb.set_alpha(255);
+
+					// Both slices of the pair share the same mip index.
+					alpha_slices.push_back(lvl_rgb);
+					new_mip_indices.push_back(i);
+
+					alpha_slices.push_back(lvl_a);
+					new_mip_indices.push_back(i);
+				}
+
+				slices.swap(alpha_slices);
+				mip_indices.swap(new_mip_indices);
+			}
+
+			assert(slices.size() == mip_indices.size());
+
+			for (uint32_t slice_index = 0; slice_index < slices.size(); slice_index++)
+			{
+				image& slice_image = slices[slice_index];
+				// Remember the dimensions before padding to 4x4 block boundaries.
+				const uint32_t orig_width = slice_image.get_width();
+				const uint32_t orig_height = slice_image.get_height();
+
+				bool is_alpha_slice = false;
+				if (m_any_source_image_has_alpha)
+				{
+					if (m_params.m_uastc)
+					{
+						is_alpha_slice = slice_image.has_alpha();
+					}
+					else
+					{
+						// ETC1S: slices were interleaved above as RGB (even), alpha (odd).
+						is_alpha_slice = (slice_index & 1) != 0;
+					}
+				}
+
+				// Enlarge the source image to 4x4 block boundaries, duplicating edge pixels if necessary to avoid introducing extra colors into blocks.
+				slice_image.crop_dup_borders(slice_image.get_block_width(4) * 4, slice_image.get_block_height(4) * 4);
+
+				if (m_params.m_debug_images)
+				{
+					save_png(string_format("basis_debug_source_image_%u_slice_%u.png", source_file_index, slice_index).c_str(), slice_image);
+				}
+
+				// Append one entry to each of the parallel per-slice arrays.
+				enlarge_vector(m_stats, 1);
+				enlarge_vector(m_slice_images, 1);
+				enlarge_vector(m_slice_descs, 1);
+
+				const uint32_t dest_image_index = (uint32_t)m_stats.size() - 1;
+
+				m_stats[dest_image_index].m_filename = source_filename.c_str();
+				m_stats[dest_image_index].m_width = orig_width;
+				m_stats[dest_image_index].m_height = orig_height;
+
+				m_slice_images[dest_image_index] = slice_image;
+
+				debug_printf("****** Slice %u: mip %u, alpha_slice: %u, filename: \"%s\", original: %ux%u actual: %ux%u\n", m_slice_descs.size() - 1, mip_indices[slice_index], is_alpha_slice, source_filename.c_str(), orig_width, orig_height, slice_image.get_width(), slice_image.get_height());
+
+				basisu_backend_slice_desc &slice_desc = m_slice_descs[dest_image_index];
+
+				// Blocks of all slices are laid out back to back; this slice starts at the running total.
+				slice_desc.m_first_block_index = m_total_blocks;
+
+				slice_desc.m_orig_width = orig_width;
+				slice_desc.m_orig_height = orig_height;
+
+				slice_desc.m_width = slice_image.get_width();
+				slice_desc.m_height = slice_image.get_height();
+
+				slice_desc.m_num_blocks_x = slice_image.get_block_width(4);
+				slice_desc.m_num_blocks_y = slice_image.get_block_height(4);
+
+				// A macroblock covers 2x2 blocks, rounded up.
+				slice_desc.m_num_macroblocks_x = (slice_desc.m_num_blocks_x + 1) >> 1;
+				slice_desc.m_num_macroblocks_y = (slice_desc.m_num_blocks_y + 1) >> 1;
+
+				slice_desc.m_source_file_index = source_file_index;
+
+				slice_desc.m_mip_index = mip_indices[slice_index];
+
+				slice_desc.m_alpha = is_alpha_slice;
+				slice_desc.m_iframe = false;
+				if (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames)
+				{
+					// Only slices of the first source image are flagged as i-frames.
+					slice_desc.m_iframe = (source_file_index == 0);
+				}
+
+				m_total_blocks += slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y;
+				total_macroblocks += slice_desc.m_num_macroblocks_x * slice_desc.m_num_macroblocks_y;
+
+			} // slice_index
+
+		} // source_file_index
+
+		debug_printf("Total blocks: %u, Total macroblocks: %u\n", m_total_blocks, total_macroblocks);
+
+		// Make sure we don't have too many slices
+		if (m_slice_descs.size() > BASISU_MAX_SLICES)
+		{
+			error_printf("Too many slices!\n");
+			return false;
+		}
+
+		// Basic sanity check on the slices
+		for (uint32_t i = 1; i < m_slice_descs.size(); i++)
+		{
+			const basisu_backend_slice_desc &prev_slice_desc = m_slice_descs[i - 1];
+			const basisu_backend_slice_desc &slice_desc = m_slice_descs[i];
+
+			// Make sure images are in order
+			int image_delta = (int)slice_desc.m_source_file_index - (int)prev_slice_desc.m_source_file_index;
+			if (image_delta > 1)
+				return false;	
+
+			// Make sure mipmap levels are in order
+			if (!image_delta)
+			{
+				int level_delta = (int)slice_desc.m_mip_index - (int)prev_slice_desc.m_mip_index;
+				if (level_delta > 1)
+					return false;
+			}
+		}
+
+		if (m_params.m_status_output)
+		{
+			printf("Total basis file slices: %u\n", (uint32_t)m_slice_descs.size());
+		}
+
+		// Per-slice validation (and optional status dump).
+		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+		{
+			const basisu_backend_slice_desc &slice_desc = m_slice_descs[i];
+
+			if (m_params.m_status_output)
+			{
+				printf("Slice: %u, alpha: %u, orig width/height: %ux%u, width/height: %ux%u, first_block: %u, image_index: %u, mip_level: %u, iframe: %u\n",
+					i, slice_desc.m_alpha, slice_desc.m_orig_width, slice_desc.m_orig_height, slice_desc.m_width, slice_desc.m_height, slice_desc.m_first_block_index, slice_desc.m_source_file_index, slice_desc.m_mip_index, slice_desc.m_iframe);
+			}
+
+			if (m_any_source_image_has_alpha)
+			{
+				if (!m_params.m_uastc)
+				{
+					// For ETC1S, alpha slices must be at odd slice indices.
+					if (slice_desc.m_alpha)
+					{
+						if ((i & 1) == 0)
+							return false;
+
+						// i is odd here, so i - 1 is a valid index.
+						const basisu_backend_slice_desc& prev_slice_desc = m_slice_descs[i - 1];
+
+						// Make sure previous slice has this image's color data
+						if (prev_slice_desc.m_source_file_index != slice_desc.m_source_file_index)
+							return false;
+						if (prev_slice_desc.m_alpha)
+							return false;
+						if (prev_slice_desc.m_mip_index != slice_desc.m_mip_index)
+							return false;
+						if (prev_slice_desc.m_num_blocks_x != slice_desc.m_num_blocks_x)
+							return false;
+						if (prev_slice_desc.m_num_blocks_y != slice_desc.m_num_blocks_y)
+							return false;
+					}
+					else if (i & 1)
+						return false;
+				}
+			}
+			else if (slice_desc.m_alpha)
+			{
+				return false;
+			}
+
+			// Padding can only grow a slice, never shrink it.
+			if ((slice_desc.m_orig_width > slice_desc.m_width) || (slice_desc.m_orig_height > slice_desc.m_height))
+				return false;
+			if ((slice_desc.m_source_file_index == 0) && (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))
+			{
+				if (!slice_desc.m_iframe)
+					return false;
+			}
+		}
+
+		return true;
+	}
+
+	// Validates the slice list against the constraints of the selected texture type
+	// (2D arrays, cubemaps, video, and volumes). Plain 2D textures are always accepted.
+	// Returns false, after printing an error, when a constraint is violated.
+	bool basis_compressor::validate_texture_type_constraints() 
+	{
+		debug_printf("basis_compressor::validate_texture_type_constraints\n");
+
+		// In 2D mode anything goes (each image may have a different resolution and # of mipmap levels).
+		const bool is_plain_2d = (m_params.m_tex_type == basist::cBASISTexType2D);
+		if (is_plain_2d)
+			return true;
+
+		// The image count is one more than the largest source file index used by any slice.
+		uint32_t num_images = 0;
+		for (uint32_t s = 0; s < m_slice_images.size(); s++)
+		{
+			const basisu_backend_slice_desc &desc = m_slice_descs[s];
+			num_images = maximum<uint32_t>(num_images, desc.m_source_file_index + 1);
+		}
+
+		// For cubemaps, validate that the total # of Basis images is a multiple of 6.
+		if ((m_params.m_tex_type == basist::cBASISTexTypeCubemapArray) && ((num_images % 6) != 0))
+		{
+			error_printf("basis_compressor::validate_texture_type_constraints: For cubemaps the total number of input images is not a multiple of 6!\n");
+			return false;
+		}
+
+		// Now validate that all the mip0's have the same dimensions, and that each image has the same # of mipmap levels.
+		uint_vec levels_per_image(num_images);
+
+		// Reference resolution, taken from the first mip-0 slice encountered (-1 = not seen yet).
+		int ref_width = -1;
+		int ref_height = -1;
+
+		for (uint32_t s = 0; s < m_slice_images.size(); s++)
+		{
+			const basisu_backend_slice_desc &desc = m_slice_descs[s];
+
+			// Track the highest mip level (plus one) seen for this slice's source image.
+			levels_per_image[desc.m_source_file_index] = maximum(levels_per_image[desc.m_source_file_index], desc.m_mip_index + 1);
+
+			// Only the top mip level takes part in the resolution comparison.
+			if (desc.m_mip_index != 0)
+				continue;
+
+			if (ref_width < 0)
+			{
+				ref_width = desc.m_orig_width;
+				ref_height = desc.m_orig_height;
+				continue;
+			}
+
+			const bool same_size = (ref_width == (int)desc.m_orig_width) && (ref_height == (int)desc.m_orig_height);
+			if (!same_size)
+			{
+				error_printf("basis_compressor::validate_texture_type_constraints: The source image resolutions are not all equal!\n");
+				return false;
+			}
+		}
+
+		// Every image must have as many mip levels as the first one.
+		for (size_t img = 1; img < levels_per_image.size(); img++)
+		{
+			if (levels_per_image[img] == levels_per_image[0])
+				continue;
+
+			error_printf("basis_compressor::validate_texture_type_constraints: Each image must have the same number of mipmap levels!\n");
+			return false;
+		}
+
+		return true;
+	}
+
+	// Fills m_source_blocks (resized to m_total_blocks entries) with the 4x4 pixel blocks of every slice image.
+	// Each slice's blocks are written in row-major order starting at that slice's m_first_block_index.
+	// Always returns true.
+	bool basis_compressor::extract_source_blocks()
+	{
+		debug_printf("basis_compressor::extract_source_blocks\n");
+
+		m_source_blocks.resize(m_total_blocks);
+
+		for (uint32_t s = 0; s < m_slice_images.size(); s++)
+		{
+			const basisu_backend_slice_desc& desc = m_slice_descs[s];
+			const image& src_img = m_slice_images[s];
+
+			const uint32_t blocks_wide = desc.m_num_blocks_x;
+			const uint32_t blocks_high = desc.m_num_blocks_y;
+
+			// Running destination index; advancing it once per block in row-major order is
+			// the same as first_block_index + bx + by * blocks_wide.
+			uint32_t dst_index = desc.m_first_block_index;
+
+			for (uint32_t by = 0; by < blocks_high; by++)
+			{
+				for (uint32_t bx = 0; bx < blocks_wide; bx++, dst_index++)
+				{
+					src_img.extract_block_clamped(m_source_blocks[dst_index].get_ptr(), bx * 4, by * 4, 4, 4);
+				}
+			}
+		}
+
+		return true;
+	}
+
+	// Configures and runs the ETC1S frontend over m_source_blocks.
+	//
+	// The endpoint/selector codebook sizes come from m_params.m_max_endpoint_clusters and
+	// m_params.m_max_selector_clusters, unless m_params.m_quality_level is not -1, in which case they are
+	// derived from the quality level and the total texel count. At higher quality levels the endpoint and
+	// selector RDO thresholds in m_params are also scaled down, each one only if the caller did not
+	// explicitly change it.
+	//
+	// Returns false if the explicit cluster counts exceed the frontend's limits or if the frontend fails to initialize.
+	bool basis_compressor::process_frontend()
+	{
+		debug_printf("basis_compressor::process_frontend\n");
+
+#if 0
+		// TODO
+		basis_etc1_pack_params pack_params;
+		pack_params.m_quality = cETCQualityMedium;
+		pack_params.m_perceptual = m_params.m_perceptual;
+		pack_params.m_use_color4 = false;
+
+		pack_etc1_block_context pack_context;
+
+		std::unordered_set<uint64_t> endpoint_hash;
+		std::unordered_set<uint32_t> selector_hash;
+
+		for (uint32_t i = 0; i < m_source_blocks.size(); i++)
+		{
+			etc_block blk;
+			pack_etc1_block(blk, m_source_blocks[i].get_ptr(), pack_params, pack_context);
+
+			const color_rgba c0(blk.get_block_color(0, false));
+			endpoint_hash.insert((c0.r | (c0.g << 5) | (c0.b << 10)) | (blk.get_inten_table(0) << 16));
+
+			const color_rgba c1(blk.get_block_color(1, false));
+			endpoint_hash.insert((c1.r | (c1.g << 5) | (c1.b << 10)) | (blk.get_inten_table(1) << 16));
+
+			selector_hash.insert(blk.get_raw_selector_bits());
+		}
+
+		const uint32_t total_unique_endpoints = (uint32_t)endpoint_hash.size();
+		const uint32_t total_unique_selectors = (uint32_t)selector_hash.size();
+
+		if (m_params.m_debug)
+		{
+			debug_printf("Unique endpoints: %u, unique selectors: %u\n", total_unique_endpoints, total_unique_selectors);
+		}
+#endif
+
+		// Each block is 4x4 = 16 texels.
+		const double total_texels = m_total_blocks * 16.0f;
+
+		int endpoint_clusters = m_params.m_max_endpoint_clusters;
+		int selector_clusters = m_params.m_max_selector_clusters;
+
+		if (endpoint_clusters > basisu_frontend::cMaxEndpointClusters)
+		{
+			error_printf("Too many endpoint clusters! (%u but max is %u)\n", endpoint_clusters, basisu_frontend::cMaxEndpointClusters);
+			return false;
+		}
+		if (selector_clusters > basisu_frontend::cMaxSelectorClusters)
+		{
+			error_printf("Too many selector clusters! (%u but max is %u)\n", selector_clusters, basisu_frontend::cMaxSelectorClusters);
+			return false;
+		}
+
+		// A quality level other than -1 overrides the explicit cluster counts above.
+		if (m_params.m_quality_level != -1)
+		{
+			const float quality = saturate(m_params.m_quality_level / 255.0f);
+
+			// Upper bound on the endpoint codebook size from a bits-per-texel budget.
+			const float bits_per_endpoint_cluster = 14.0f;
+			const float max_desired_endpoint_cluster_bits_per_texel = 1.0f; // .15f
+			int max_endpoints = static_cast<int>((max_desired_endpoint_cluster_bits_per_texel * total_texels) / bits_per_endpoint_cluster);
+
+			const float mid = 128.0f / 255.0f;
+
+			float color_endpoint_quality = quality;
+
+			const float endpoint_split_point = 0.5f;
+
+			// In v1.2 and in previous versions, the endpoint codebook size at quality 128 was 3072. This wasn't quite large enough.
+			const int ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE = 4800;
+			const int MAX_ENDPOINT_CODEBOOK_SIZE = 8192;
+
+			// The quality -> codebook size curve is piecewise: one mapping up to quality 128, another above it.
+			if (color_endpoint_quality <= mid)
+			{
+				color_endpoint_quality = lerp(0.0f, endpoint_split_point, powf(color_endpoint_quality / mid, .65f));
+
+				max_endpoints = clamp<int>(max_endpoints, 256, ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE);
+				max_endpoints = minimum<uint32_t>(max_endpoints, m_total_blocks);
+
+				if (max_endpoints < 64)
+					max_endpoints = 64;
+				endpoint_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(32, static_cast<float>(max_endpoints), color_endpoint_quality)), 32, basisu_frontend::cMaxEndpointClusters);
+			}
+			else
+			{
+				color_endpoint_quality = powf((color_endpoint_quality - mid) / (1.0f - mid), 1.6f);
+
+				max_endpoints = clamp<int>(max_endpoints, 256, MAX_ENDPOINT_CODEBOOK_SIZE);
+				max_endpoints = minimum<uint32_t>(max_endpoints, m_total_blocks);
+
+				if (max_endpoints < ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE)
+					max_endpoints = ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE;
+				endpoint_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(ENDPOINT_CODEBOOK_MID_QUALITY_CODEBOOK_SIZE, static_cast<float>(max_endpoints), color_endpoint_quality)), 32, basisu_frontend::cMaxEndpointClusters);
+			}
+
+			float bits_per_selector_cluster = m_params.m_global_sel_pal ? 21.0f : 14.0f;
+
+			const float max_desired_selector_cluster_bits_per_texel = 1.0f; // .15f
+			int max_selectors = static_cast<int>((max_desired_selector_cluster_bits_per_texel * total_texels) / bits_per_selector_cluster);
+			max_selectors = clamp<int>(max_selectors, 256, basisu_frontend::cMaxSelectorClusters);
+			max_selectors = minimum<uint32_t>(max_selectors, m_total_blocks);
+
+			float color_selector_quality = quality;
+			//color_selector_quality = powf(color_selector_quality, 1.65f);
+			color_selector_quality = powf(color_selector_quality, 2.62f);
+
+			if (max_selectors < 96)
+				max_selectors = 96;
+			selector_clusters = clamp<uint32_t>((uint32_t)(.5f + lerp<float>(96, static_cast<float>(max_selectors), color_selector_quality)), 8, basisu_frontend::cMaxSelectorClusters);
+
+			debug_printf("Max endpoints: %u, max selectors: %u\n", endpoint_clusters, selector_clusters);
+
+			// Relax the RDO thresholds as quality rises. Each threshold is only touched when the
+			// caller left it at its default, and the two thresholds are handled independently in
+			// every tier.
+			if (m_params.m_quality_level >= 223)
+			{
+				if (!m_params.m_endpoint_rdo_thresh.was_changed())
+					m_params.m_endpoint_rdo_thresh *= .25f;
+
+				if (!m_params.m_selector_rdo_thresh.was_changed())
+					m_params.m_selector_rdo_thresh *= .25f;
+			}
+			else if (m_params.m_quality_level >= 192)
+			{
+				if (!m_params.m_endpoint_rdo_thresh.was_changed())
+					m_params.m_endpoint_rdo_thresh *= .5f;
+
+				if (!m_params.m_selector_rdo_thresh.was_changed())
+					m_params.m_selector_rdo_thresh *= .5f;
+			}
+			else if (m_params.m_quality_level >= 160)
+			{
+				if (!m_params.m_endpoint_rdo_thresh.was_changed())
+					m_params.m_endpoint_rdo_thresh *= .75f;
+
+				if (!m_params.m_selector_rdo_thresh.was_changed())
+					m_params.m_selector_rdo_thresh *= .75f;
+			}
+			else if (m_params.m_quality_level >= 129)
+			{
+				// Blend the scale from 1.0 at quality 129 towards .75 at quality 160.
+				float l = (quality - 129 / 255.0f) / ((160 - 129) / 255.0f);
+
+				if (!m_params.m_endpoint_rdo_thresh.was_changed())
+					m_params.m_endpoint_rdo_thresh *= lerp<float>(1.0f, .75f, l);
+
+				if (!m_params.m_selector_rdo_thresh.was_changed())
+					m_params.m_selector_rdo_thresh *= lerp<float>(1.0f, .75f, l);
+			}
+		}
+
+		// Optionally switch to the global selector palette automatically when the texture is small
+		// (at most 128x128 texels in total) and the estimated selector codebook cost per texel is high.
+		m_auto_global_sel_pal = false;
+		if (!m_params.m_global_sel_pal && m_params.m_auto_global_sel_pal)
+		{
+			const float bits_per_selector_cluster = 31.0f;
+			double selector_codebook_bpp_est = (bits_per_selector_cluster * selector_clusters) / total_texels;
+			debug_printf("selector_codebook_bpp_est: %f\n", selector_codebook_bpp_est);
+			const float force_global_sel_pal_bpp_threshold = .15f;
+			if ((total_texels <= 128.0f*128.0f) && (selector_codebook_bpp_est > force_global_sel_pal_bpp_threshold))
+			{
+				m_auto_global_sel_pal = true;
+				debug_printf("Auto global selector palette enabled\n");
+			}
+		}
+
+		// Hand everything over to the frontend.
+		basisu_frontend::params p;
+		p.m_num_source_blocks = m_total_blocks;
+		p.m_pSource_blocks = &m_source_blocks[0];
+		p.m_max_endpoint_clusters = endpoint_clusters;
+		p.m_max_selector_clusters = selector_clusters;
+		p.m_perceptual = m_params.m_perceptual;
+		p.m_debug_stats = m_params.m_debug;
+		p.m_debug_images = m_params.m_debug_images;
+		p.m_compression_level = m_params.m_compression_level;
+		p.m_tex_type = m_params.m_tex_type;
+		p.m_multithreaded = m_params.m_multithreading;
+		p.m_disable_hierarchical_endpoint_codebooks = m_params.m_disable_hierarchical_endpoint_codebooks;
+		p.m_validate = m_params.m_validate;
+		p.m_pJob_pool = m_params.m_pJob_pool;
+		p.m_pGlobal_codebooks = m_params.m_pGlobal_codebooks;
+
+		if ((m_params.m_global_sel_pal) || (m_auto_global_sel_pal))
+		{
+			p.m_pGlobal_sel_codebook = m_params.m_pSel_codebook;
+			p.m_num_global_sel_codebook_pal_bits = m_params.m_global_pal_bits;
+			p.m_num_global_sel_codebook_mod_bits = m_params.m_global_mod_bits;
+			p.m_use_hybrid_selector_codebooks = !m_params.m_no_hybrid_sel_cb;
+			p.m_hybrid_codebook_quality_thresh = m_params.m_hybrid_sel_cb_quality_thresh;
+		}
+
+		if (!m_frontend.init(p))
+		{
+			error_printf("basisu_frontend::init() failed!\n");
+			return false;
+		}
+
+		m_frontend.compress();
+
+		// Optionally dump two debug images per slice (last argument true, then false).
+		if (m_params.m_debug_images)
+		{
+			for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+			{
+				char filename[1024];
+#ifdef _WIN32				
+				sprintf_s(filename, sizeof(filename), "rdo_frontend_output_output_blocks_%u.png", i);
+#else
+				snprintf(filename, sizeof(filename), "rdo_frontend_output_output_blocks_%u.png", i);
+#endif				
+				m_frontend.dump_debug_image(filename, m_slice_descs[i].m_first_block_index, m_slice_descs[i].m_num_blocks_x, m_slice_descs[i].m_num_blocks_y, true);
+
+#ifdef _WIN32
+				sprintf_s(filename, sizeof(filename), "rdo_frontend_output_api_%u.png", i);
+#else
+				snprintf(filename, sizeof(filename), "rdo_frontend_output_api_%u.png", i);
+#endif				
+				m_frontend.dump_debug_image(filename, m_slice_descs[i].m_first_block_index, m_slice_descs[i].m_num_blocks_x, m_slice_descs[i].m_num_blocks_y, false);
+			}
+		}
+
+		return true;
+	}
+
+	// Copies the frontend's results into per-slice ETC1 textures: m_frontend_output_textures receives
+	// the frontend's output blocks, m_best_etc1s_images receives its ETC1S blocks, and each
+	// m_best_etc1s_images entry is then unpacked into m_best_etc1s_images_unpacked. Always returns true.
+	bool basis_compressor::extract_frontend_texture_data()
+	{
+		debug_printf("basis_compressor::extract_frontend_texture_data\n");
+
+		m_frontend_output_textures.resize(m_slice_descs.size());
+		m_best_etc1s_images.resize(m_slice_descs.size());
+		m_best_etc1s_images_unpacked.resize(m_slice_descs.size());
+
+		for (uint32_t slice = 0; slice < m_slice_descs.size(); slice++)
+		{
+			const basisu_backend_slice_desc &desc = m_slice_descs[slice];
+
+			const uint32_t blocks_wide = desc.m_num_blocks_x;
+			const uint32_t blocks_high = desc.m_num_blocks_y;
+
+			// Both destination textures cover the slice's whole block grid (4 texels per block side).
+			const uint32_t tex_width = blocks_wide * 4;
+			const uint32_t tex_height = blocks_high * 4;
+
+			m_frontend_output_textures[slice].init(texture_format::cETC1, tex_width, tex_height);
+			m_best_etc1s_images[slice].init(texture_format::cETC1, tex_width, tex_height);
+
+			// The frontend stores the slice's blocks row-major starting at m_first_block_index,
+			// so a running index visits them in the same order as the (x, y) walk.
+			uint32_t frontend_block_index = desc.m_first_block_index;
+
+			for (uint32_t by = 0; by < blocks_high; by++)
+			{
+				for (uint32_t bx = 0; bx < blocks_wide; bx++, frontend_block_index++)
+				{
+					memcpy(m_frontend_output_textures[slice].get_block_ptr(bx, by, 0), &m_frontend.get_output_block(frontend_block_index), sizeof(etc_block));
+					memcpy(m_best_etc1s_images[slice].get_block_ptr(bx, by, 0), &m_frontend.get_etc1s_block(frontend_block_index), sizeof(etc_block));
+				}
+			}
+
+#if 0
+			if (m_params.m_debug_images)
+			{
+				char filename[1024];
+				sprintf_s(filename, sizeof(filename), "rdo_etc_frontend_%u_", slice);
+				write_etc1_vis_images(m_frontend_output_textures[slice], filename);
+			}
+#endif
+
+			m_best_etc1s_images[slice].unpack(m_best_etc1s_images_unpacked[slice]);
+		}
+
+		return true;
+	}
+
+	// Builds the backend parameters from m_params and the frontend's parameters, initializes the
+	// backend and runs its encoder. Returns false when the encoder reports zero packed bytes.
+	bool basis_compressor::process_backend()
+	{
+		debug_printf("basis_compressor::process_backend\n");
+
+		const basisu_frontend::params &frontend_params = m_frontend.get_params();
+
+		basisu_backend_params bp;
+		bp.m_debug = m_params.m_debug;
+		bp.m_debug_images = m_params.m_debug_images;
+		bp.m_etc1s = true;
+		bp.m_compression_level = m_params.m_compression_level;
+
+		// The RDO thresholds are only forwarded when the corresponding RDO stage is enabled;
+		// otherwise the backend params keep whatever they were constructed with.
+		const bool endpoint_rdo_enabled = !m_params.m_no_endpoint_rdo;
+		if (endpoint_rdo_enabled)
+			bp.m_endpoint_rdo_quality_thresh = m_params.m_endpoint_rdo_thresh;
+
+		const bool selector_rdo_enabled = !m_params.m_no_selector_rdo;
+		if (selector_rdo_enabled)
+			bp.m_selector_rdo_quality_thresh = m_params.m_selector_rdo_thresh;
+
+		// Mirror the codebook settings the frontend actually ran with.
+		bp.m_use_global_sel_codebook = (frontend_params.m_pGlobal_sel_codebook != NULL);
+		bp.m_global_sel_codebook_pal_bits = frontend_params.m_num_global_sel_codebook_pal_bits;
+		bp.m_global_sel_codebook_mod_bits = frontend_params.m_num_global_sel_codebook_mod_bits;
+		bp.m_use_hybrid_sel_codebooks = frontend_params.m_use_hybrid_selector_codebooks;
+		bp.m_used_global_codebooks = frontend_params.m_pGlobal_codebooks != nullptr;
+
+		m_backend.init(&m_frontend, bp, m_slice_descs, m_params.m_pSel_codebook);
+
+		const uint32_t packed_size = m_backend.encode();
+		if (packed_size == 0)
+		{
+			error_printf("basis_compressor::encode() failed!\n");
+			return false;
+		}
+
+		debug_printf("Total packed bytes (estimated): %u\n", packed_size);
+
+		return true;
+	}
+
+	// Serializes the encoder output into a .basis file, then transcodes that file back as a self-check.
+	//
+	// Steps: build the file with m_basis_file.init(), copy it to m_output_basis_file, validate its checksums,
+	// transcode every slice to ASTC 4x4 (UASTC mode) or ETC1 (ETC1S mode) and, when the transcoder reports BC7
+	// support for both source formats, to BC7 as well. The transcoded GPU images and their unpacked versions are
+	// stored in the m_decoded_output_textures* arrays, and m_basis_file_size / m_basis_bits_per_texel are updated.
+	//
+	// Returns false if the file can't be created, validated or transcoded, or if an ETC1S slice's CRC-16 doesn't
+	// match the value recorded in the encoded output.
+	bool basis_compressor::create_basis_file_and_transcode()
+	{
+		debug_printf("basis_compressor::create_basis_file_and_transcode\n");
+
+		// UASTC output lives in its own member; ETC1S output is owned by the backend.
+		const basisu_backend_output& encoded_output = m_params.m_uastc ? m_uastc_backend_output : m_backend.get_output();
+
+		if (!m_basis_file.init(encoded_output, m_params.m_tex_type, m_params.m_userdata0, m_params.m_userdata1, m_params.m_y_flip, m_params.m_us_per_frame))
+		{
+			error_printf("basis_compressor::create_basis_file_and_transcode: basisu_backend:init() failed!\n");
+			return false;
+		}
+
+		const uint8_vec &comp_data = m_basis_file.get_compressed_data();
+
+		m_output_basis_file = comp_data;
+
+		interval_timer tm;
+		tm.start();
+
+		basist::basisu_transcoder_init();
+
+		debug_printf("basist::basisu_transcoder_init: Took %f ms\n", tm.get_elapsed_ms());
+
+		// Verify the compressed data by transcoding it to ASTC (or ETC1)/BC7 and validating the CRC's.
+		basist::basisu_transcoder decoder(m_params.m_pSel_codebook);
+		if (!decoder.validate_file_checksums(&comp_data[0], (uint32_t)comp_data.size(), true))
+		{
+			error_printf("decoder.validate_file_checksums() failed!\n");
+			return false;
+		}
+
+		m_decoded_output_textures.resize(m_slice_descs.size());
+		m_decoded_output_textures_unpacked.resize(m_slice_descs.size());
+
+		m_decoded_output_textures_bc7.resize(m_slice_descs.size());
+		m_decoded_output_textures_unpacked_bc7.resize(m_slice_descs.size());
+
+		tm.start();
+		if (m_params.m_pGlobal_codebooks)
+		{
+			decoder.set_global_codebooks(m_params.m_pGlobal_codebooks);
+		}
+
+		if (!decoder.start_transcoding(&comp_data[0], (uint32_t)comp_data.size()))
+		{
+			error_printf("decoder.start_transcoding() failed!\n");
+			return false;
+		}
+
+		double start_transcoding_time = tm.get_elapsed_secs();
+
+		debug_printf("basisu_compressor::start_transcoding() took %3.3fms\n", start_transcoding_time * 1000.0f);
+
+		uint32_t total_orig_pixels = 0;
+
+		double total_time_etc1s_or_astc = 0;
+
+		// Pass 1: transcode each slice to the "native" target (ASTC 4x4 for UASTC, ETC1 for ETC1S).
+		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+		{
+			gpu_image decoded_texture;
+			decoded_texture.init(m_params.m_uastc ? texture_format::cASTC4x4 : texture_format::cETC1, m_slice_descs[i].m_width, m_slice_descs[i].m_height);
+
+			tm.start();
+
+			basist::block_format format = m_params.m_uastc ? basist::block_format::cASTC_4x4 : basist::block_format::cETC1;
+			uint32_t bytes_per_block = m_params.m_uastc ? 16 : 8;
+
+			if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i,
+				reinterpret_cast<etc_block *>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, format, bytes_per_block))
+			{
+				error_printf("Transcoding failed on slice %u!\n", i);
+				return false;
+			}
+
+			total_time_etc1s_or_astc += tm.get_elapsed_secs();
+
+			// Only ETC1S output is compared against the per-slice CRC-16 stored in the encoded output.
+			if (encoded_output.m_tex_format == basist::basis_tex_format::cETC1S)
+			{
+				uint32_t image_crc16 = basist::crc16(decoded_texture.get_ptr(), decoded_texture.get_size_in_bytes(), 0);
+				if (image_crc16 != encoded_output.m_slice_image_crcs[i])
+				{
+					error_printf("Decoded image data CRC check failed on slice %u!\n", i);
+					return false;
+				}
+				// The slice index is unsigned, so print it with %u like the failure message above.
+				debug_printf("Decoded image data CRC check succeeded on slice %u\n", i);
+			}
+
+			m_decoded_output_textures[i] = decoded_texture;
+
+			total_orig_pixels += m_slice_descs[i].m_orig_width * m_slice_descs[i].m_orig_height;
+		}
+
+		double total_time_bc7 = 0;
+
+		// Pass 2: transcode to BC7 too, but only if the transcoder supports BC7 from both UASTC and ETC1S.
+		if (basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cUASTC4x4) &&
+			basist::basis_is_format_supported(basist::transcoder_texture_format::cTFBC7_RGBA, basist::basis_tex_format::cETC1S))
+		{
+			for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+			{
+				gpu_image decoded_texture;
+				decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height);
+
+				tm.start();
+
+				if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i,
+					reinterpret_cast<etc_block*>(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16))
+				{
+					error_printf("Transcoding failed to BC7 on slice %u!\n", i);
+					return false;
+				}
+
+				total_time_bc7 += tm.get_elapsed_secs();
+
+				m_decoded_output_textures_bc7[i] = decoded_texture;
+			}
+		}
+
+		// Unpack the transcoded GPU images; the BC7 entry is skipped when no BC7 image was produced for the slice.
+		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+		{
+			m_decoded_output_textures[i].unpack(m_decoded_output_textures_unpacked[i]);
+
+			if (m_decoded_output_textures_bc7[i].get_pixel_width())
+				m_decoded_output_textures_bc7[i].unpack(m_decoded_output_textures_unpacked_bc7[i]);
+		}
+
+		debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", m_params.m_uastc ? "ASTC" : "ETC1", total_time_etc1s_or_astc * 1000.0f, total_orig_pixels / total_time_etc1s_or_astc);
+
+		if (total_time_bc7 != 0)
+			debug_printf("Transcoded to BC7 in %3.3fms, %f texels/sec\n", total_time_bc7 * 1000.0f, total_orig_pixels / total_time_bc7);
+
+		debug_printf("Total .basis output file size: %u, %3.3f bits/texel\n", comp_data.size(), comp_data.size() * 8.0f / total_orig_pixels);
+
+		uint32_t total_orig_texels = 0;
+		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+		{
+			const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index];
+
+			total_orig_texels += slice_desc.m_orig_width * slice_desc.m_orig_height;
+
+			const uint32_t total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y;
+			BASISU_NOTE_UNUSED(total_blocks);
+
+			// Sanity check: the transcoded image must hold exactly as many blocks as the slice describes.
+			assert(m_decoded_output_textures[slice_index].get_total_blocks() == total_blocks);
+		}
+
+		m_basis_file_size = (uint32_t)comp_data.size();
+		m_basis_bits_per_texel = (comp_data.size() * 8.0f) / total_orig_texels;
+
+		return true;
+	}
+
+	// Optionally writes the compressed output to disk, then optionally computes per-slice statistics and debug images.
+	//
+	// - When m_params.m_write_output_basis_files is set, the KTX2 data (if m_params.m_create_ktx2_file) or the .basis
+	//   data is written to m_params.m_out_filename.
+	// - When stats are requested in UASTC mode, the output is round-tripped through miniz deflate/inflate to measure
+	//   its LZ-compressed size, and m_basis_bits_per_texel is recomputed from that size.
+	// - When stats are requested, m_stats[slice] is filled with PSNR/SSIM values comparing the source slice against
+	//   the transcoded images (and, for ETC1S, against the unquantized "best" ETC1S image).
+	// - When debug images are enabled, "basis_debug_*" .ktx/.png files are written for each slice.
+	//
+	// Returns false if the output file can't be written or if the miniz round trip fails; true otherwise.
+	bool basis_compressor::write_output_files_and_compute_stats()
+	{
+		debug_printf("basis_compressor::write_output_files_and_compute_stats\n");
+
+		const uint8_vec& comp_data = m_params.m_create_ktx2_file ? m_output_ktx2_file : m_basis_file.get_compressed_data();
+		if (m_params.m_write_output_basis_files)
+		{
+			const std::string& output_filename = m_params.m_out_filename;
+
+			if (!write_vec_to_file(output_filename.c_str(), comp_data))
+			{
+				error_printf("Failed writing output data to file \"%s\"\n", output_filename.c_str());
+				return false;
+			}
+
+			printf("Wrote output .basis/.ktx2 file \"%s\"\n", output_filename.c_str());
+		}
+
+		// comp_size stays 0 unless the LZ round trip below runs; the per-slice stats use that to pick which size to report.
+		size_t comp_size = 0;
+		if ((m_params.m_compute_stats) && (m_params.m_uastc) && (comp_data.size()))
+		{
+			void* pComp_data = tdefl_compress_mem_to_heap(&comp_data[0], comp_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);// TDEFL_DEFAULT_MAX_PROBES);
+			size_t decomp_size = 0;
+			// Don't hand a null buffer to the decompressor if compression itself failed.
+			void* pDecomp_data = pComp_data ? tinfl_decompress_mem_to_heap(pComp_data, comp_size, &decomp_size, 0) : nullptr;
+
+			const bool round_trip_ok = (pDecomp_data != nullptr) &&
+				(decomp_size == comp_data.size()) &&
+				(memcmp(pDecomp_data, &comp_data[0], decomp_size) == 0);
+
+			// Release both heap buffers on every path, including the failure return below.
+			if (pComp_data)
+				mz_free(pComp_data);
+			if (pDecomp_data)
+				mz_free(pDecomp_data);
+
+			if (!round_trip_ok)
+			{
+				printf("basis_compressor::write_output_files_and_compute_stats: miniz compression or decompression failed!\n");
+				return false;
+			}
+
+			// Each block contributes 16 texels to the bits/texel figure.
+			uint32_t total_texels = 0;
+			for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+				total_texels += (m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y) * 16;
+
+			m_basis_bits_per_texel = comp_size * 8.0f / total_texels;
+
+			debug_printf(".basis file size: %u, LZ compressed file size: %u, %3.2f bits/texel\n",
+				(uint32_t)comp_data.size(),
+				(uint32_t)comp_size,
+				m_basis_bits_per_texel);
+		}
+
+		m_stats.resize(m_slice_descs.size());
+
+		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+		{
+			const basisu_backend_slice_desc &slice_desc = m_slice_descs[slice_index];
+
+			if (m_params.m_compute_stats)
+			{
+				printf("Slice: %u\n", slice_index);
+
+				image_stats &s = m_stats[slice_index];
+
+				// TODO: We used to output SSIM (during heavy encoder development), but this slowed down compression too much. We'll be adding it back.
+
+				image_metrics em;
+
+				// ---- .basis stats
+				em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3);
+				em.print(".basis RGB Avg:          ");
+				s.m_basis_rgb_avg_psnr = em.m_psnr;
+
+				em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4);
+				em.print(".basis RGBA Avg:         ");
+				s.m_basis_rgba_avg_psnr = em.m_psnr;
+
+				em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1);
+				em.print(".basis R   Avg:          ");
+
+				em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1);
+				em.print(".basis G   Avg:          ");
+
+				em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1);
+				em.print(".basis B   Avg:          ");
+
+				if (m_params.m_uastc)
+				{
+					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1);
+					em.print(".basis A   Avg:          ");
+
+					s.m_basis_a_avg_psnr = em.m_psnr;
+				}
+
+				em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0);
+				em.print(".basis 709 Luma:         ");
+				s.m_basis_luma_709_psnr = static_cast<float>(em.m_psnr);
+				s.m_basis_luma_709_ssim = static_cast<float>(em.m_ssim);
+
+				em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true);
+				em.print(".basis 601 Luma:         ");
+				s.m_basis_luma_601_psnr = static_cast<float>(em.m_psnr);
+
+				if (m_slice_descs.size() == 1)
+				{
+					// Prefer the LZ-compressed size when it was measured above, otherwise use the raw output size.
+					const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size();
+					debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+					debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height)));
+				}
+
+				// BC7 stats are only available when a BC7 image was transcoded and unpacked for this slice.
+				if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width())
+				{
+					// ---- BC7 stats
+					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3);
+					em.print("BC7 RGB Avg:             ");
+					s.m_bc7_rgb_avg_psnr = em.m_psnr;
+
+					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4);
+					em.print("BC7 RGBA Avg:            ");
+					s.m_bc7_rgba_avg_psnr = em.m_psnr;
+
+					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1);
+					em.print("BC7 R   Avg:             ");
+
+					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1);
+					em.print("BC7 G   Avg:             ");
+
+					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1);
+					em.print("BC7 B   Avg:             ");
+
+					if (m_params.m_uastc)
+					{
+						em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1);
+						em.print("BC7 A   Avg:             ");
+
+						s.m_bc7_a_avg_psnr = em.m_psnr;
+					}
+
+					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0);
+					em.print("BC7 709 Luma:            ");
+					s.m_bc7_luma_709_psnr = static_cast<float>(em.m_psnr);
+					s.m_bc7_luma_709_ssim = static_cast<float>(em.m_ssim);
+
+					em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true);
+					em.print("BC7 601 Luma:            ");
+					s.m_bc7_luma_601_psnr = static_cast<float>(em.m_psnr);
+				}
+
+				if (!m_params.m_uastc)
+				{
+					// ---- Nearly best possible ETC1S stats
+					em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0);
+					em.print("Unquantized ETC1S 709 Luma:    ");
+
+					s.m_best_etc1s_luma_709_psnr = static_cast<float>(em.m_psnr);
+					s.m_best_etc1s_luma_709_ssim = static_cast<float>(em.m_ssim);
+
+					em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true);
+					em.print("Unquantized ETC1S 601 Luma:    ");
+
+					s.m_best_etc1s_luma_601_psnr = static_cast<float>(em.m_psnr);
+
+					em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3);
+					em.print("Unquantized ETC1S RGB Avg:     ");
+
+					s.m_best_etc1s_rgb_avg_psnr = static_cast<float>(em.m_psnr);
+				}
+			}
+
+			// Debug file names derive from the output filename when set, otherwise from the slice's source filename.
+			std::string out_basename;
+			if (m_params.m_out_filename.size())
+				string_get_filename(m_params.m_out_filename.c_str(), out_basename);
+			else if (m_params.m_source_filenames.size())
+				string_get_filename(m_params.m_source_filenames[slice_desc.m_source_file_index].c_str(), out_basename);
+
+			string_remove_extension(out_basename);
+			out_basename = "basis_debug_" + out_basename + string_format("_slice_%u", slice_index);
+
+			if ((!m_params.m_uastc) && (m_frontend.get_params().m_debug_images))
+			{
+				// Write "best" ETC1S debug images
+				gpu_image best_etc1s_gpu_image(m_best_etc1s_images[slice_index]);
+				best_etc1s_gpu_image.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
+				write_compressed_texture_file((out_basename + "_best_etc1s.ktx").c_str(), best_etc1s_gpu_image);
+
+				image best_etc1s_unpacked;
+				best_etc1s_gpu_image.unpack(best_etc1s_unpacked);
+				save_png(out_basename + "_best_etc1s.png", best_etc1s_unpacked);
+			}
+
+			if (m_params.m_debug_images)
+			{
+				// Write decoded ETC1S/ASTC debug images
+				{
+					gpu_image decoded_etc1s_or_astc(m_decoded_output_textures[slice_index]);
+					decoded_etc1s_or_astc.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
+					write_compressed_texture_file((out_basename + "_transcoded_etc1s_or_astc.ktx").c_str(), decoded_etc1s_or_astc);
+
+					image temp(m_decoded_output_textures_unpacked[slice_index]);
+					temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height);
+					save_png(out_basename + "_transcoded_etc1s_or_astc.png", temp);
+				}
+
+				// Write decoded BC7 debug images
+				if (m_decoded_output_textures_bc7[slice_index].get_pixel_width())
+				{
+					gpu_image decoded_bc7(m_decoded_output_textures_bc7[slice_index]);
+					decoded_bc7.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height);
+					write_compressed_texture_file((out_basename + "_transcoded_bc7.ktx").c_str(), decoded_bc7);
+
+					image temp(m_decoded_output_textures_unpacked_bc7[slice_index]);
+					temp.crop(slice_desc.m_orig_width, slice_desc.m_orig_height);
+					save_png(out_basename + "_transcoded_bc7.png", temp);
+				}
+			}
+		}
+
+		return true;
+	}
+	
+	// Make sure all the mip 0's have the same dimensions and number of mipmap levels, or we can't encode the KTX2 file.
+	//
+	// Returns true when every mip-0 slice has the same original width/height and every layer (source file index)
+	// has the same number of mip levels as layer 0. Returns false otherwise, including when a slice refers to a
+	// source file that has no mip-0 slice.
+	bool basis_compressor::validate_ktx2_constraints()
+	{
+		uint32_t base_width = 0, base_height = 0;
+		uint32_t total_layers = 0;
+		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+		{
+			if (m_slice_descs[i].m_mip_index == 0)
+			{
+				if (!base_width)
+				{
+					// The first mip-0 slice seen defines the reference dimensions.
+					base_width = m_slice_descs[i].m_orig_width;
+					base_height = m_slice_descs[i].m_orig_height;
+				}
+				else
+				{
+					if ((m_slice_descs[i].m_orig_width != base_width) || (m_slice_descs[i].m_orig_height != base_height))
+					{
+						return false;
+					}
+				}
+
+				total_layers = maximum<uint32_t>(total_layers, m_slice_descs[i].m_source_file_index + 1);
+			}
+		}
+
+		// Per-layer mip count = highest mip index seen for that layer, plus one.
+		basisu::vector<uint32_t> total_mips(total_layers);
+		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+		{
+			const uint32_t layer_index = m_slice_descs[i].m_source_file_index;
+
+			// total_layers only counts layers that own a mip-0 slice, so a slice from any other layer would index
+			// past the end of total_mips. Such a layout can't be encoded, so reject it instead.
+			if (layer_index >= total_layers)
+				return false;
+
+			total_mips[layer_index] = maximum<uint32_t>(total_mips[layer_index], m_slice_descs[i].m_mip_index + 1);
+		}
+
+		for (uint32_t i = 1; i < total_layers; i++)
+		{
+			if (total_mips[0] != total_mips[i])
+			{
+				return false;
+			}
+		}
+
+		return true;
+	}
+
+	// Template KTX2 Data Format Descriptor (DFD) blobs, one per combination of codec (ETC1S/UASTC) and alpha/no alpha.
+	// basis_compressor::get_dfd() copies the matching template and patches a few of its dwords before it is written.
+	// The first little-endian dword of each template equals its total byte length (0x2C = 44, 0x3C = 60).
+	static uint8_t g_ktx2_etc1s_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
+	static uint8_t g_ktx2_etc1s_alpha_dfd[60] = { 0x3C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x38,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF,0x40,0x0,0x3F,0xF,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
+	static uint8_t g_ktx2_uastc_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x4,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
+	static uint8_t g_ktx2_uastc_alpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF };
+		
+	// Produces the KTX2 Data Format Descriptor (DFD) for the current encode in `dfd`.
+	//
+	// One of the four g_ktx2_*_dfd templates is copied (chosen by UASTC vs. ETC1S and by whether any source image has
+	// alpha), then three of its little-endian dwords are patched in place:
+	//   dword 3: bits 16-23 receive the sRGB or linear transfer function id,
+	//   dword 5: the low byte is cleared when header.m_supercompression_scheme isn't KTX2_SS_NONE,
+	//   dword 7: in UASTC mode, bits 24-27 receive the UASTC RGBA or RGB channel id.
+	void basis_compressor::get_dfd(uint8_vec &dfd, const basist::ktx2_header &header)
+	{
+		const uint8_t* pTemplate;
+		uint32_t template_size;
+
+		// Pick the template matching the codec and the presence of alpha.
+		if ((m_params.m_uastc) && (m_any_source_image_has_alpha))
+		{
+			pTemplate = g_ktx2_uastc_alpha_dfd;
+			template_size = sizeof(g_ktx2_uastc_alpha_dfd);
+		}
+		else if (m_params.m_uastc)
+		{
+			pTemplate = g_ktx2_uastc_nonalpha_dfd;
+			template_size = sizeof(g_ktx2_uastc_nonalpha_dfd);
+		}
+		else if (m_any_source_image_has_alpha)
+		{
+			pTemplate = g_ktx2_etc1s_alpha_dfd;
+			template_size = sizeof(g_ktx2_etc1s_alpha_dfd);
+		}
+		else
+		{
+			pTemplate = g_ktx2_etc1s_nonalpha_dfd;
+			template_size = sizeof(g_ktx2_etc1s_nonalpha_dfd);
+		}
+
+		assert(template_size >= 44);
+
+		dfd.resize(template_size);
+		memcpy(dfd.data(), pTemplate, template_size);
+
+		// The buffer is not resized again, so these pointers into it stay valid for the rest of the function.
+		uint8_t* const pTransfer_dword = dfd.data() + 3 * sizeof(uint32_t);
+		uint8_t* const pPlane_dword = dfd.data() + 5 * sizeof(uint32_t);
+		uint8_t* const pChannel0_dword = dfd.data() + 7 * sizeof(uint32_t);
+
+		// Replace the transfer function byte (bits 16-23).
+		uint32_t transfer_dword = basisu::read_le_dword(pTransfer_dword);
+		transfer_dword &= ~(0xFF << 16);
+		if (m_params.m_ktx2_srgb_transfer_func)
+			transfer_dword |= (basist::KTX2_KHR_DF_TRANSFER_SRGB << 16);
+		else
+			transfer_dword |= (basist::KTX2_KHR_DF_TRANSFER_LINEAR << 16);
+		basisu::write_le_dword(pTransfer_dword, transfer_dword);
+
+		// With any supercompression scheme active, zero the low byte of dword 5.
+		if (header.m_supercompression_scheme != basist::KTX2_SS_NONE)
+		{
+			const uint32_t cleared_plane_dword = basisu::read_le_dword(pPlane_dword) & ~0xFF;
+			basisu::write_le_dword(pPlane_dword, cleared_plane_dword);
+		}
+
+		// Fix up the DFD channel(s)
+		uint32_t channel0_dword = basisu::read_le_dword(pChannel0_dword);
+		if (m_params.m_uastc)
+		{
+			channel0_dword &= ~(0xF << 24);
+
+			// TODO: Allow the caller to override this
+			if (m_any_source_image_has_alpha)
+				channel0_dword |= (basist::KTX2_DF_CHANNEL_UASTC_RGBA << 24);
+			else
+				channel0_dword |= (basist::KTX2_DF_CHANNEL_UASTC_RGB << 24);
+		}
+		basisu::write_le_dword(pChannel0_dword, channel0_dword);
+	}
+
+	// Assembles the complete KTX2 file for the encoded slices into m_output_ktx2_file.
+	//
+	// Sections are appended in this order: placeholder header, placeholder level index array, DFD, key/value data,
+	// ETC1S supercompression global data (ETC1S mode only), optional mip padding, then the level data from the
+	// smallest mip to the largest. The header and level index array are patched in at the end, once every offset
+	// and length is known.
+	//
+	// Returns false if the UASTC supercompression setting is neither KTX2_SS_NONE nor KTX2_SS_ZSTANDARD, if Zstandard
+	// compression fails, or if a key/value entry is malformed (key shorter than 2 bytes, key not null terminated, or
+	// key+value length not below UINT32_MAX). Returns true on success.
+	bool basis_compressor::create_ktx2_file()
+	{
+		if (m_params.m_uastc)
+		{
+			if ((m_params.m_ktx2_uastc_supercompression != basist::KTX2_SS_NONE) && (m_params.m_ktx2_uastc_supercompression != basist::KTX2_SS_ZSTANDARD))
+				return false;
+		}
+
+		const basisu_backend_output& backend_output = m_backend.get_output();
+
+		// Determine the width/height, number of array layers, mipmap levels, and the number of faces (1 for 2D, 6 for cubemap).
+		// This does not support 1D or 3D.
+		uint32_t base_width = 0, base_height = 0, total_layers = 0, total_levels = 0, total_faces = 1;
+				
+		for (uint32_t i = 0; i < m_slice_descs.size(); i++)
+		{
+			if ((m_slice_descs[i].m_mip_index == 0) && (!base_width))
+			{
+				base_width = m_slice_descs[i].m_orig_width;
+				base_height = m_slice_descs[i].m_orig_height;
+			}
+
+			total_layers = maximum<uint32_t>(total_layers, m_slice_descs[i].m_source_file_index + 1);
+
+			// The level count is taken from source file 0 only.
+			if (!m_slice_descs[i].m_source_file_index)
+				total_levels = maximum<uint32_t>(total_levels, m_slice_descs[i].m_mip_index + 1);
+		}
+
+		if (m_params.m_tex_type == basist::cBASISTexTypeCubemapArray)
+		{
+			// For cubemap arrays every group of 6 source files forms one layer with 6 faces.
+			assert((total_layers % 6) == 0);
+			
+			total_layers /= 6;
+			assert(total_layers >= 1);
+
+			total_faces = 6;
+		}
+
+		basist::ktx2_header header;
+		memset(&header, 0, sizeof(header));
+
+		memcpy(header.m_identifier, basist::g_ktx2_file_identifier, sizeof(basist::g_ktx2_file_identifier));
+		header.m_pixel_width = base_width;
+		header.m_pixel_height = base_height;
+		header.m_face_count = total_faces;
+		header.m_vk_format = basist::KTX2_VK_FORMAT_UNDEFINED;
+		header.m_type_size = 1;
+		header.m_level_count = total_levels;
+		// A layer count of 0 is written when there is only a single layer.
+		header.m_layer_count = (total_layers > 1) ? total_layers : 0;
+
+		if (m_params.m_uastc)
+		{
+			switch (m_params.m_ktx2_uastc_supercompression)
+			{
+			case basist::KTX2_SS_NONE:
+			{
+				header.m_supercompression_scheme = basist::KTX2_SS_NONE;
+				break;
+			}
+			case basist::KTX2_SS_ZSTANDARD:
+			{
+				// Without Zstandard support compiled in, fall back to writing uncompressed level data.
+#if BASISD_SUPPORT_KTX2_ZSTD
+				header.m_supercompression_scheme = basist::KTX2_SS_ZSTANDARD;
+#else
+				header.m_supercompression_scheme = basist::KTX2_SS_NONE;
+#endif
+				break;
+			}
+			default: assert(0); return false;
+			}
+		}
+
+		// level_data_bytes holds the raw slice data of each mip level; compressed_level_data_bytes holds what is actually written.
+		basisu::vector<uint8_vec> level_data_bytes(total_levels);
+		basisu::vector<uint8_vec> compressed_level_data_bytes(total_levels);
+		// Byte offset of each slice inside its (uncompressed) mip level.
+		uint_vec slice_level_offsets(m_slice_descs.size());
+
+		// This will append the texture data in the correct order (for each level: layer, then face).
+		for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+		{
+			const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index];
+
+			slice_level_offsets[slice_index] = level_data_bytes[slice_desc.m_mip_index].size();
+
+			if (m_params.m_uastc)
+				append_vector(level_data_bytes[slice_desc.m_mip_index], m_uastc_backend_output.m_slice_image_data[slice_index]);
+			else
+				append_vector(level_data_bytes[slice_desc.m_mip_index], backend_output.m_slice_image_data[slice_index]);
+		}
+
+		// UASTC supercompression
+		if ((m_params.m_uastc) && (header.m_supercompression_scheme == basist::KTX2_SS_ZSTANDARD))
+		{
+#if BASISD_SUPPORT_KTX2_ZSTD
+			for (uint32_t level_index = 0; level_index < total_levels; level_index++)
+			{
+				// Size the destination for the worst case, compress, then shrink to the actual compressed size.
+				compressed_level_data_bytes[level_index].resize(ZSTD_compressBound(level_data_bytes[level_index].size()));
+
+				size_t result = ZSTD_compress(compressed_level_data_bytes[level_index].data(), compressed_level_data_bytes[level_index].size(),
+					level_data_bytes[level_index].data(), level_data_bytes[level_index].size(),
+					m_params.m_ktx2_zstd_supercompression_level);
+
+				if (ZSTD_isError(result))
+					return false;
+
+				compressed_level_data_bytes[level_index].resize(result);
+			}
+#else
+			// Can't get here
+			assert(0);
+			return false;
+#endif
+		}
+		else
+		{
+			// No supercompression
+			compressed_level_data_bytes = level_data_bytes;
+		}
+				
+		uint8_vec etc1s_global_data;
+
+		// Create ETC1S global supercompressed data
+		if (!m_params.m_uastc)
+		{
+			basist::ktx2_etc1s_global_data_header etc1s_global_data_header;
+			clear_obj(etc1s_global_data_header);
+
+			etc1s_global_data_header.m_endpoint_count = backend_output.m_num_endpoints;
+			etc1s_global_data_header.m_selector_count = backend_output.m_num_selectors;
+			etc1s_global_data_header.m_endpoints_byte_length = backend_output.m_endpoint_palette.size();
+			etc1s_global_data_header.m_selectors_byte_length = backend_output.m_selector_palette.size();
+			etc1s_global_data_header.m_tables_byte_length = backend_output.m_slice_image_tables.size();
+
+			// One image descriptor per (level, layer, face); RGB and alpha slices of the same image share a descriptor.
+			basisu::vector<basist::ktx2_etc1s_image_desc> etc1s_image_descs(total_levels * total_layers * total_faces);
+			memset(etc1s_image_descs.data(), 0, etc1s_image_descs.size_in_bytes());
+
+			for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++)
+			{
+				const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index];
+
+				const uint32_t level_index = slice_desc.m_mip_index;
+				uint32_t layer_index = slice_desc.m_source_file_index;
+				uint32_t face_index = 0;
+
+				if (m_params.m_tex_type == basist::cBASISTexTypeCubemapArray)
+				{
+					face_index = layer_index % 6;
+					layer_index /= 6;
+				}
+
+				const uint32_t etc1s_image_index = level_index * (total_layers * total_faces) + layer_index * total_faces + face_index;
+
+				if (slice_desc.m_alpha)
+				{
+					etc1s_image_descs[etc1s_image_index].m_alpha_slice_byte_length = backend_output.m_slice_image_data[slice_index].size();
+					etc1s_image_descs[etc1s_image_index].m_alpha_slice_byte_offset = slice_level_offsets[slice_index];
+				}
+				else
+				{
+					// For video, mark every non-I-frame slice as a P-frame.
+					if (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames)
+						etc1s_image_descs[etc1s_image_index].m_image_flags = !slice_desc.m_iframe ? basist::KTX2_IMAGE_IS_P_FRAME : 0;
+
+					etc1s_image_descs[etc1s_image_index].m_rgb_slice_byte_length = backend_output.m_slice_image_data[slice_index].size();
+					etc1s_image_descs[etc1s_image_index].m_rgb_slice_byte_offset = slice_level_offsets[slice_index];
+				}
+			} // slice_index
+
+			// Global data layout: header, image descriptors, endpoint palette, selector palette, slice image tables.
+			append_vector(etc1s_global_data, (const uint8_t*)&etc1s_global_data_header, sizeof(etc1s_global_data_header));
+			append_vector(etc1s_global_data, (const uint8_t*)etc1s_image_descs.data(), etc1s_image_descs.size_in_bytes());
+			append_vector(etc1s_global_data, backend_output.m_endpoint_palette);
+			append_vector(etc1s_global_data, backend_output.m_selector_palette);
+			append_vector(etc1s_global_data, backend_output.m_slice_image_tables);
+			
+			header.m_supercompression_scheme = basist::KTX2_SS_BASISLZ;
+		}
+
+		// Key values
+		// Start from the caller's key/values and add a "KTXwriter" entry identifying this library version.
+		basist::ktx2_transcoder::key_value_vec key_values(m_params.m_ktx2_key_values);
+		key_values.enlarge(1);
+		
+		const char* pKTXwriter = "KTXwriter";
+		key_values.back().m_key.resize(strlen(pKTXwriter) + 1);
+		memcpy(key_values.back().m_key.data(), pKTXwriter, strlen(pKTXwriter) + 1);
+
+		char writer_id[128];
+#ifdef _MSC_VER
+		sprintf_s(writer_id, sizeof(writer_id), "Basis Universal %s", BASISU_LIB_VERSION_STRING);
+#else
+		snprintf(writer_id, sizeof(writer_id), "Basis Universal %s", BASISU_LIB_VERSION_STRING);
+#endif
+		key_values.back().m_value.resize(strlen(writer_id) + 1);
+		memcpy(key_values.back().m_value.data(), writer_id, strlen(writer_id) + 1);
+
+		key_values.sort();
+
+#if BASISU_DISABLE_KTX2_KEY_VALUES
+		// HACK HACK - Clear the key values array, which causes no key values to be written (triggering the ktx2check validator bug).
+		key_values.clear();
+#endif
+
+		uint8_vec key_value_data;
+
+		// DFD
+		// Note: get_dfd() reads header.m_supercompression_scheme, which has its final value by this point.
+		uint8_vec dfd;
+		get_dfd(dfd, header);
+
+		// File offset at which the key/value data will start: header, then level index array, then DFD.
+		const uint32_t kvd_file_offset = sizeof(header) + sizeof(basist::ktx2_level_index) * total_levels + dfd.size();
+
+		// Serialize the key/value data. Without supercompression a second pass may be needed: if the data doesn't end
+		// on a 16 byte file boundary after the first pass, a dummy key is added and everything is serialized again.
+		for (uint32_t pass = 0; pass < 2; pass++)
+		{
+			for (uint32_t i = 0; i < key_values.size(); i++)
+			{
+				if (key_values[i].m_key.size() < 2)
+					return false;
+
+				if (key_values[i].m_key.back() != 0)
+					return false;
+
+				const uint64_t total_len = (uint64_t)key_values[i].m_key.size() + (uint64_t)key_values[i].m_value.size();
+				if (total_len >= UINT32_MAX)
+					return false;
+
+				// Each entry is a 4 byte length, the key bytes, the value bytes, then zero padding to a multiple of 4 bytes.
+				packed_uint<4> le_len((uint32_t)total_len);
+				append_vector(key_value_data, (const uint8_t*)&le_len, sizeof(le_len));
+
+				append_vector(key_value_data, key_values[i].m_key);
+				append_vector(key_value_data, key_values[i].m_value);
+
+				const uint32_t ofs = key_value_data.size() & 3;
+				const uint32_t padding = (4 - ofs) & 3;
+				for (uint32_t p = 0; p < padding; p++)
+					key_value_data.push_back(0);
+			}
+
+			if (header.m_supercompression_scheme != basist::KTX2_SS_NONE)
+				break;
+
+#if BASISU_DISABLE_KTX2_ALIGNMENT_WORKAROUND
+			break;
+#endif
+			
+			// Hack to ensure the KVD block ends on a 16 byte boundary, because we have no other official way of aligning the data.
+			uint32_t kvd_end_file_offset = kvd_file_offset + key_value_data.size();
+			uint32_t bytes_needed_to_pad = (16 - (kvd_end_file_offset & 15)) & 15;
+			if (!bytes_needed_to_pad)
+			{
+				// We're good. No need to add a dummy key.
+				break;
+			}
+
+			assert(!pass);
+			if (pass)
+				return false;
+
+			// A dummy entry needs at least 6 bytes (4 byte length, 1 key terminator, 1 value byte), so go to the next 16 byte boundary if the gap is smaller.
+			if (bytes_needed_to_pad < 6)
+				bytes_needed_to_pad += 16;
+
+			printf("WARNING: Due to a KTX2 validator bug related to mipPadding, we must insert a dummy key into the KTX2 file of %u bytes\n", bytes_needed_to_pad);
+			
+			// We're not good - need to add a dummy key large enough to force file alignment so the mip level array gets aligned. 
+			// We can't just add some bytes before the mip level array because ktx2check will see that as extra data in the file that shouldn't be there in ktxValidator::validateDataSize().
+			key_values.enlarge(1);
+			for (uint32_t i = 0; i < (bytes_needed_to_pad - 4 - 1 - 1); i++)
+				key_values.back().m_key.push_back(127);
+			
+			key_values.back().m_key.push_back(0);
+
+			key_values.back().m_value.push_back(0);
+
+			key_values.sort();
+
+			key_value_data.resize(0);
+			
+			// Try again
+		}
+
+		basisu::vector<basist::ktx2_level_index> level_index_array(total_levels);
+		memset(level_index_array.data(), 0, level_index_array.size_in_bytes());
+				
+		m_output_ktx2_file.clear();
+		m_output_ktx2_file.reserve(m_output_basis_file.size());
+
+		// Dummy header
+		m_output_ktx2_file.resize(sizeof(header));
+
+		// Level index array
+		// (Zero filled for now; the final values are copied over it at the end of this function.)
+		append_vector(m_output_ktx2_file, (const uint8_t*)level_index_array.data(), level_index_array.size_in_bytes());
+				
+		// DFD
+		const uint8_t* pDFD = dfd.data();
+		uint32_t dfd_len = dfd.size();
+
+		header.m_dfd_byte_offset = m_output_ktx2_file.size();
+		header.m_dfd_byte_length = dfd_len;
+		append_vector(m_output_ktx2_file, pDFD, dfd_len);
+
+		// Key value data
+		if (key_value_data.size())
+		{
+			assert(kvd_file_offset == m_output_ktx2_file.size());
+
+			header.m_kvd_byte_offset = m_output_ktx2_file.size();
+			header.m_kvd_byte_length = key_value_data.size();
+			append_vector(m_output_ktx2_file, key_value_data);
+		}
+
+		// Global Supercompressed Data
+		if (etc1s_global_data.size())
+		{
+			// Zero pad so the global data starts on an 8 byte boundary.
+			uint32_t ofs = m_output_ktx2_file.size() & 7;
+			uint32_t padding = (8 - ofs) & 7;
+			for (uint32_t i = 0; i < padding; i++)
+				m_output_ktx2_file.push_back(0);
+
+			header.m_sgd_byte_length = etc1s_global_data.size();
+			header.m_sgd_byte_offset = m_output_ktx2_file.size();
+
+			append_vector(m_output_ktx2_file, etc1s_global_data);
+		}
+
+		// mipPadding
+		if (header.m_supercompression_scheme == basist::KTX2_SS_NONE)
+		{
+			// We currently can't do this or the validator will incorrectly give an error.
+			uint32_t ofs = m_output_ktx2_file.size() & 15;
+			uint32_t padding = (16 - ofs) & 15;
+
+			// Make sure we're always aligned here (due to a validator bug).
+			if (padding)
+			{
+				printf("Warning: KTX2 mip level data is not 16-byte aligned. This may trigger a ktx2check validation bug. Writing %u bytes of mipPadding.\n", padding);
+			}
+
+			for (uint32_t i = 0; i < padding; i++)
+				m_output_ktx2_file.push_back(0);
+		}
+
+		// Level data - write the smallest mipmap first.
+		for (int level = total_levels - 1; level >= 0; level--)
+		{
+			level_index_array[level].m_byte_length = compressed_level_data_bytes[level].size();
+			// The uncompressed length is only recorded in UASTC mode; it stays 0 for ETC1S.
+			if (m_params.m_uastc)
+				level_index_array[level].m_uncompressed_byte_length = level_data_bytes[level].size();
+
+			level_index_array[level].m_byte_offset = m_output_ktx2_file.size();
+			append_vector(m_output_ktx2_file, compressed_level_data_bytes[level]);
+		}
+		
+		// Write final header
+		memcpy(m_output_ktx2_file.data(), &header, sizeof(header));
+
+		// Write final level index array
+		memcpy(m_output_ktx2_file.data() + sizeof(header), level_index_array.data(), level_index_array.size_in_bytes());
+
+		debug_printf("Total .ktx2 output file size: %u\n", m_output_ktx2_file.size());
+
+		return true;
+	}
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_comp.h b/thirdparty/basis_universal/encoder/basisu_comp.h
new file mode 100644
index 0000000000..2c3af968f7
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_comp.h
@@ -0,0 +1,555 @@
+// basisu_comp.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_frontend.h"
+#include "basisu_backend.h"
+#include "basisu_basis_file.h"
+#include "../transcoder/basisu_global_selector_palette.h"
+#include "../transcoder/basisu_transcoder.h"
+#include "basisu_uastc_enc.h"
+
+#define BASISU_LIB_VERSION 115
+#define BASISU_LIB_VERSION_STRING "1.15"
+
+#ifndef BASISD_SUPPORT_KTX2
+	#error BASISD_SUPPORT_KTX2 is undefined
+#endif
+#ifndef BASISD_SUPPORT_KTX2_ZSTD
+	#error BASISD_SUPPORT_KTX2_ZSTD is undefined
+#endif
+
+#if !BASISD_SUPPORT_KTX2
+	#error BASISD_SUPPORT_KTX2 must be enabled when building the encoder. To reduce code size if KTX2 support is not needed, set BASISD_SUPPORT_KTX2_ZSTD to 0
+#endif
+
+namespace basisu
+{
+	const uint32_t BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION = 16384;
+
+	// Allow block's color distance to increase by 1.5 while searching for an alternative nearby endpoint.
+	const float BASISU_DEFAULT_ENDPOINT_RDO_THRESH = 1.5f; // default of basis_compressor_params::m_endpoint_rdo_thresh
+	
+	// Allow block's color distance to increase by 1.25 while searching the selector history buffer for a close enough match.
+	const float BASISU_DEFAULT_SELECTOR_RDO_THRESH = 1.25f; // default of basis_compressor_params::m_selector_rdo_thresh
+
+	const int BASISU_DEFAULT_QUALITY = 128;
+	const float BASISU_DEFAULT_HYBRID_SEL_CB_QUALITY_THRESH = 2.0f; // default of basis_compressor_params::m_hybrid_sel_cb_quality_thresh
+
+	const uint32_t BASISU_MAX_IMAGE_DIMENSION = 16384; // NOTE(review): same value as BASISU_MAX_SUPPORTED_TEXTURE_DIMENSION above - confirm both are needed
+	const uint32_t BASISU_QUALITY_MIN = 1; // lower bound quoted for basis_compressor_params::m_quality_level
+	const uint32_t BASISU_QUALITY_MAX = 255; // upper bound quoted for basis_compressor_params::m_quality_level
+	const uint32_t BASISU_MAX_ENDPOINT_CLUSTERS = basisu_frontend::cMaxEndpointClusters;
+	const uint32_t BASISU_MAX_SELECTOR_CLUSTERS = basisu_frontend::cMaxSelectorClusters;
+
+	const uint32_t BASISU_MAX_SLICES = 0xFFFFFF;
+
+	const int BASISU_RDO_UASTC_DICT_SIZE_DEFAULT = 4096; // 32768;
+	const int BASISU_RDO_UASTC_DICT_SIZE_MIN = 64; // lower clamp of basis_compressor_params::m_rdo_uastc_dict_size
+	const int BASISU_RDO_UASTC_DICT_SIZE_MAX = 65536; // upper clamp of basis_compressor_params::m_rdo_uastc_dict_size
+
+	struct image_stats
+	{
+		image_stats() { clear(); }
+
+		// Empties the filename and zeroes the dimensions and every quality metric.
+		void clear()
+		{
+			m_filename.clear();
+			m_width = m_height = 0;
+
+			// .basis (ETC1S or UASTC) metrics
+			m_basis_rgb_avg_psnr = 0.0f;
+			m_basis_rgba_avg_psnr = 0.0f;
+			m_basis_a_avg_psnr = 0.0f;
+			m_basis_luma_709_psnr = 0.0f;
+			m_basis_luma_601_psnr = 0.0f;
+			m_basis_luma_709_ssim = 0.0f;
+
+			// BC7 metrics
+			m_bc7_rgb_avg_psnr = 0.0f;
+			m_bc7_rgba_avg_psnr = 0.0f;
+			m_bc7_a_avg_psnr = 0.0f;
+			m_bc7_luma_709_psnr = 0.0f;
+			m_bc7_luma_601_psnr = 0.0f;
+			m_bc7_luma_709_ssim = 0.0f;
+
+			// Best-case ETC1S metrics
+			m_best_etc1s_rgb_avg_psnr = 0.0f;
+			m_best_etc1s_luma_709_psnr = 0.0f;
+			m_best_etc1s_luma_601_psnr = 0.0f;
+			m_best_etc1s_luma_709_ssim = 0.0f;
+		}
+
+		std::string m_filename;
+		uint32_t m_width;
+		uint32_t m_height;
+
+		// .basis compressed (ETC1S or UASTC statistics)
+		float m_basis_rgb_avg_psnr;
+		float m_basis_rgba_avg_psnr;
+		float m_basis_a_avg_psnr;
+		float m_basis_luma_709_psnr;
+		float m_basis_luma_601_psnr;
+		float m_basis_luma_709_ssim;
+
+		// BC7 statistics
+		float m_bc7_rgb_avg_psnr;
+		float m_bc7_rgba_avg_psnr;
+		float m_bc7_a_avg_psnr;
+		float m_bc7_luma_709_psnr;
+		float m_bc7_luma_601_psnr;
+		float m_bc7_luma_709_ssim;
+		
+		// Highest achievable quality ETC1S statistics
+		float m_best_etc1s_rgb_avg_psnr;
+		float m_best_etc1s_luma_709_psnr;
+		float m_best_etc1s_luma_601_psnr;
+		float m_best_etc1s_luma_709_ssim;
+	};
+
+	// Boolean setting whose default is the template argument; tracks whether it has been assigned.
+	template<bool def>
+	struct bool_param
+	{
+		bool_param() : m_value(def), m_changed(false) { }
+
+		// Restores the default value and drops the "changed" mark.
+		void clear()
+		{
+			m_changed = false;
+			m_value = def;
+		}
+
+		// Reads the current value.
+		operator bool() const { return m_value; }
+
+		// Stores v, marks the setting as changed and yields the stored value.
+		bool operator= (bool v)
+		{
+			m_changed = true;
+			m_value = v;
+			return v;
+		}
+
+		// Query / override the "changed" mark.
+		bool was_changed() const { return m_changed; }
+		void set_changed(bool flag) { m_changed = flag; }
+
+		// Current value.
+		bool m_value;
+		// Set by operator=, reset by clear(), overridable through set_changed().
+		bool m_changed;
+	};
+
+	// Range-limited setting of type T that tracks whether it has been assigned.
+	template<typename T>
+	struct param
+	{
+		param(T def, T min_v, T max_v) :
+			m_value(def), m_def(def),
+			m_min(min_v), m_max(max_v),
+			m_changed(false)
+		{
+		}
+
+		// Restores the constructor default (taken as-is, not clamped) and drops the "changed" mark.
+		void clear()
+		{
+			m_value = m_def;
+			m_changed = false;
+		}
+
+		// Reads the current value.
+		operator T() const { return m_value; }
+
+		// Stores v clamped to [m_min, m_max], marks the setting as changed and yields the stored value.
+		T operator= (T v)
+		{
+			m_value = clamp<T>(v, m_min, m_max);
+			m_changed = true;
+			return m_value;
+		}
+
+		// Scales the current value by v and marks the setting as changed. Unlike operator=, no clamping is applied.
+		T operator *= (T v)
+		{
+			m_value *= v;
+			m_changed = true;
+			return m_value;
+		}
+
+		bool was_changed() const { return m_changed; }
+		void set_changed(bool flag) { m_changed = flag; }
+
+		T m_value;
+		T m_def;
+		T m_min;
+		T m_max;
+		bool m_changed;
+	};
+
+	struct basis_compressor_params
+	{
+		basis_compressor_params() : // param<T> members are constructed as (default, min, max); the body then calls clear()
+			m_pSel_codebook(NULL),
+			m_compression_level((int)BASISU_DEFAULT_COMPRESSION_LEVEL, 0, (int)BASISU_MAX_COMPRESSION_LEVEL),
+			m_selector_rdo_thresh(BASISU_DEFAULT_SELECTOR_RDO_THRESH, 0.0f, 1e+10f),
+			m_endpoint_rdo_thresh(BASISU_DEFAULT_ENDPOINT_RDO_THRESH, 0.0f, 1e+10f),
+			m_hybrid_sel_cb_quality_thresh(BASISU_DEFAULT_HYBRID_SEL_CB_QUALITY_THRESH, 0.0f, 1e+10f),
+			m_global_pal_bits(8, 0, ETC1_GLOBAL_SELECTOR_CODEBOOK_MAX_PAL_BITS),
+			m_global_mod_bits(8, 0, basist::etc1_global_palette_entry_modifier::cTotalBits),
+			m_mip_scale(1.0f, .000125f, 4.0f),
+			m_mip_smallest_dimension(1, 1, 16384),
+			m_max_endpoint_clusters(512), // replaced by 0 when clear() runs in the body
+			m_max_selector_clusters(512), // replaced by 0 when clear() runs in the body
+			m_quality_level(-1),
+			m_pack_uastc_flags(cPackUASTCLevelDefault),
+			m_rdo_uastc_quality_scalar(1.0f, 0.001f, 50.0f),
+			m_rdo_uastc_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX),
+			m_rdo_uastc_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f),
+			m_rdo_uastc_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f),
+			m_rdo_uastc_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f),
+			m_rdo_uastc_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f),
+			m_resample_width(0, 1, 16384), // default 0 is below the minimum; param<T> stores defaults without clamping (NOTE(review): presumably 0 means "no resampling" - confirm)
+			m_resample_height(0, 1, 16384), // default 0 is below the minimum, as for m_resample_width
+			m_resample_factor(0.0f, .00125f, 100.0f), // default 0.0f is below the minimum, as for m_resample_width
+			m_ktx2_uastc_supercompression(basist::KTX2_SS_NONE),
+			m_ktx2_zstd_supercompression_level(6, INT_MIN, INT_MAX),
+			m_pJob_pool(nullptr)
+		{
+			clear(); // also assigns the members not listed above (e.g. m_pGlobal_codebooks, m_tex_type, m_swizzle)
+		}
+
+		void clear() // Resets every setting to its default and drops every "changed" mark.
+		{
+			m_pSel_codebook = NULL;
+
+			m_uastc.clear();
+			m_status_output.clear();
+
+			m_source_filenames.clear();
+			m_source_alpha_filenames.clear();
+
+			m_source_images.clear();
+			m_source_mipmap_images.clear();
+
+			m_out_filename.clear();
+
+			m_y_flip.clear();
+			m_debug.clear();
+			m_validate.clear();
+			m_debug_images.clear();
+			m_global_sel_pal.clear();
+			m_auto_global_sel_pal.clear();
+			m_no_hybrid_sel_cb.clear();
+			m_perceptual.clear();
+			m_no_selector_rdo.clear();
+			m_selector_rdo_thresh.clear();
+			m_read_source_images.clear();
+			m_write_output_basis_files.clear();
+			m_compression_level.clear();
+			m_compute_stats.clear();
+			m_check_for_alpha.clear();
+			m_force_alpha.clear();
+			m_multithreading.clear();
+			m_swizzle[0] = 0;
+			m_swizzle[1] = 1;
+			m_swizzle[2] = 2;
+			m_swizzle[3] = 3;
+			m_renormalize.clear();
+			m_hybrid_sel_cb_quality_thresh.clear();
+			m_global_pal_bits.clear();
+			m_global_mod_bits.clear();
+			m_disable_hierarchical_endpoint_codebooks.clear();
+
+			m_no_endpoint_rdo.clear();
+			m_endpoint_rdo_thresh.clear();
+						
+			m_mip_gen.clear();
+			m_mip_scale.clear(); // restores the 1.0f default; assigning here instead would mark the setting as changed
+			m_mip_filter = "kaiser";
+			m_mip_srgb.clear();
+			m_mip_premultiplied.clear();
+			m_mip_renormalize.clear();
+			m_mip_wrapping.clear();
+			m_mip_fast.clear();
+			m_mip_smallest_dimension.clear();
+
+			m_max_endpoint_clusters = 0;
+			m_max_selector_clusters = 0;
+			m_quality_level = -1;
+
+			m_tex_type = basist::cBASISTexType2D;
+			m_userdata0 = 0;
+			m_userdata1 = 0;
+			m_us_per_frame = 0;
+
+			m_pack_uastc_flags = cPackUASTCLevelDefault;
+			m_rdo_uastc.clear();
+			m_rdo_uastc_quality_scalar.clear();
+			m_rdo_uastc_dict_size.clear(); // was previously left untouched, so a reused params object kept a stale dictionary size
+			m_rdo_uastc_max_smooth_block_error_scale.clear();
+			m_rdo_uastc_smooth_block_max_std_dev.clear();
+			m_rdo_uastc_max_allowed_rms_increase_ratio.clear();
+			m_rdo_uastc_skip_block_rms_thresh.clear();
+			m_rdo_uastc_favor_simpler_modes_in_rdo_mode.clear();
+			m_rdo_uastc_multithreading.clear();
+
+			m_resample_width.clear();
+			m_resample_height.clear();
+			m_resample_factor.clear();
+
+			m_pGlobal_codebooks = nullptr;
+
+			m_create_ktx2_file.clear();
+			m_ktx2_uastc_supercompression = basist::KTX2_SS_NONE;
+			m_ktx2_key_values.clear();
+			m_ktx2_zstd_supercompression_level.clear();
+			m_ktx2_srgb_transfer_func.clear();
+
+			m_pJob_pool = nullptr;
+		}
+				
+		// Pointer to the global selector codebook, or nullptr to not use a global selector codebook
+		const basist::etc1_global_selector_codebook *m_pSel_codebook;
+
+		// True to generate UASTC .basis file data, otherwise ETC1S.
+		bool_param<false> m_uastc;
+
+		// If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG images to read. 
+		// Otherwise, the compressor processes the images in m_source_images.
+		basisu::vector<std::string> m_source_filenames;
+		basisu::vector<std::string> m_source_alpha_filenames;
+		
+		basisu::vector<image> m_source_images;
+		
+		// Stores mipmaps starting from level 1. Level 0 is still stored in m_source_images, as usual.
+		// If m_source_mipmap_images isn't empty, automatic mipmap generation isn't done. m_source_mipmap_images.size() MUST equal m_source_images.size() or the compressor returns an error.
+		// The compressor applies the user-provided swizzling (in m_swizzle) to these images.
+		basisu::vector< basisu::vector<image> > m_source_mipmap_images;
+						
+		// Filename of the output basis file
+		std::string m_out_filename;
+
+		// The params are done this way so we can detect when the user has explicitly changed them.
+
+		// Flip images across Y axis
+		bool_param<false> m_y_flip;
+
+		// If true, the compressor will print basis status to stdout during compression.
+		bool_param<true> m_status_output;
+		
+		// Output debug information during compression
+		bool_param<false> m_debug;
+		bool_param<false> m_validate;
+		
+		// m_debug_images is pretty slow
+		bool_param<false> m_debug_images;
+
+		// Compression level, from 0 to BASISU_MAX_COMPRESSION_LEVEL (higher is slower)
+		param<int> m_compression_level;
+
+		bool_param<false> m_global_sel_pal;
+		bool_param<false> m_auto_global_sel_pal;
+
+		// Frontend/backend codec parameters
+		bool_param<false> m_no_hybrid_sel_cb;
+		
+		// Use perceptual sRGB colorspace metrics instead of linear
+		bool_param<true> m_perceptual;
+
+		// Disable selector RDO, for faster compression but larger files
+		bool_param<false> m_no_selector_rdo;
+		param<float> m_selector_rdo_thresh;
+
+		bool_param<false> m_no_endpoint_rdo;
+		param<float> m_endpoint_rdo_thresh;
+
+		// Read source images from m_source_filenames/m_source_alpha_filenames
+		bool_param<false> m_read_source_images;
+
+		// Write the output basis file to disk using m_out_filename
+		bool_param<false> m_write_output_basis_files;
+								
+		// Compute and display image metrics 
+		bool_param<false> m_compute_stats;
+		
+		// Check to see if any input image has an alpha channel, if so then the output basis file will have alpha channels
+		bool_param<true> m_check_for_alpha;
+		
+		// Always put alpha slices in the output basis file, even when the input doesn't have alpha
+		bool_param<false> m_force_alpha; 
+		bool_param<true> m_multithreading;
+		
+		// User-provided channel swizzle applied to the source images; clear() resets it to the identity mapping {0, 1, 2, 3}.
+		char m_swizzle[4];
+
+		bool_param<false> m_renormalize;
+
+		bool_param<false> m_disable_hierarchical_endpoint_codebooks;
+
+		// Global/hybrid selector codebook parameters
+		param<float> m_hybrid_sel_cb_quality_thresh;
+		param<int> m_global_pal_bits;
+		param<int> m_global_mod_bits;
+		
+		// mipmap generation parameters
+		bool_param<false> m_mip_gen;
+		param<float> m_mip_scale;
+		std::string m_mip_filter;
+		bool_param<false> m_mip_srgb;
+		bool_param<true> m_mip_premultiplied; // not currently supported
+		bool_param<false> m_mip_renormalize; 
+		bool_param<true> m_mip_wrapping;
+		bool_param<true> m_mip_fast;
+		param<int> m_mip_smallest_dimension;
+				
+		// Codebook size (quality) control. 
+		// If m_quality_level != -1, it controls the quality level. It ranges from [0,255] or [BASISU_QUALITY_MIN, BASISU_QUALITY_MAX].
+		// Otherwise m_max_endpoint_clusters/m_max_selector_clusters controls the codebook sizes directly.
+		uint32_t m_max_endpoint_clusters;
+		uint32_t m_max_selector_clusters;
+		int m_quality_level;
+		
+		// m_tex_type, m_userdata0, m_userdata1, m_us_per_frame - These fields go directly into the Basis file header.
+		basist::basis_texture_type m_tex_type;
+		uint32_t m_userdata0;
+		uint32_t m_userdata1;
+		uint32_t m_us_per_frame;
+
+		// cPackUASTCLevelDefault, etc.
+		uint32_t m_pack_uastc_flags;
+		bool_param<false> m_rdo_uastc;
+		param<float> m_rdo_uastc_quality_scalar;
+		param<int> m_rdo_uastc_dict_size;
+		param<float> m_rdo_uastc_max_smooth_block_error_scale;
+		param<float> m_rdo_uastc_smooth_block_max_std_dev;
+		param<float> m_rdo_uastc_max_allowed_rms_increase_ratio;
+		param<float> m_rdo_uastc_skip_block_rms_thresh;
+		bool_param<true> m_rdo_uastc_favor_simpler_modes_in_rdo_mode;
+		bool_param<true> m_rdo_uastc_multithreading;
+
+		param<int> m_resample_width;
+		param<int> m_resample_height;
+		param<float> m_resample_factor;
+		const basist::basisu_lowlevel_etc1s_transcoder *m_pGlobal_codebooks;
+
+		// KTX2 specific parameters.
+		// Internally, the compressor always creates a .basis file, then losslessly converts it to KTX2.
+		bool_param<false> m_create_ktx2_file;
+		basist::ktx2_supercompression m_ktx2_uastc_supercompression;
+		basist::ktx2_transcoder::key_value_vec m_ktx2_key_values;
+		param<int> m_ktx2_zstd_supercompression_level;
+		bool_param<false> m_ktx2_srgb_transfer_func;
+
+		job_pool *m_pJob_pool;
+	};
+	
+	class basis_compressor
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(basis_compressor);
+
+	public:
+		basis_compressor();
+
+		bool init(const basis_compressor_params &params);
+		
+		enum error_code
+		{
+			cECSuccess = 0,
+			cECFailedReadingSourceImages,
+			cECFailedValidating,
+			cECFailedEncodeUASTC,
+			cECFailedFrontEnd,
+			cECFailedFontendExtract,
+			cECFailedBackend,
+			cECFailedCreateBasisFile,
+			cECFailedWritingOutput,
+			cECFailedUASTCRDOPostProcess,
+			cECFailedCreateKTX2File
+		};
+
+		error_code process();
+
+		// The output .basis file will always be valid if process() succeeded.
+		const uint8_vec &get_output_basis_file() const { return m_output_basis_file; }
+		
+		// The output .ktx2 file will only be valid if m_create_ktx2_file was true and process() succeeded.
+		const uint8_vec& get_output_ktx2_file() const { return m_output_ktx2_file; }
+
+		const basisu::vector<image_stats> &get_stats() const { return m_stats; }
+
+		uint32_t get_basis_file_size() const { return m_basis_file_size; }
+		double get_basis_bits_per_texel() const { return m_basis_bits_per_texel; }
+		
+		bool get_any_source_image_has_alpha() const { return m_any_source_image_has_alpha; }
+								
+	private:
+		basis_compressor_params m_params;
+		
+		basisu::vector<image> m_slice_images;
+
+		basisu::vector<image_stats> m_stats;
+
+		uint32_t m_basis_file_size;
+		double m_basis_bits_per_texel;
+						
+		basisu_backend_slice_desc_vec m_slice_descs;
+
+		uint32_t m_total_blocks;
+		bool m_auto_global_sel_pal;
+
+		basisu_frontend m_frontend;
+		pixel_block_vec m_source_blocks;
+
+		basisu::vector<gpu_image> m_frontend_output_textures;
+
+		basisu::vector<gpu_image> m_best_etc1s_images;
+		basisu::vector<image> m_best_etc1s_images_unpacked;
+
+		basisu_backend m_backend;
+
+		basisu_file m_basis_file;
+
+		basisu::vector<gpu_image> m_decoded_output_textures;
+		basisu::vector<image> m_decoded_output_textures_unpacked;
+		basisu::vector<gpu_image> m_decoded_output_textures_bc7;
+		basisu::vector<image> m_decoded_output_textures_unpacked_bc7;
+
+		uint8_vec m_output_basis_file;
+		uint8_vec m_output_ktx2_file;
+		
+		basisu::vector<gpu_image> m_uastc_slice_textures;
+		basisu_backend_output m_uastc_backend_output;
+
+		bool m_any_source_image_has_alpha;
+
+		bool read_source_images();
+		bool extract_source_blocks();
+		bool process_frontend();
+		bool extract_frontend_texture_data();
+		bool process_backend();
+		bool create_basis_file_and_transcode();
+		bool write_output_files_and_compute_stats();
+		error_code encode_slices_to_uastc();
+		bool generate_mipmaps(const image &img, basisu::vector<image> &mips, bool has_alpha);
+		bool validate_texture_type_constraints();
+		bool validate_ktx2_constraints();
+		void get_dfd(uint8_vec& dfd, const basist::ktx2_header& hdr);
+		bool create_ktx2_file();
+	};
+
+} // namespace basisu
+
diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp
new file mode 100644
index 0000000000..f02fb62c11
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp
@@ -0,0 +1,2139 @@
+// basisu_enc.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_enc.h"
+#include "lodepng.h"
+#include "basisu_resampler.h"
+#include "basisu_resampler_filters.h"
+#include "basisu_etc.h"
+#include "../transcoder/basisu_transcoder.h"
+#include "basisu_bc7enc.h"
+#include "apg_bmp.h"
+#include "jpgd.h"
+#include <vector>
+
+#if defined(_WIN32)
+// For QueryPerformanceCounter/QueryPerformanceFrequency
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
+namespace basisu
+{
+	uint64_t interval_timer::g_init_ticks, interval_timer::g_freq; // counter value latched by init() / counter frequency filled in by query_counter_frequency()
+	double interval_timer::g_timer_freq; // 1 / g_freq (seconds per tick); stays 0 until interval_timer::init() has run
+#if BASISU_SUPPORT_SSE
+	bool g_cpu_supports_sse41; // NOTE(review): presumably written by detect_sse41() - confirm
+#endif
+
+	uint8_t g_hamming_dist[256] = // g_hamming_dist[i] is the number of set bits in the byte value i
+	{
+		0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+		1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
+		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+		2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
+		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+		3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+		4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
+	};
+
+	// This is a Public Domain 8x8 font from here:
+	// https://github.com/dhepper/font8x8/blob/master/font8x8_basic.h
+	const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8] = 
+	{
+	 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},	// U+0020 ( )
+	 { 0x18, 0x3C, 0x3C, 0x18, 0x18, 0x00, 0x18, 0x00},   // U+0021 (!)
+	 { 0x36, 0x36, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0022 (")
+	 { 0x36, 0x36, 0x7F, 0x36, 0x7F, 0x36, 0x36, 0x00},   // U+0023 (#)
+	 { 0x0C, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x0C, 0x00},   // U+0024 ($)
+	 { 0x00, 0x63, 0x33, 0x18, 0x0C, 0x66, 0x63, 0x00},   // U+0025 (%)
+	 { 0x1C, 0x36, 0x1C, 0x6E, 0x3B, 0x33, 0x6E, 0x00},   // U+0026 (&)
+	 { 0x06, 0x06, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0027 (')
+	 { 0x18, 0x0C, 0x06, 0x06, 0x06, 0x0C, 0x18, 0x00},   // U+0028 (()
+	 { 0x06, 0x0C, 0x18, 0x18, 0x18, 0x0C, 0x06, 0x00},   // U+0029 ())
+	 { 0x00, 0x66, 0x3C, 0xFF, 0x3C, 0x66, 0x00, 0x00},   // U+002A (*)
+	 { 0x00, 0x0C, 0x0C, 0x3F, 0x0C, 0x0C, 0x00, 0x00},   // U+002B (+)
+	 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x06},   // U+002C (,)
+	 { 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x00},   // U+002D (-)
+	 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x0C, 0x0C, 0x00},   // U+002E (.)
+	 { 0x60, 0x30, 0x18, 0x0C, 0x06, 0x03, 0x01, 0x00},   // U+002F (/)
+	 { 0x3E, 0x63, 0x73, 0x7B, 0x6F, 0x67, 0x3E, 0x00},   // U+0030 (0)
+	 { 0x0C, 0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x3F, 0x00},   // U+0031 (1)
+	 { 0x1E, 0x33, 0x30, 0x1C, 0x06, 0x33, 0x3F, 0x00},   // U+0032 (2)
+	 { 0x1E, 0x33, 0x30, 0x1C, 0x30, 0x33, 0x1E, 0x00},   // U+0033 (3)
+	 { 0x38, 0x3C, 0x36, 0x33, 0x7F, 0x30, 0x78, 0x00},   // U+0034 (4)
+	 { 0x3F, 0x03, 0x1F, 0x30, 0x30, 0x33, 0x1E, 0x00},   // U+0035 (5)
+	 { 0x1C, 0x06, 0x03, 0x1F, 0x33, 0x33, 0x1E, 0x00},   // U+0036 (6)
+	 { 0x3F, 0x33, 0x30, 0x18, 0x0C, 0x0C, 0x0C, 0x00},   // U+0037 (7)
+	 { 0x1E, 0x33, 0x33, 0x1E, 0x33, 0x33, 0x1E, 0x00},   // U+0038 (8)
+	 { 0x1E, 0x33, 0x33, 0x3E, 0x30, 0x18, 0x0E, 0x00},   // U+0039 (9)
+	 { 0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x00},   // U+003A (:)
+	 { 0x00, 0x0C, 0x0C, 0x00, 0x00, 0x0C, 0x0C, 0x06},   // U+003B (;)
+	 { 0x18, 0x0C, 0x06, 0x03, 0x06, 0x0C, 0x18, 0x00},   // U+003C (<)
+	 { 0x00, 0x00, 0x3F, 0x00, 0x00, 0x3F, 0x00, 0x00},   // U+003D (=)
+	 { 0x06, 0x0C, 0x18, 0x30, 0x18, 0x0C, 0x06, 0x00},   // U+003E (>)
+	 { 0x1E, 0x33, 0x30, 0x18, 0x0C, 0x00, 0x0C, 0x00},   // U+003F (?)
+	 { 0x3E, 0x63, 0x7B, 0x7B, 0x7B, 0x03, 0x1E, 0x00},   // U+0040 (@)
+	 { 0x0C, 0x1E, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x00},   // U+0041 (A)
+	 { 0x3F, 0x66, 0x66, 0x3E, 0x66, 0x66, 0x3F, 0x00},   // U+0042 (B)
+	 { 0x3C, 0x66, 0x03, 0x03, 0x03, 0x66, 0x3C, 0x00},   // U+0043 (C)
+	 { 0x1F, 0x36, 0x66, 0x66, 0x66, 0x36, 0x1F, 0x00},   // U+0044 (D)
+	 { 0x7F, 0x46, 0x16, 0x1E, 0x16, 0x46, 0x7F, 0x00},   // U+0045 (E)
+	 { 0x7F, 0x46, 0x16, 0x1E, 0x16, 0x06, 0x0F, 0x00},   // U+0046 (F)
+	 { 0x3C, 0x66, 0x03, 0x03, 0x73, 0x66, 0x7C, 0x00},   // U+0047 (G)
+	 { 0x33, 0x33, 0x33, 0x3F, 0x33, 0x33, 0x33, 0x00},   // U+0048 (H)
+	 { 0x1E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00},   // U+0049 (I)
+	 { 0x78, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E, 0x00},   // U+004A (J)
+	 { 0x67, 0x66, 0x36, 0x1E, 0x36, 0x66, 0x67, 0x00},   // U+004B (K)
+	 { 0x0F, 0x06, 0x06, 0x06, 0x46, 0x66, 0x7F, 0x00},   // U+004C (L)
+	 { 0x63, 0x77, 0x7F, 0x7F, 0x6B, 0x63, 0x63, 0x00},   // U+004D (M)
+	 { 0x63, 0x67, 0x6F, 0x7B, 0x73, 0x63, 0x63, 0x00},   // U+004E (N)
+	 { 0x1C, 0x36, 0x63, 0x63, 0x63, 0x36, 0x1C, 0x00},   // U+004F (O)
+	 { 0x3F, 0x66, 0x66, 0x3E, 0x06, 0x06, 0x0F, 0x00},   // U+0050 (P)
+	 { 0x1E, 0x33, 0x33, 0x33, 0x3B, 0x1E, 0x38, 0x00},   // U+0051 (Q)
+	 { 0x3F, 0x66, 0x66, 0x3E, 0x36, 0x66, 0x67, 0x00},   // U+0052 (R)
+	 { 0x1E, 0x33, 0x07, 0x0E, 0x38, 0x33, 0x1E, 0x00},   // U+0053 (S)
+	 { 0x3F, 0x2D, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00},   // U+0054 (T)
+	 { 0x33, 0x33, 0x33, 0x33, 0x33, 0x33, 0x3F, 0x00},   // U+0055 (U)
+	 { 0x33, 0x33, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00},   // U+0056 (V)
+	 { 0x63, 0x63, 0x63, 0x6B, 0x7F, 0x77, 0x63, 0x00},   // U+0057 (W)
+	 { 0x63, 0x63, 0x36, 0x1C, 0x1C, 0x36, 0x63, 0x00},   // U+0058 (X)
+	 { 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x0C, 0x1E, 0x00},   // U+0059 (Y)
+	 { 0x7F, 0x63, 0x31, 0x18, 0x4C, 0x66, 0x7F, 0x00},   // U+005A (Z)
+	 { 0x1E, 0x06, 0x06, 0x06, 0x06, 0x06, 0x1E, 0x00},   // U+005B ([)
+	 { 0x03, 0x06, 0x0C, 0x18, 0x30, 0x60, 0x40, 0x00},   // U+005C (\)
+	 { 0x1E, 0x18, 0x18, 0x18, 0x18, 0x18, 0x1E, 0x00},   // U+005D (])
+	 { 0x08, 0x1C, 0x36, 0x63, 0x00, 0x00, 0x00, 0x00},   // U+005E (^)
+	 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF},   // U+005F (_)
+	 { 0x0C, 0x0C, 0x18, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+0060 (`)
+	 { 0x00, 0x00, 0x1E, 0x30, 0x3E, 0x33, 0x6E, 0x00},   // U+0061 (a)
+	 { 0x07, 0x06, 0x06, 0x3E, 0x66, 0x66, 0x3B, 0x00},   // U+0062 (b)
+	 { 0x00, 0x00, 0x1E, 0x33, 0x03, 0x33, 0x1E, 0x00},   // U+0063 (c)
+	 { 0x38, 0x30, 0x30, 0x3e, 0x33, 0x33, 0x6E, 0x00},   // U+0064 (d)
+	 { 0x00, 0x00, 0x1E, 0x33, 0x3f, 0x03, 0x1E, 0x00},   // U+0065 (e)
+	 { 0x1C, 0x36, 0x06, 0x0f, 0x06, 0x06, 0x0F, 0x00},   // U+0066 (f)
+	 { 0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x1F},   // U+0067 (g)
+	 { 0x07, 0x06, 0x36, 0x6E, 0x66, 0x66, 0x67, 0x00},   // U+0068 (h)
+	 { 0x0C, 0x00, 0x0E, 0x0C, 0x0C, 0x0C, 0x1E, 0x00},   // U+0069 (i)
+	 { 0x30, 0x00, 0x30, 0x30, 0x30, 0x33, 0x33, 0x1E},   // U+006A (j)
+	 { 0x07, 0x06, 0x66, 0x36, 0x1E, 0x36, 0x67, 0x00},   // U+006B (k)
+	 { 0x0E, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x1E, 0x00},   // U+006C (l)
+	 { 0x00, 0x00, 0x33, 0x7F, 0x7F, 0x6B, 0x63, 0x00},   // U+006D (m)
+	 { 0x00, 0x00, 0x1F, 0x33, 0x33, 0x33, 0x33, 0x00},   // U+006E (n)
+	 { 0x00, 0x00, 0x1E, 0x33, 0x33, 0x33, 0x1E, 0x00},   // U+006F (o)
+	 { 0x00, 0x00, 0x3B, 0x66, 0x66, 0x3E, 0x06, 0x0F},   // U+0070 (p)
+	 { 0x00, 0x00, 0x6E, 0x33, 0x33, 0x3E, 0x30, 0x78},   // U+0071 (q)
+	 { 0x00, 0x00, 0x3B, 0x6E, 0x66, 0x06, 0x0F, 0x00},   // U+0072 (r)
+	 { 0x00, 0x00, 0x3E, 0x03, 0x1E, 0x30, 0x1F, 0x00},   // U+0073 (s)
+	 { 0x08, 0x0C, 0x3E, 0x0C, 0x0C, 0x2C, 0x18, 0x00},   // U+0074 (t)
+	 { 0x00, 0x00, 0x33, 0x33, 0x33, 0x33, 0x6E, 0x00},   // U+0075 (u)
+	 { 0x00, 0x00, 0x33, 0x33, 0x33, 0x1E, 0x0C, 0x00},   // U+0076 (v)
+	 { 0x00, 0x00, 0x63, 0x6B, 0x7F, 0x7F, 0x36, 0x00},   // U+0077 (w)
+	 { 0x00, 0x00, 0x63, 0x36, 0x1C, 0x36, 0x63, 0x00},   // U+0078 (x)
+	 { 0x00, 0x00, 0x33, 0x33, 0x33, 0x3E, 0x30, 0x1F},   // U+0079 (y)
+	 { 0x00, 0x00, 0x3F, 0x19, 0x0C, 0x26, 0x3F, 0x00},   // U+007A (z)
+	 { 0x38, 0x0C, 0x0C, 0x07, 0x0C, 0x0C, 0x38, 0x00},   // U+007B ({)
+	 { 0x18, 0x18, 0x18, 0x00, 0x18, 0x18, 0x18, 0x00},   // U+007C (|)
+	 { 0x07, 0x0C, 0x0C, 0x38, 0x0C, 0x0C, 0x07, 0x00},   // U+007D (})
+	 { 0x6E, 0x3B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},   // U+007E (~)
+	 { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}    // U+007F
+	};
+			
+	// Encoder library initialization (just call once at startup)
+	void basisu_encoder_init()
+	{
+		detect_sse41(); // NOTE(review): presumably fills in g_cpu_supports_sse41 - confirm
+
+		basist::basisu_transcoder_init(); // the encoder sets up the transcoder library as well
+		pack_etc1_solid_color_init();
+		//uastc_init(); // NOTE(review): disabled, yet the next line states an ordering requirement on it - presumably now done by basisu_transcoder_init(); confirm
+		bc7enc_compress_block_init(); // must be after uastc_init()
+	}
+
+	void error_printf(const char *pFmt, ...)
+	{
+		char msg[2048];
+		// Format into the fixed-size buffer. NOTE(review): per the CRT docs vsprintf_s invokes the invalid-parameter handler instead of truncating - confirm this is intended.
+		va_list va;
+		va_start(va, pFmt);
+#ifdef _WIN32		
+		vsprintf_s(msg, sizeof(msg), pFmt, va);
+#else
+		vsnprintf(msg, sizeof(msg), pFmt, va);
+#endif
+		va_end(va);
+
+		fprintf(stderr, "ERROR: %s", msg);
+	}
+
+#if defined(_WIN32)
+	inline void query_counter(timer_ticks* pTicks) // writes the current performance-counter value to *pTicks
+	{
+		QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*>(pTicks));
+	}
+	inline void query_counter_frequency(timer_ticks* pTicks) // writes the performance-counter frequency to *pTicks
+	{
+		QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*>(pTicks));
+	}
+#elif defined(__APPLE__)
+#include <sys/time.h> // NOTE(review): this include sits inside namespace basisu
+	inline void query_counter(timer_ticks* pTicks) // writes the gettimeofday() time, expressed in microseconds, to *pTicks
+	{
+		struct timeval cur_time;
+		gettimeofday(&cur_time, NULL);
+		*pTicks = static_cast<unsigned long long>(cur_time.tv_sec) * 1000000ULL + static_cast<unsigned long long>(cur_time.tv_usec);
+	}
+	inline void query_counter_frequency(timer_ticks* pTicks)
+	{
+		*pTicks = 1000000; // ticks are microseconds, so one million per second
+	}
+#elif defined(__GNUC__)
+#include <sys/timex.h> // NOTE(review): inside namespace basisu; gettimeofday() is declared by <sys/time.h>, so this presumably relies on an indirect include - confirm
+	inline void query_counter(timer_ticks* pTicks) // writes the gettimeofday() time, expressed in microseconds, to *pTicks
+	{
+		struct timeval cur_time;
+		gettimeofday(&cur_time, NULL);
+		*pTicks = static_cast<unsigned long long>(cur_time.tv_sec) * 1000000ULL + static_cast<unsigned long long>(cur_time.tv_usec);
+	}
+	inline void query_counter_frequency(timer_ticks* pTicks)
+	{
+		*pTicks = 1000000; // ticks are microseconds, so one million per second
+	}
+#else
+#error TODO
+#endif
+				
+	interval_timer::interval_timer() : m_start_time(0), m_stop_time(0), m_started(false), m_stopped(false)
+	{
+		if (g_timer_freq == 0.0) // shared timer state has not been set up yet
+			init();
+	}
+
+	void interval_timer::start() // (re)starts the measurement from the current counter value
+	{
+		query_counter(&m_start_time);
+		m_started = true;
+		m_stopped = false; // a later get_elapsed_secs() samples the counter again until stop() is called
+	}
+
+	void interval_timer::stop() // latches the end timestamp; requires a prior start()
+	{
+		assert(m_started);
+		query_counter(&m_stop_time);
+		m_stopped = true; // get_elapsed_secs() now uses m_stop_time instead of sampling the counter
+	}
+
+	double interval_timer::get_elapsed_secs() const
+	{
+		assert(m_started);
+		if (!m_started)
+			return 0;
+
+		// Use the latched stop time, or sample the counter now while the timer is still running.
+		timer_ticks end_ticks = m_stop_time;
+		if (!m_stopped)
+			query_counter(&end_ticks);
+
+		return (end_ticks - m_start_time) * g_timer_freq;
+	}
+		
+	void interval_timer::init() // One-time setup of the shared timer state; does nothing once g_timer_freq is non-zero.
+	{
+		if (!g_timer_freq)
+		{
+			query_counter_frequency(&g_freq);
+			g_timer_freq = 1.0 / static_cast<double>(g_freq); // seconds per tick, divided in double so the period is not rounded to float precision
+			query_counter(&g_init_ticks); // reference point subtracted by get_ticks()
+		}
+	}
+
+	timer_ticks interval_timer::get_ticks()
+	{
+		if (g_timer_freq == 0.0) // first use: set up the shared timer state
+			init();
+		timer_ticks now;
+		query_counter(&now);
+		return now - g_init_ticks; // ticks counted from the value latched by init()
+	}
+
+	double interval_timer::ticks_to_secs(timer_ticks ticks)
+	{
+		if (g_timer_freq == 0.0) // first use: set up the shared timer state
+			init();
+		return g_timer_freq * ticks; // g_timer_freq is seconds per tick
+	}
+		
+	const uint32_t MAX_32BIT_ALLOC_SIZE = 250000000;
+
+	bool load_bmp(const char* pFilename, image& img) // Reads a 3- or 4-channel .BMP through apg_bmp_read() into img; returns false (after logging) on failure.
+	{
+		int w = 0, h = 0;
+		unsigned int n_chans = 0;
+		unsigned char* pImage_data = apg_bmp_read(pFilename, &w, &h, &n_chans);
+		// Reject failed reads, non-positive dimensions and anything that is not 3 or 4 channels.
+		if ((!pImage_data) || (w <= 0) || (h <= 0) || ((n_chans != 3) && (n_chans != 4)))
+		{
+			error_printf("Failed loading .BMP image \"%s\"!\n", pFilename);
+
+			if (pImage_data)
+				apg_bmp_free(pImage_data);
+						
+			return false;
+		}
+
+		if (sizeof(void *) == sizeof(uint32_t))
+		{
+			// 32-bit builds: size the pixel data in 64 bits, since w * h * n_chans can overflow int/uint32_t and slip past the limit.
+			if (((uint64_t)w * (uint64_t)h * (uint64_t)n_chans) > MAX_32BIT_ALLOC_SIZE)
+			{
+				error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", pFilename, w, h);
+
+				apg_bmp_free(pImage_data);
+
+				return false;
+			}
+		}
+		
+		img.resize(w, h);
+
+		// Expand the packed source pixels to color_rgba; 3-channel data gets an opaque alpha.
+		const uint8_t *pSrc = pImage_data;
+		for (int y = 0; y < h; y++)
+		{
+			color_rgba *pDst = &img(0, y);
+			for (int x = 0; x < w; x++)
+			{
+				pDst->r = pSrc[0];
+				pDst->g = pSrc[1];
+				pDst->b = pSrc[2];
+				pDst->a = (n_chans == 3) ? 255 : pSrc[3];
+
+				pSrc += n_chans;
+				++pDst;
+			}
+		}
+
+		apg_bmp_free(pImage_data);
+
+		return true;
+	}
+		
+	bool load_tga(const char* pFilename, image& img) // Reads a 3- or 4-channel .TGA through read_tga() into img; returns false (after logging) on failure.
+	{
+		int w = 0, h = 0, n_chans = 0;
+		uint8_t* pImage_data = read_tga(pFilename, w, h, n_chans);
+		// Reject failed reads, non-positive dimensions and anything that is not 3 or 4 channels.
+		if ((!pImage_data) || (w <= 0) || (h <= 0) || ((n_chans != 3) && (n_chans != 4)))
+		{
+			error_printf("Failed loading .TGA image \"%s\"!\n", pFilename);
+
+			if (pImage_data)
+				free(pImage_data);
+						
+			return false;
+		}
+
+		if (sizeof(void *) == sizeof(uint32_t))
+		{
+			// 32-bit builds: size the pixel data in 64 bits, since the int product w * h * n_chans can overflow and slip past the limit.
+			if (((uint64_t)w * (uint64_t)h * (uint64_t)n_chans) > MAX_32BIT_ALLOC_SIZE)
+			{
+				error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", pFilename, w, h);
+
+				free(pImage_data);
+
+				return false;
+			}
+		}
+		
+		img.resize(w, h);
+
+		// Expand the packed source pixels to color_rgba; 3-channel data gets an opaque alpha.
+		const uint8_t *pSrc = pImage_data;
+		for (int y = 0; y < h; y++)
+		{
+			color_rgba *pDst = &img(0, y);
+			for (int x = 0; x < w; x++)
+			{
+				pDst->r = pSrc[0];
+				pDst->g = pSrc[1];
+				pDst->b = pSrc[2];
+				pDst->a = (n_chans == 3) ? 255 : pSrc[3];
+
+				pSrc += n_chans;
+				++pDst;
+			}
+		}
+
+		free(pImage_data);
+
+		return true;
+	}
+
+	// Decodes an in-memory PNG file into img as 32-bit RGBA.
+	//   pBuf, buf_size: the complete PNG file data.
+	//   img:            receives the decoded pixels (resized to the PNG's dimensions).
+	//   pFilename:      only used in the "too large" error message; may be null.
+	// Returns false if the buffer is null/empty, the PNG cannot be decoded, or (32-bit builds only)
+	// the decoded image would need MAX_32BIT_ALLOC_SIZE bytes or more.
+	bool load_png(const uint8_t *pBuf, size_t buf_size, image &img, const char *pFilename)
+	{
+		if ((!pBuf) || (!buf_size))
+			return false;
+
+		unsigned err = 0, w = 0, h = 0;
+
+		if (sizeof(void*) == sizeof(uint32_t))
+		{
+			// Inspect the image first on 32-bit builds, to see if the image would require too much memory.
+			lodepng::State state;
+			err = lodepng_inspect(&w, &h, &state, pBuf, buf_size);
+			if ((err != 0) || (!w) || (!h))
+				return false;
+
+			// Compute the size in 64 bits: w * h * 4 can exceed 32 bits, and a wrapped value
+			// would let an oversized image slip past this guard.
+			const uint64_t expected_alloc_size = (uint64_t)w * (uint64_t)h * sizeof(uint32_t);
+
+			// If the file is too large on 32-bit builds then just bail now, to prevent causing a memory exception.
+			if (expected_alloc_size >= MAX_32BIT_ALLOC_SIZE)
+			{
+				error_printf("Image \"%s\" is too large (%ux%u) to process in a 32-bit build!\n", (pFilename != nullptr) ? pFilename : "<memory>", w, h);
+				return false;
+			}
+
+			w = h = 0;
+		}
+
+		std::vector<uint8_t> out;
+		err = lodepng::decode(out, w, h, pBuf, buf_size);
+		if ((err != 0) || (!w) || (!h))
+			return false;
+
+		// Sanity check the decoder's output size, again without 32-bit wraparound.
+		if ((uint64_t)out.size() != ((uint64_t)w * (uint64_t)h * 4U))
+			return false;
+
+		img.resize(w, h);
+
+		memcpy(img.get_ptr(), &out[0], out.size());
+
+		return true;
+	}
+		
+	// Reads the PNG file at pFilename from disk and decodes it into img.
+	// Returns false if the file cannot be read or the data cannot be decoded.
+	bool load_png(const char* pFilename, image& img)
+	{
+		std::vector<uint8_t> file_data;
+
+		if (lodepng::load_file(file_data, std::string(pFilename)) != 0)
+			return false;
+
+		return load_png(file_data.data(), file_data.size(), img, pFilename);
+	}
+
+	// Decodes the JPEG file at pFilename into img, requesting 4 components per pixel from the decoder.
+	// Returns false if the decoder returns no image data.
+	bool load_jpg(const char *pFilename, image& img)
+	{
+		int width = 0;
+		int height = 0;
+		int actual_comps = 0;
+
+		uint8_t *pPixels = jpgd::decompress_jpeg_image_from_file(pFilename, &width, &height, &actual_comps, 4, jpgd::jpeg_decoder::cFlagLinearChromaFiltering);
+		if (pPixels == nullptr)
+			return false;
+
+		// The decoder's buffer is released as soon as img has been initialized from it.
+		img.init(pPixels, width, height, 4);
+		free(pPixels);
+
+		return true;
+	}
+
+	// Loads an image file, picking the decoder from the filename's extension (case-insensitive):
+	// png, bmp, tga, or jpg/jfif/jpeg. Returns false for a missing or unrecognized extension,
+	// or when the selected loader fails.
+	bool load_image(const char* pFilename, image& img)
+	{
+		const std::string ext(string_get_extension(std::string(pFilename)));
+		if (ext.length() == 0)
+			return false;
+
+		const char *pExt = ext.c_str();
+		auto ext_is = [pExt](const char *pCandidate) { return strcasecmp(pExt, pCandidate) == 0; };
+
+		if (ext_is("png"))
+			return load_png(pFilename, img);
+
+		if (ext_is("bmp"))
+			return load_bmp(pFilename, img);
+
+		if (ext_is("tga"))
+			return load_tga(pFilename, img);
+
+		if (ext_is("jpg") || ext_is("jfif") || ext_is("jpeg"))
+			return load_jpg(pFilename, img);
+
+		return false;
+	}
+	
+	// Encodes img as an 8-bit PNG and writes it to pFilename.
+	//   image_save_flags: cImageSaveGrayscale writes a single-channel image taken from component
+	//                     grayscale_comp; cImageSaveIgnoreAlpha forces RGB output. Otherwise RGB is
+	//                     written when the image has no alpha, and RGBA when it does.
+	//   grayscale_comp:   component index to extract in grayscale mode.
+	// Returns false if the image is empty, either dimension exceeds 32768, the pixel buffer would
+	// be too large, PNG encoding fails, or the file cannot be written.
+	bool save_png(const char* pFilename, const image &img, uint32_t image_save_flags, uint32_t grayscale_comp)
+	{
+		if (!img.get_total_pixels())
+			return false;
+
+		const uint32_t MAX_PNG_IMAGE_DIM = 32768;
+		if ((img.get_width() > MAX_PNG_IMAGE_DIM) || (img.get_height() > MAX_PNG_IMAGE_DIM))
+			return false;
+
+		std::vector<uint8_t> out;
+		unsigned err = 0;
+
+		if (image_save_flags & cImageSaveGrayscale)
+		{
+			// Extract the requested component into a tightly packed 8-bit buffer.
+			uint8_vec g_pixels(img.get_width() * img.get_height());
+			uint8_t *pDst = &g_pixels[0];
+
+			for (uint32_t y = 0; y < img.get_height(); y++)
+				for (uint32_t x = 0; x < img.get_width(); x++)
+					*pDst++ = img(x, y)[grayscale_comp];
+
+			err = lodepng::encode(out, (const uint8_t*)&g_pixels[0], img.get_width(), img.get_height(), LCT_GREY, 8);
+		}
+		else
+		{
+			bool has_alpha = img.has_alpha();
+			if ((!has_alpha) || ((image_save_flags & cImageSaveIgnoreAlpha) != 0))
+			{
+				// Repack RGBA to RGB, dropping the alpha channel.
+				const uint64_t total_bytes = (uint64_t)img.get_width() * 3U * (uint64_t)img.get_height();
+				if (total_bytes > INT_MAX)
+					return false;
+				uint8_vec rgb_pixels(static_cast<size_t>(total_bytes));
+				uint8_t *pDst = &rgb_pixels[0];
+
+				for (uint32_t y = 0; y < img.get_height(); y++)
+				{
+					for (uint32_t x = 0; x < img.get_width(); x++)
+					{
+						const color_rgba& c = img(x, y);
+						pDst[0] = c.r;
+						pDst[1] = c.g;
+						pDst[2] = c.b;
+						pDst += 3;
+					}
+				}
+
+				err = lodepng::encode(out, (const uint8_t*)& rgb_pixels[0], img.get_width(), img.get_height(), LCT_RGB, 8);
+			}
+			else
+			{
+				err = lodepng::encode(out, (const uint8_t*)img.get_ptr(), img.get_width(), img.get_height(), LCT_RGBA, 8);
+			}
+		}
+
+		// Don't write a file if encoding failed; previously this error code was silently
+		// overwritten by the save_file() result below.
+		if (err)
+			return false;
+
+		err = lodepng::save_file(out, std::string(pFilename));
+		if (err)
+			return false;
+
+		return true;
+	}
+		
+	// Reads the entire file at pFilename into data (resized to the file's size).
+	// Returns false if the file cannot be opened, seeking or size detection fails, the file is larger
+	// than 0x70000000 bytes on builds with a 32-bit size_t, the buffer cannot be resized, or the
+	// read comes up short. An empty file succeeds and leaves data empty.
+	bool read_file_to_vec(const char* pFilename, uint8_vec& data)
+	{
+		FILE* pFile = nullptr;
+#ifdef _WIN32
+		fopen_s(&pFile, pFilename, "rb");
+#else
+		pFile = fopen(pFilename, "rb");
+#endif
+		if (!pFile)
+			return false;
+
+		// Determine the file size by seeking to the end.
+		if (fseek(pFile, 0, SEEK_END) != 0)
+		{
+			fclose(pFile);
+			return false;
+		}
+#ifdef _WIN32
+		int64_t filesize = _ftelli64(pFile);
+#else
+		int64_t filesize = ftello(pFile);
+#endif
+		if (filesize < 0)
+		{
+			fclose(pFile);
+			return false;
+		}
+		if (fseek(pFile, 0, SEEK_SET) != 0)
+		{
+			fclose(pFile);
+			return false;
+		}
+
+		if (sizeof(size_t) == sizeof(uint32_t))
+		{
+			if (filesize > 0x70000000)
+			{
+				// File might be too big to load safely in one alloc
+				fclose(pFile);
+				return false;
+			}
+		}
+
+		if (!data.try_resize((size_t)filesize))
+		{
+			fclose(pFile);
+			return false;
+		}
+
+		if (filesize)
+		{
+			if (fread(&data[0], 1, (size_t)filesize, pFile) != (size_t)filesize)
+			{
+				fclose(pFile);
+				return false;
+			}
+		}
+
+		fclose(pFile);
+		return true;
+	}
+
+	// Writes len bytes from pData to the file at pFilename, creating or truncating it.
+	// A zero len produces an empty file. Returns false if the file cannot be opened,
+	// the write is short, or closing the file reports an error.
+	bool write_data_to_file(const char* pFilename, const void* pData, size_t len)
+	{
+		FILE* pFile = nullptr;
+#ifdef _WIN32
+		fopen_s(&pFile, pFilename, "wb");
+#else
+		pFile = fopen(pFilename, "wb");
+#endif
+		if (pFile == nullptr)
+			return false;
+
+		const bool write_ok = (len == 0) || (fwrite(pData, 1, len, pFile) == len);
+
+		// The file is always closed; a close failure only matters if the write itself succeeded.
+		const bool close_ok = (fclose(pFile) != EOF);
+
+		return write_ok && close_ok;
+	}
+
+	// Converts a linear-light value in [0,1] to its sRGB-encoded value, saturated to [0,1].
+	// Uses the linear segment below .0031308 and the 1/2.4 power curve at or above it.
+	float linear_to_srgb(float l)
+	{
+		assert(l >= 0.0f && l <= 1.0f);
+
+		const bool on_linear_segment = (l < .0031308f);
+		return on_linear_segment ? saturate(l * 12.92f) : saturate(1.055f * powf(l, 1.0f/2.4f) - .055f);
+	}
+
+	// Converts an sRGB-encoded value in [0,1] to linear light, saturated to [0,1].
+	// Uses the linear segment below .04045 and the 2.4 power curve at or above it.
+	float srgb_to_linear(float s)
+	{
+		assert(s >= 0.0f && s <= 1.0f);
+
+		const bool on_linear_segment = (s < .04045f);
+		return on_linear_segment ? saturate(s * (1.0f/12.92f)) : saturate(powf((s + .055f) * (1.0f/1.055f), 2.4f));
+	}
+
+	// Resamples src into dst using one Resampler per component.
+	//   src:          source image.
+	//   dst:          destination image; its current width/height define the output size.
+	//   srgb:         if true, components other than index 3 are converted sRGB->linear before
+	//                 filtering and linear->sRGB afterwards (via lookup tables); component 3 is
+	//                 always treated as linear.
+	//   pFilter:      filter name, passed through to Resampler.
+	//   filter_scale: passed through to Resampler as both filter scale arguments.
+	//   wrapping:     selects Resampler::BOUNDARY_WRAP instead of BOUNDARY_CLAMP.
+	//   first_comp, num_comps: the range of components to resample (first_comp + num_comps <= 4).
+	// Returns false if any dimension is zero or exceeds BASISU_RESAMPLER_MAX_DIMENSION, num_comps is
+	// out of range, or a resampler rejects a source line. If src and dst are already the same size,
+	// dst is simply assigned from src.
+	bool image_resample(const image &src, image &dst, bool srgb,
+		const char *pFilter, float filter_scale, 
+		bool wrapping,
+		uint32_t first_comp, uint32_t num_comps)
+	{
+		assert((first_comp + num_comps) <= 4);
+
+		const int cMaxComps = 4;
+				
+		const uint32_t src_w = src.get_width(), src_h = src.get_height();
+		const uint32_t dst_w = dst.get_width(), dst_h = dst.get_height();
+				
+		if (maximum(src_w, src_h) > BASISU_RESAMPLER_MAX_DIMENSION)
+		{
+			printf("Image is too large!\n");
+			return false;
+		}
+
+		if (!src_w || !src_h || !dst_w || !dst_h)
+			return false;
+				
+		if ((num_comps < 1) || (num_comps > cMaxComps))
+			return false;
+				
+		if ((minimum(dst_w, dst_h) < 1) || (maximum(dst_w, dst_h) > BASISU_RESAMPLER_MAX_DIMENSION))
+		{
+			printf("Image is too large!\n");
+			return false;
+		}
+
+		// Nothing to resample: copy the whole source image.
+		if ((src_w == dst_w) && (src_h == dst_h))
+		{
+			dst = src;
+			return true;
+		}
+
+		// 8-bit sRGB -> linear float table (only filled in, and only read, when srgb is set).
+		float srgb_to_linear_table[256];
+		if (srgb)
+		{
+			for (int i = 0; i < 256; ++i)
+				srgb_to_linear_table[i] = srgb_to_linear((float)i * (1.0f/255.0f));
+		}
+
+		// Quantized linear -> 8-bit sRGB table, indexed by linear value scaled to [0, TABLE_SIZE - 1].
+		const int LINEAR_TO_SRGB_TABLE_SIZE = 8192;
+		uint8_t linear_to_srgb_table[LINEAR_TO_SRGB_TABLE_SIZE];
+
+		if (srgb)
+		{
+			for (int i = 0; i < LINEAR_TO_SRGB_TABLE_SIZE; ++i)
+				linear_to_srgb_table[i] = (uint8_t)clamp<int>((int)(255.0f * linear_to_srgb((float)i * (1.0f / (LINEAR_TO_SRGB_TABLE_SIZE - 1))) + .5f), 0, 255);
+		}
+
+		std::vector<float> samples[cMaxComps];
+		Resampler *resamplers[cMaxComps];
+		
+		// The first resampler is created without contributor lists; the remaining ones are handed
+		// the first resampler's X/Y contributor lists.
+		resamplers[0] = new Resampler(src_w, src_h, dst_w, dst_h,
+			wrapping ? Resampler::BOUNDARY_WRAP : Resampler::BOUNDARY_CLAMP, 0.0f, 1.0f,
+			pFilter, nullptr, nullptr, filter_scale, filter_scale, 0, 0);
+		samples[0].resize(src_w);
+
+		for (uint32_t i = 1; i < num_comps; ++i)
+		{
+			resamplers[i] = new Resampler(src_w, src_h, dst_w, dst_h,
+				wrapping ? Resampler::BOUNDARY_WRAP : Resampler::BOUNDARY_CLAMP, 0.0f, 1.0f,
+				pFilter, resamplers[0]->get_clist_x(), resamplers[0]->get_clist_y(), filter_scale, filter_scale, 0, 0);
+			samples[i].resize(src_w);
+		}
+
+		// Next destination row to be written.
+		uint32_t dst_y = 0;
+
+		for (uint32_t src_y = 0; src_y < src_h; ++src_y)
+		{
+			const color_rgba *pSrc = &src(0, src_y);
+
+			// Put source lines into resampler(s)
+			for (uint32_t x = 0; x < src_w; ++x)
+			{
+				for (uint32_t c = 0; c < num_comps; ++c)
+				{
+					const uint32_t comp_index = first_comp + c;
+					const uint32_t v = (*pSrc)[comp_index];
+
+					if (!srgb || (comp_index == 3))
+						samples[c][x] = v * (1.0f / 255.0f);
+					else
+						samples[c][x] = srgb_to_linear_table[v];
+				}
+
+				pSrc++;
+			}
+
+			for (uint32_t c = 0; c < num_comps; ++c)
+			{
+				if (!resamplers[c]->put_line(&samples[c][0]))
+				{
+					// Release every resampler before bailing out.
+					for (uint32_t i = 0; i < num_comps; i++)
+						delete resamplers[i];
+					return false;
+				}
+			}
+
+			// Now retrieve any output lines
+			for (;;)
+			{
+				uint32_t c;
+				for (c = 0; c < num_comps; ++c)
+				{
+					const uint32_t comp_index = first_comp + c;
+
+					// A null line means no more output is available yet for this source row.
+					const float *pOutput_samples = resamplers[c]->get_line();
+					if (!pOutput_samples)
+						break;
+
+					const bool linear_flag = !srgb || (comp_index == 3);
+					
+					color_rgba *pDst = &dst(0, dst_y);
+
+					for (uint32_t x = 0; x < dst_w; x++)
+					{
+						// TODO: Add dithering
+						if (linear_flag)
+						{
+							int j = (int)(255.0f * pOutput_samples[x] + .5f);
+							(*pDst)[comp_index] = (uint8_t)clamp<int>(j, 0, 255);
+						}
+						else
+						{
+							int j = (int)((LINEAR_TO_SRGB_TABLE_SIZE - 1) * pOutput_samples[x] + .5f);
+							(*pDst)[comp_index] = linear_to_srgb_table[clamp<int>(j, 0, LINEAR_TO_SRGB_TABLE_SIZE - 1)];
+						}
+
+						pDst++;
+					}
+				}
+				// The inner loop exited early: no complete output row was produced.
+				if (c < num_comps)
+					break;
+
+				++dst_y;
+			}
+		}
+
+		for (uint32_t i = 0; i < num_comps; ++i)
+			delete resamplers[i];
+
+		return true;
+	}
+
+	// Computes Huffman code lengths in place over the m_key fields of A[0..num_syms-1].
+	// On entry m_key holds each symbol's frequency; on return the caller reads m_key back as a
+	// code length (see huffman_encoding_table::init, which passes a radix-sorted array).
+	// NOTE(review): per the referenced paper the input is presumably required to be sorted by
+	// ascending frequency - confirm.
+	// A single symbol is assigned a length of 1; zero symbols is a no-op.
+	void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms)
+	{
+		// See the paper "In-Place Calculation of Minimum Redundancy Codes" by Moffat and Katajainen
+		if (!num_syms)
+			return;
+
+		if (1 == num_syms)
+		{
+			A[0].m_key = 1;
+			return;
+		}
+		
+		A[0].m_key += A[1].m_key;
+		
+		// First pass: repeatedly combine two entries, each taken from either the combined entries
+		// (cursor r) or the not-yet-consumed inputs (cursor s). A consumed combined entry has its
+		// m_key overwritten with the index of the entry it was merged into.
+		int s = 2, r = 0, next;
+		for (next = 1; next < (num_syms - 1); ++next)
+		{
+			if ((s >= num_syms) || (A[r].m_key < A[s].m_key))
+			{
+				A[next].m_key = A[r].m_key;
+				A[r].m_key = next;
+				++r;
+			}
+			else
+			{
+				A[next].m_key = A[s].m_key;
+				++s;
+			}
+
+			if ((s >= num_syms) || ((r < next) && A[r].m_key < A[s].m_key))
+			{
+				A[next].m_key = A[next].m_key + A[r].m_key;
+				A[r].m_key = next;
+				++r;
+			}
+			else
+			{
+				A[next].m_key = A[next].m_key + A[s].m_key;
+				++s;
+			}
+		}
+		A[num_syms - 2].m_key = 0;
+
+		// Second pass: turn the stored indices into depths (one more than the depth stored at
+		// the referenced index), starting from the entry set to depth 0 above.
+		for (next = num_syms - 3; next >= 0; --next)
+		{
+			A[next].m_key = 1 + A[A[next].m_key].m_key;
+		}
+
+		// Third pass: walk depth by depth, counting entries recorded at each depth and writing
+		// the current depth into the remaining slots from the end of the array backwards.
+		int num_avail = 1, num_used = 0, depth = 0;
+		r = num_syms - 2;
+		next = num_syms - 1;
+		while (num_avail > 0)
+		{
+			for ( ; (r >= 0) && ((int)A[r].m_key == depth); ++num_used, --r )
+				;
+
+			for ( ; num_avail > num_used; --next, --num_avail)
+				A[next].m_key = depth;
+
+			num_avail = 2 * num_used;
+			num_used = 0;
+			++depth;
+		}
+	}
+
+	// Limits a set of Huffman code lengths to max_code_size bits.
+	//   pNum_codes:    pNum_codes[i] is the number of codes of length i; indices up to
+	//                  cHuffmanMaxSupportedInternalCodeSize are read. Modified in place.
+	//   code_list_len: number of codes; nothing is done when this is <= 1.
+	//   max_code_size: the longest code length allowed after this call.
+	void canonical_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size)
+	{
+		int i;
+		uint32_t total = 0;
+		if (code_list_len <= 1)
+			return;
+
+		// Fold all over-long codes into the max_code_size bucket.
+		for (i = max_code_size + 1; i <= cHuffmanMaxSupportedInternalCodeSize; i++)
+			pNum_codes[max_code_size] += pNum_codes[i];
+
+		// Sum of each code's weight, where a code of length i weighs 2^(max_code_size - i).
+		for (i = max_code_size; i > 0; i--)
+			total += (((uint32_t)pNum_codes[i]) << (max_code_size - i));
+
+		// Adjust the counts until the weights sum to exactly 2^max_code_size: each step drops one
+		// max-length code and moves one code from the longest shorter non-empty length to the next
+		// length, adding two codes there.
+		while (total != (1UL << max_code_size))
+		{
+			pNum_codes[max_code_size]--;
+			for (i = max_code_size - 1; i > 0; i--)
+			{
+				if (pNum_codes[i])
+				{
+					pNum_codes[i]--;
+					pNum_codes[i + 1] += 2;
+					break;
+				}
+			}
+
+			total--;
+		}
+	}
+
+	// Sorts num_syms entries by ascending m_key using a byte-wise radix sort (low byte first,
+	// then high byte), ping-ponging between the two caller-supplied buffers.
+	//   pSyms0: the input entries (keys are asserted to fit in 16 bits).
+	//   pSyms1: scratch buffer of the same size.
+	// Returns whichever of pSyms0/pSyms1 holds the sorted result.
+	sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1)
+	{
+		uint32_t total_passes = 2, pass_shift, pass, i, hist[256 * 2];
+		sym_freq *pCur_syms = pSyms0, *pNew_syms = pSyms1;
+
+		clear_obj(hist);
+
+		// Build both byte histograms in a single sweep: hist[0..255] for the low byte,
+		// hist[256..511] for the high byte.
+		for (i = 0; i < num_syms; i++)
+		{
+			uint32_t freq = pSyms0[i].m_key;
+			
+			// We scale all input frequencies to 16-bits.
+			assert(freq <= UINT16_MAX);
+
+			hist[freq & 0xFF]++;
+			hist[256 + ((freq >> 8) & 0xFF)]++;
+		}
+
+		// Skip the high-byte pass when every key's high byte is zero.
+		while ((total_passes > 1) && (num_syms == hist[(total_passes - 1) * 256]))
+			total_passes--;
+
+		for (pass_shift = 0, pass = 0; pass < total_passes; pass++, pass_shift += 8)
+		{
+			const uint32_t *pHist = &hist[pass << 8];
+			// Running sum of the bucket counts gives each bucket's starting output offset.
+			uint32_t offsets[256], cur_ofs = 0;
+			for (i = 0; i < 256; i++)
+			{
+				offsets[i] = cur_ofs;
+				cur_ofs += pHist[i];
+			}
+
+			for (i = 0; i < num_syms; i++)
+				pNew_syms[offsets[(pCur_syms[i].m_key >> pass_shift) & 0xFF]++] = pCur_syms[i];
+
+			// Swap buffers: this pass's output becomes the next pass's input.
+			sym_freq *t = pCur_syms;
+			pCur_syms = pNew_syms;
+			pNew_syms = t;
+		}
+
+		return pCur_syms;
+	}
+
+	// Builds the encoding table (m_code_sizes and m_codes) from 16-bit symbol frequencies.
+	//   num_syms:      number of symbols; must be in [1, cHuffmanMaxSyms].
+	//   pFreq:         per-symbol frequencies; symbols with a zero frequency get code size 0.
+	//   max_code_size: longest code length allowed; must be <= cHuffmanMaxSupportedCodeSize.
+	// Returns false on invalid arguments, if no symbol is used, or if a computed code length
+	// exceeds cHuffmanMaxSupportedInternalCodeSize. Codes are stored bit-reversed.
+	bool huffman_encoding_table::init(uint32_t num_syms, const uint16_t *pFreq, uint32_t max_code_size)
+	{
+		if (max_code_size > cHuffmanMaxSupportedCodeSize)
+			return false;
+		if ((!num_syms) || (num_syms > cHuffmanMaxSyms))
+			return false;
+
+		uint32_t total_used_syms = 0;
+		for (uint32_t i = 0; i < num_syms; i++)
+			if (pFreq[i])
+				total_used_syms++;
+
+		if (!total_used_syms)
+			return false;
+
+		// Gather only the used symbols; sym_freq1 is scratch space for the radix sort.
+		std::vector<sym_freq> sym_freq0(total_used_syms), sym_freq1(total_used_syms);
+		for (uint32_t i = 0, j = 0; i < num_syms; i++)
+		{
+			if (pFreq[i])
+			{
+				sym_freq0[j].m_key = pFreq[i];
+				sym_freq0[j++].m_sym_index = static_cast<uint16_t>(i);
+			}
+		}
+
+		sym_freq *pSym_freq = canonical_huffman_radix_sort_syms(total_used_syms, &sym_freq0[0], &sym_freq1[0]);
+
+		// After this call each m_key is treated as a code length rather than a frequency.
+		canonical_huffman_calculate_minimum_redundancy(pSym_freq, total_used_syms);
+
+		// num_codes[n] = number of symbols with code length n.
+		int num_codes[cHuffmanMaxSupportedInternalCodeSize + 1];
+		clear_obj(num_codes);
+
+		for (uint32_t i = 0; i < total_used_syms; i++)
+		{
+			if (pSym_freq[i].m_key > cHuffmanMaxSupportedInternalCodeSize)
+				return false;
+
+			num_codes[pSym_freq[i].m_key]++;
+		}
+
+		canonical_huffman_enforce_max_code_size(num_codes, total_used_syms, max_code_size);
+
+		// resize(0) followed by resize(num_syms) is presumably meant to reset every entry.
+		m_code_sizes.resize(0);
+		m_code_sizes.resize(num_syms);
+
+		m_codes.resize(0);
+		m_codes.resize(num_syms);
+
+		// Hand out the code lengths, walking the sorted symbol list from its end
+		// while going from the shortest length to the longest.
+		for (uint32_t i = 1, j = total_used_syms; i <= max_code_size; i++)
+			for (uint32_t l = num_codes[i]; l > 0; l--)
+				m_code_sizes[pSym_freq[--j].m_sym_index] = static_cast<uint8_t>(i);
+
+		// next_code[n] = first code value for codes of length n.
+		uint32_t next_code[cHuffmanMaxSupportedInternalCodeSize + 1];
+
+		next_code[1] = 0;
+		for (uint32_t j = 0, i = 2; i <= max_code_size; i++)
+			next_code[i] = j = ((j + num_codes[i - 1]) << 1);
+
+		for (uint32_t i = 0; i < num_syms; i++)
+		{
+			uint32_t rev_code = 0, code, code_size;
+			if ((code_size = m_code_sizes[i]) == 0)
+				continue;
+			if (code_size > cHuffmanMaxSupportedInternalCodeSize)
+				return false;
+			code = next_code[code_size]++;
+			// Reverse the code's bits before storing it.
+			for (uint32_t l = code_size; l > 0; l--, code >>= 1)
+				rev_code = (rev_code << 1) | (code & 1);
+			m_codes[i] = static_cast<uint16_t>(rev_code);
+		}
+
+		return true;
+	}
+
+	// Builds the encoding table from 32-bit symbol frequencies by first reducing them to 16 bits.
+	// If the largest frequency is below UINT16_MAX the values are used as-is; otherwise every
+	// non-zero frequency is rescaled relative to the maximum into [1, 65534].
+	// Returns false if num_syms is 0 or exceeds cHuffmanMaxSyms, or if the 16-bit init() fails.
+	bool huffman_encoding_table::init(uint32_t num_syms, const uint32_t *pSym_freq, uint32_t max_code_size)
+	{
+		if ((num_syms == 0) || (num_syms > cHuffmanMaxSyms))
+			return false;
+
+		uint16_vec sym_freq(num_syms);
+
+		uint32_t max_freq = 0;
+		for (uint32_t sym = 0; sym < num_syms; sym++)
+			max_freq = maximum(max_freq, pSym_freq[sym]);
+
+		const bool fits_in_16_bits = (max_freq < UINT16_MAX);
+
+		for (uint32_t sym = 0; sym < num_syms; sym++)
+		{
+			const uint32_t freq = pSym_freq[sym];
+
+			if (fits_in_16_bits)
+			{
+				sym_freq[sym] = static_cast<uint16_t>(freq);
+			}
+			else if (freq)
+			{
+				// Rounded rescale; the clamp keeps used symbols from collapsing to zero.
+				const uint32_t scaled = static_cast<uint32_t>((static_cast<uint64_t>(freq) * 65534U + (max_freq >> 1)) / max_freq);
+				sym_freq[sym] = static_cast<uint16_t>(clamp<uint32_t>(scaled, 1, 65534));
+			}
+		}
+
+		return init(num_syms, &sym_freq[0], max_code_size);
+	}
+
+	// Flushes a pending run of run_size repeats of the non-zero code length len into syms,
+	// then resets run_size to 0. Runs shorter than cHuffmanSmallRepeatSizeMin are emitted as
+	// individual len symbols; longer runs become one small- or big-repeat symbol with the
+	// biased run length stored in the bits above bit 5.
+	void bitwise_coder::end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len)
+	{
+		if (run_size != 0)
+		{
+			if (run_size >= cHuffmanSmallRepeatSizeMin)
+			{
+				if (run_size > cHuffmanSmallRepeatSizeMax)
+				{
+					assert((run_size >= cHuffmanBigRepeatSizeMin) && (run_size <= cHuffmanBigRepeatSizeMax));
+					syms.push_back(static_cast<uint16_t>(cHuffmanBigRepeatCode | ((run_size - cHuffmanBigRepeatSizeMin) << 6)));
+				}
+				else
+				{
+					syms.push_back(static_cast<uint16_t>(cHuffmanSmallRepeatCode | ((run_size - cHuffmanSmallRepeatSizeMin) << 6)));
+				}
+			}
+			else
+			{
+				// Too short for a repeat code: write the length out literally.
+				for (uint32_t n = 0; n < run_size; n++)
+					syms.push_back(static_cast<uint16_t>(len));
+			}
+		}
+
+		run_size = 0;
+	}
+
+	// Flushes a pending run of run_size zero code lengths into syms, then resets run_size to 0.
+	// Runs shorter than cHuffmanSmallZeroRunSizeMin are emitted as individual 0 symbols; longer
+	// runs become one small- or big-zero-run symbol with the biased run length stored in the
+	// bits above bit 5.
+	void bitwise_coder::end_zero_run(uint16_vec &syms, uint32_t &run_size)
+	{
+		if (run_size != 0)
+		{
+			if (run_size >= cHuffmanSmallZeroRunSizeMin)
+			{
+				if (run_size > cHuffmanSmallZeroRunSizeMax)
+				{
+					assert((run_size >= cHuffmanBigZeroRunSizeMin) && (run_size <= cHuffmanBigZeroRunSizeMax));
+					syms.push_back(static_cast<uint16_t>(cHuffmanBigZeroRunCode | ((run_size - cHuffmanBigZeroRunSizeMin) << 6)));
+				}
+				else
+				{
+					syms.push_back(static_cast<uint16_t>(cHuffmanSmallZeroRunCode | ((run_size - cHuffmanSmallZeroRunSizeMin) << 6)));
+				}
+			}
+			else
+			{
+				// Too short for a run code: write the zeros out literally.
+				for (uint32_t n = 0; n < run_size; n++)
+					syms.push_back(0);
+			}
+		}
+
+		run_size = 0;
+	}
+
+	// Writes tab's code lengths to the bit stream in compressed form: the lengths are
+	// run-length coded into codelength symbols, which are themselves Huffman coded with a
+	// small table (max code size 7) whose own code sizes are sent as 3-bit values.
+	// Returns the number of bits written, or 0 if the table has no used codes or the
+	// codelength table could not be built (in both cases the leading count has already been written).
+	uint32_t bitwise_coder::emit_huffman_table(const huffman_encoding_table &tab)
+	{
+		const uint64_t start_bits = m_total_bits;
+
+		const uint8_vec &code_sizes = tab.get_code_sizes();
+
+		uint32_t total_used = tab.get_total_used_codes();
+		put_bits(total_used, cHuffmanMaxSymsLog2);
+			
+		if (!total_used)
+			return 0;
+
+		uint16_vec syms;
+		syms.reserve(total_used + 16);
+
+		uint32_t prev_code_len = UINT_MAX, zero_run_size = 0, nonzero_run_size = 0;
+
+		// Run-length code the code sizes. The extra iteration at i == total_used feeds a
+		// sentinel length (0xFF) that flushes whatever run is still pending.
+		for (uint32_t i = 0; i <= total_used; ++i)
+		{
+			const uint32_t code_len = (i == total_used) ? 0xFF : code_sizes[i];
+			assert((code_len == 0xFF) || (code_len <= 16));
+
+			if (code_len)
+			{
+				end_zero_run(syms, zero_run_size);
+
+				if (code_len != prev_code_len)
+				{
+					end_nonzero_run(syms, nonzero_run_size, prev_code_len);
+					if (code_len != 0xFF)
+						syms.push_back(static_cast<uint16_t>(code_len));
+				}
+				else if (++nonzero_run_size == cHuffmanBigRepeatSizeMax)
+					end_nonzero_run(syms, nonzero_run_size, prev_code_len);
+			}
+			else
+			{
+				end_nonzero_run(syms, nonzero_run_size, prev_code_len);
+
+				if (++zero_run_size == cHuffmanBigZeroRunSizeMax)
+					end_zero_run(syms, zero_run_size);
+			}
+
+			prev_code_len = code_len;
+		}
+
+		// The low 6 bits of each entry are the codelength symbol; build its histogram.
+		histogram h(cHuffmanTotalCodelengthCodes);
+		for (uint32_t i = 0; i < syms.size(); i++)
+			h.inc(syms[i] & 63);
+
+		huffman_encoding_table ct;
+		if (!ct.init(h, 7))
+			return 0;
+
+		assert(cHuffmanTotalSortedCodelengthCodes == cHuffmanTotalCodelengthCodes);
+
+		// Trim trailing unused entries (in g_huffman_sorted_codelength_codes order) so fewer
+		// code sizes need to be transmitted.
+		uint32_t total_codelength_codes;
+		for (total_codelength_codes = cHuffmanTotalSortedCodelengthCodes; total_codelength_codes > 0; total_codelength_codes--)
+			if (ct.get_code_sizes()[g_huffman_sorted_codelength_codes[total_codelength_codes - 1]])
+				break;
+
+		assert(total_codelength_codes);
+
+		put_bits(total_codelength_codes, 5);
+		for (uint32_t i = 0; i < total_codelength_codes; i++)
+			put_bits(ct.get_code_sizes()[g_huffman_sorted_codelength_codes[i]], 3);
+
+		// Emit each symbol's Huffman code, followed by the extra run-length bits
+		// (stored above bit 5) for the run/repeat symbols.
+		for (uint32_t i = 0; i < syms.size(); ++i)
+		{
+			const uint32_t l = syms[i] & 63, e = syms[i] >> 6;
+
+			put_code(l, ct);
+				
+			if (l == cHuffmanSmallZeroRunCode)
+				put_bits(e, cHuffmanSmallZeroRunExtraBits);
+			else if (l == cHuffmanBigZeroRunCode)
+				put_bits(e, cHuffmanBigZeroRunExtraBits);
+			else if (l == cHuffmanSmallRepeatCode)
+				put_bits(e, cHuffmanSmallRepeatExtraBits);
+			else if (l == cHuffmanBigRepeatCode)
+				put_bits(e, cHuffmanBigRepeatExtraBits);
+		}
+
+		return (uint32_t)(m_total_bits - start_bits);
+	}
+
+	// Self-test for the Huffman encoder/decoder round trip.
+	// First encodes a fixed Fibonacci-weighted 19-symbol table and checks it decodes correctly,
+	// then runs 500000 randomized iterations (seeded by rand_seed), each encoding a random symbol
+	// stream and verifying the decoder reproduces it. Prints the iteration number each pass.
+	// Returns false (after asserting and printing a message) on the first mismatch or failure.
+	bool huffman_test(int rand_seed)
+	{
+		histogram h(19);
+
+		// Feed in a fibonacci sequence to force large codesizes
+		h[0] += 1; h[1] += 1; h[2] += 2; h[3] += 3;
+		h[4] += 5; h[5] += 8; h[6] += 13; h[7] += 21;
+		h[8] += 34; h[9] += 55; h[10] += 89; h[11] += 144;
+		h[12] += 233; h[13] += 377; h[14] += 610; h[15] += 987;
+		h[16] += 1597; h[17] += 2584; h[18] += 4181;
+
+		// NOTE(review): the result of this init() call is not checked.
+		huffman_encoding_table etab;
+		etab.init(h, 16);
+		
+		{
+			// Round trip: write the table plus one code per symbol, then decode and compare.
+			bitwise_coder c;
+			c.init(1024);
+
+			c.emit_huffman_table(etab);
+			for (int i = 0; i < 19; i++)
+				c.put_code(i, etab);
+
+			c.flush();
+
+			basist::bitwise_decoder d;
+			d.init(&c.get_bytes()[0], static_cast<uint32_t>(c.get_bytes().size()));
+
+			basist::huffman_decoding_table dtab;
+			bool success = d.read_huffman_table(dtab);
+			if (!success)
+			{
+				assert(0);
+				printf("Failure 5\n");
+				return false;
+			}
+
+			for (uint32_t i = 0; i < 19; i++)
+			{
+				uint32_t s = d.decode_huffman(dtab);
+				if (s != i)
+				{
+					assert(0);
+					printf("Failure 5\n");
+					return false;
+				}
+			}
+		}
+
+		basisu::rand r;
+		r.seed(rand_seed);
+
+		for (int iter = 0; iter < 500000; iter++)
+		{
+			printf("%u\n", iter);
+
+			// Random alphabet size and stream length for this iteration.
+			uint32_t max_sym = r.irand(0, 8193);
+			uint32_t num_codes = r.irand(1, 10000);
+			uint_vec syms(num_codes);
+
+			// Each symbol is drawn either from irand(0, max_sym) or from a gaussian centred on
+			// max_sym / 2 and clamped to [0, max_sym].
+			for (uint32_t i = 0; i < num_codes; i++)
+			{
+				if (r.bit())
+					syms[i] = r.irand(0, max_sym);
+				else
+				{
+					int s = (int)(r.gaussian((float)max_sym / 2, (float)maximum<int>(1, max_sym / 2)) + .5f);
+					s = basisu::clamp<int>(s, 0, max_sym);
+
+					syms[i] = s;
+				}
+
+			}
+
+			histogram h1(max_sym + 1);
+			for (uint32_t i = 0; i < num_codes; i++)
+				h1[syms[i]]++;
+
+			huffman_encoding_table etab2;
+			if (!etab2.init(h1, 16))
+			{
+				assert(0);
+				printf("Failed 0\n");
+				return false;
+			}
+
+			bitwise_coder c;
+			c.init(1024);
+
+			c.emit_huffman_table(etab2);
+
+			for (uint32_t i = 0; i < num_codes; i++)
+				c.put_code(syms[i], etab2);
+
+			c.flush();
+
+			basist::bitwise_decoder d;
+			d.init(&c.get_bytes()[0], (uint32_t)c.get_bytes().size());
+
+			basist::huffman_decoding_table dtab;
+			bool success = d.read_huffman_table(dtab);
+			if (!success)
+			{
+				assert(0);
+				printf("Failed 2\n");
+				return false;
+			}
+
+			// Every decoded symbol must match what was encoded.
+			for (uint32_t i = 0; i < num_codes; i++)
+			{
+				uint32_t s = d.decode_huffman(dtab);
+				if (s != syms[i])
+				{
+					assert(0);
+					printf("Failed 4\n");
+					return false;
+				}
+			}
+
+		}
+		return true;
+	}
+
+	// Computes a reordering of num_syms palette entries from a stream of indices, storing the
+	// result in m_remap_table (old entry -> new position).
+	//   num_indices, pIndices: the index stream whose adjacent-pair statistics drive the ordering.
+	//   num_syms:              number of palette entries.
+	//   pDist_func, pCtx:      optional callback (and its context) weighting candidates; the
+	//                          callers in this class assert its result lies in [0,1]. May be null.
+	//   dist_func_weight:      strength of the callback's influence, in [0,1].
+	// With num_indices <= 1 the function returns right after sizing the tables.
+	void palette_index_reorderer::init(uint32_t num_indices, const uint32_t *pIndices, uint32_t num_syms, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight)
+	{
+		assert((num_syms > 0) && (num_indices > 0));
+		assert((dist_func_weight >= 0.0f) && (dist_func_weight <= 1.0f));
+
+		clear();
+
+		m_remap_table.resize(num_syms);
+		m_entries_picked.reserve(num_syms);
+		m_total_count_to_picked.resize(num_syms);
+
+		if (num_indices <= 1)
+			return;
+
+		// Build the adjacency histogram and seed the picked list with the most frequent pair.
+		prepare_hist(num_syms, num_indices, pIndices);
+		find_initial(num_syms);
+
+		while (m_entries_to_do.size())
+		{
+			// Find the best entry to move into the picked list.
+			uint32_t best_entry;
+			double best_count;
+			find_next_entry(best_entry, best_count, pDist_func, pCtx, dist_func_weight);
+
+			// We now have chosen an entry to place in the picked list, now determine which side it goes on.
+			const uint32_t entry_to_move = m_entries_to_do[best_entry];
+								
+			float side = pick_side(num_syms, entry_to_move, pDist_func, pCtx, dist_func_weight);
+								
+			// Put entry_to_move either on the "left" or "right" side of the picked entries
+			if (side <= 0)
+				m_entries_picked.push_back(entry_to_move);
+			else
+				m_entries_picked.insert(m_entries_picked.begin(), entry_to_move);
+
+			// Erase best_entry from the todo list
+			m_entries_to_do.erase(m_entries_to_do.begin() + best_entry);
+
+			// We've just moved best_entry to the picked list, so now we need to update m_total_count_to_picked[] to factor the additional count to best_entry
+			for (uint32_t i = 0; i < m_entries_to_do.size(); i++)
+				m_total_count_to_picked[m_entries_to_do[i]] += get_hist(m_entries_to_do[i], entry_to_move, num_syms);
+		}
+
+		// The position of each entry in the picked list becomes its new index.
+		for (uint32_t i = 0; i < num_syms; i++)
+			m_remap_table[m_entries_picked[i]] = i;
+	}
+
+	// Rebuilds m_hist (num_syms * num_syms entries) from the index stream: for every position,
+	// inc_hist() is called once with the following index and once with the preceding index.
+	// At the ends of the stream, where no such neighbour exists, -1 is passed instead.
+	void palette_index_reorderer::prepare_hist(uint32_t num_syms, uint32_t num_indices, const uint32_t *pIndices)
+	{
+		m_hist.resize(0);
+		m_hist.resize(num_syms * num_syms);
+
+		for (uint32_t pos = 0; pos < num_indices; pos++)
+		{
+			const uint32_t cur = pIndices[pos];
+
+			// Neighbour values keep the original expression's unsigned type.
+			const uint32_t next_neighbor = (pos < (num_indices - 1)) ? pIndices[pos + 1] : -1;
+			const uint32_t prev_neighbor = (pos > 0) ? pIndices[pos - 1] : -1;
+
+			inc_hist(cur, next_neighbor, num_syms);
+			inc_hist(cur, prev_neighbor, num_syms);
+		}
+	}
+
+	// Seeds the reordering: finds the largest m_hist cell, places the two entries it corresponds
+	// to in m_entries_picked, queues every other entry in m_entries_to_do, and initializes
+	// m_total_count_to_picked for the queued entries against the picked ones.
+	void palette_index_reorderer::find_initial(uint32_t num_syms)
+	{
+		const uint32_t total_cells = num_syms * num_syms;
+
+		uint32_t best_count = 0, best_cell = 0;
+		for (uint32_t cell = 0; cell < total_cells; cell++)
+		{
+			if (m_hist[cell] > best_count)
+			{
+				best_count = m_hist[cell];
+				best_cell = cell;
+			}
+		}
+
+		// Split the flat cell index back into its row and column.
+		const uint32_t first = best_cell / num_syms;
+		const uint32_t second = best_cell % num_syms;
+
+		m_entries_picked.push_back(first);
+		m_entries_picked.push_back(second);
+
+		for (uint32_t entry = 0; entry < num_syms; entry++)
+		{
+			if ((entry != second) && (entry != first))
+				m_entries_to_do.push_back(entry);
+		}
+
+		for (uint32_t t = 0; t < m_entries_to_do.size(); t++)
+		{
+			for (uint32_t p = 0; p < m_entries_picked.size(); p++)
+			{
+				m_total_count_to_picked[m_entries_to_do[t]] += get_hist(m_entries_to_do[t], m_entries_picked[p], num_syms);
+			}
+		}
+	}
+
+	// Picks the queued entry with the highest score.
+	//   best_entry: receives the winner's position within m_entries_to_do (0 if none beats 0).
+	//   best_count: receives the winning score.
+	// The score is the entry's m_total_count_to_picked value; when pDist_func is supplied it is
+	// instead (count + 1) scaled by a factor interpolated between 1 - dist_func_weight and
+	// 1 + dist_func_weight using the larger of the callback's results against the first and
+	// last picked entries.
+	void palette_index_reorderer::find_next_entry(uint32_t &best_entry, double &best_count, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight)
+	{
+		best_entry = 0;
+		best_count = 0;
+
+		const uint32_t num_candidates = (uint32_t)m_entries_to_do.size();
+		for (uint32_t cand = 0; cand < num_candidates; cand++)
+		{
+			const uint32_t entry = m_entries_to_do[cand];
+			double score = m_total_count_to_picked[entry];
+
+			if (pDist_func)
+			{
+				float w = maximum<float>((*pDist_func)(entry, m_entries_picked.front(), pCtx), (*pDist_func)(entry, m_entries_picked.back(), pCtx));
+				assert((w >= 0.0f) && (w <= 1.0f));
+				score = (score + 1.0f) * lerp(1.0f - dist_func_weight, 1.0f + dist_func_weight, w);
+			}
+
+			// Written as a negated <= so that the comparison behaves exactly like the
+			// original skip test for every possible score value.
+			if (!(score <= best_count))
+			{
+				best_entry = cand;
+				best_count = score;
+			}
+		}
+	}
+
+	// Decides at which end of m_entries_picked entry_to_move should be placed.
+	// Returns a signed score; the caller appends the entry when the score is <= 0 and inserts
+	// it at the front otherwise.
+	// Without pDist_func the score is the sum of histogram counts against each picked entry,
+	// weighted by r, which decreases in steps of 2 from (size - 1) at the front to -(size - 1) at
+	// the back. With pDist_func, the positive-r and negative-r sums are each scaled by a factor
+	// derived from the callback's result against the front / back entry.
+	float palette_index_reorderer::pick_side(uint32_t num_syms, uint32_t entry_to_move, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight)
+	{
+		float which_side = 0;
+
+		int l_count = 0, r_count = 0;
+		for (uint32_t j = 0; j < m_entries_picked.size(); j++)
+		{
+			// count: adjacency count between the candidate and picked entry j; r: signed position weight.
+			const int count = get_hist(entry_to_move, m_entries_picked[j], num_syms), r = ((int)m_entries_picked.size() + 1 - 2 * (j + 1));
+			which_side += static_cast<float>(r * count);
+			if (r >= 0)
+				l_count += r * count;
+			else
+				r_count += -r * count;
+		}
+
+		if (pDist_func)
+		{
+			// Replace the unweighted score with a callback-weighted left-versus-right comparison.
+			float w_left = lerp(1.0f - dist_func_weight, 1.0f + dist_func_weight, (*pDist_func)(entry_to_move, m_entries_picked.front(), pCtx));
+			float w_right = lerp(1.0f - dist_func_weight, 1.0f + dist_func_weight, (*pDist_func)(entry_to_move, m_entries_picked.back(), pCtx));
+			which_side = w_left * l_count - w_right * r_count;
+		}
+		return which_side;
+	}
+
+	// Computes error statistics between images a and b over their common (minimum) dimensions,
+	// storing the results in m_max, m_mean, m_mean_squared, m_rms and m_psnr.
+	//   first_chan, total_chans: the channel range to compare. If total_chans is 0, per-pixel
+	//                            luma is compared instead.
+	//   avg_comp_error:          if true, the sums are also divided by the channel count
+	//                            (clamped to [1,4]) rather than only by the pixel count.
+	//   use_601_luma:            in luma mode, selects get_601_luma() over get_709_luma().
+	// m_psnr is clamped to [0,100] and is 100 when the RMS error is zero.
+	void image_metrics::calc(const image &a, const image &b, uint32_t first_chan, uint32_t total_chans, bool avg_comp_error, bool use_601_luma)
+	{
+		assert((first_chan < 4U) && (first_chan + total_chans <= 4U));
+
+		const uint32_t width = basisu::minimum(a.get_width(), b.get_width());
+		const uint32_t height = basisu::minimum(a.get_height(), b.get_height());
+
+		// hist[d] counts how many compared values differ by exactly d.
+		double hist[256];
+		clear_obj(hist);
+
+		for (uint32_t y = 0; y < height; y++)
+		{
+			for (uint32_t x = 0; x < width; x++)
+			{
+				const color_rgba &ca = a(x, y), &cb = b(x, y);
+
+				if (total_chans)
+				{
+					for (uint32_t c = 0; c < total_chans; c++)
+						hist[iabs(ca[first_chan + c] - cb[first_chan + c])]++;
+				}
+				else
+				{
+					if (use_601_luma)
+						hist[iabs(ca.get_601_luma() - cb.get_601_luma())]++;
+					else
+						hist[iabs(ca.get_709_luma() - cb.get_709_luma())]++;
+				}
+			}
+		}
+
+		// Accumulate the sum of errors and the sum of squared errors from the histogram.
+		m_max = 0;
+		double sum = 0.0f, sum2 = 0.0f;
+		for (uint32_t i = 0; i < 256; i++)
+		{
+			if (hist[i])
+			{
+				m_max = basisu::maximum<float>(m_max, (float)i);
+				double v = i * hist[i];
+				sum += v;
+				sum2 += i * v;
+			}
+		}
+
+		// NOTE(review): total_values is 0 when either common dimension is 0, which makes the
+		// divisions below 0/0 - confirm callers never pass empty images.
+		double total_values = (double)width * (double)height;
+		if (avg_comp_error)
+			total_values *= (double)clamp<uint32_t>(total_chans, 1, 4);
+
+		m_mean = (float)clamp<double>(sum / total_values, 0.0f, 255.0);
+		m_mean_squared = (float)clamp<double>(sum2 / total_values, 0.0f, 255.0f * 255.0f);
+		m_rms = (float)sqrt(m_mean_squared);
+		m_psnr = m_rms ? (float)clamp<double>(log10(255.0 / m_rms) * 20.0f, 0.0f, 100.0f) : 100.0f;
+	}
+
+	// Fills the size bytes at pBuf with pseudo-random data generated from seed.
+	// The same seed and size always produce the same contents. pBuf may have any alignment.
+	void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed)
+	{
+		rand r(seed);
+
+		uint8_t *pDst = static_cast<uint8_t *>(pBuf);
+
+		// Write whole 32-bit words first. memcpy is used instead of storing through a casted
+		// uint32_t pointer because pDst is not guaranteed to be 4-byte aligned; the bytes
+		// written are the same as a direct native-endian store.
+		while (size >= sizeof(uint32_t))
+		{
+			const uint32_t v = r.urand32();
+			memcpy(pDst, &v, sizeof(v));
+			pDst += sizeof(uint32_t);
+			size -= sizeof(uint32_t);
+		}
+
+		// Then the 0-3 remaining bytes, one at a time.
+		while (size)
+		{
+			*pDst++ = r.byte();
+			size--;
+		}
+	}
+
+	// Computes a 32-bit hash of the len bytes at pBuf.
+	// NOTE(review): going by the name this appears to be Paul Hsieh's "SuperFastHash" - confirm.
+	// Returns 0 for a null pointer or zero length.
+	// NOTE(review): the data is read through uint16_t pointers, so the result depends on the
+	// host byte order and the reads assume pBuf is suitably aligned - confirm against callers.
+	uint32_t hash_hsieh(const uint8_t *pBuf, size_t len)
+	{
+		if (!pBuf || !len) 
+			return 0;
+
+		// The hash state is seeded with the (truncated) length.
+		uint32_t h = static_cast<uint32_t>(len);
+
+		const uint32_t bytes_left = len & 3;
+		len >>= 2;
+
+		// Main loop: consume 4 bytes per iteration as two 16-bit words.
+		while (len--)
+		{
+			const uint16_t *pWords = reinterpret_cast<const uint16_t *>(pBuf);
+
+			h += pWords[0];
+			
+			const uint32_t t = (pWords[1] << 11) ^ h;
+			h = (h << 16) ^ t;
+			
+			pBuf += sizeof(uint32_t);
+			
+			h += h >> 11;
+		}
+
+		// Mix in the 1-3 trailing bytes, if any.
+		switch (bytes_left)
+		{
+		case 1: 
+			h += *reinterpret_cast<const signed char*>(pBuf);
+			h ^= h << 10;
+			h += h >> 1;
+			break;
+		case 2: 
+			h += *reinterpret_cast<const uint16_t *>(pBuf);
+			h ^= h << 11;
+			h += h >> 17;
+			break;
+		case 3:
+			h += *reinterpret_cast<const uint16_t *>(pBuf);
+			h ^= h << 16;
+			h ^= (static_cast<signed char>(pBuf[sizeof(uint16_t)])) << 18;
+			h += h >> 11;
+			break;
+		default:
+			break;
+		}
+		
+		// Final mixing of the hash state.
+		h ^= h << 3;
+		h += h >> 5;
+		h ^= h << 4;
+		h += h >> 17;
+		h ^= h << 25;
+		h += h >> 6;
+
+		return h;
+	}
+
+	// Creates a pool for num_threads total threads (must be >= 1). One of those is the thread
+	// that calls wait_for_all(), so only num_threads - 1 worker threads are spawned here.
+	job_pool::job_pool(uint32_t num_threads) : 
+		m_num_active_jobs(0),
+		m_kill_flag(false)
+	{
+		assert(num_threads >= 1U);
+
+		debug_printf("job_pool::job_pool: %u total threads\n", num_threads);
+
+		if (num_threads <= 1)
+			return;
+
+		const int num_workers = (int)num_threads - 1;
+
+		m_threads.resize(num_threads - 1);
+
+		for (int worker_index = 0; worker_index < num_workers; worker_index++)
+		{
+			m_threads[worker_index] = std::thread([this, worker_index] { job_thread(worker_index); });
+		}
+	}
+
+	// Shuts the pool down: tells every worker thread to exit, then joins them all.
+	// Jobs still sitting in the queue are not executed by the workers once the flag is seen.
+	job_pool::~job_pool()
+	{
+		debug_printf("job_pool::~job_pool\n");
+		
+		// Notify all workers that they need to die right now.
+		// The flag must be set while holding m_mutex: a worker that has just evaluated its wait
+		// predicate (under the lock) but has not yet blocked would otherwise miss both the flag
+		// change and the notification, and the join() below would never return.
+		{
+			std::lock_guard<std::mutex> lock(m_mutex);
+			m_kill_flag = true;
+		}
+		
+		m_has_work.notify_all();
+
+		// Wait for all workers to die.
+		for (uint32_t i = 0; i < m_threads.size(); i++)
+			m_threads[i].join();
+	}
+				
+	// Queues a copy of job. A worker is woken only when the queue holds more than one job
+	// after the insertion.
+	void job_pool::add_job(const std::function<void()>& job)
+	{
+		size_t pending_jobs = 0;
+
+		{
+			// Hold the mutex only while touching the queue.
+			std::lock_guard<std::mutex> guard(m_mutex);
+
+			m_queue.emplace_back(job);
+			pending_jobs = m_queue.size();
+		}
+
+		if (pending_jobs > 1)
+			m_has_work.notify_one();
+	}
+
+	// Queues job by moving it into the queue. A worker is woken only when the queue holds
+	// more than one job after the insertion.
+	void job_pool::add_job(std::function<void()>&& job)
+	{
+		size_t pending_jobs = 0;
+
+		{
+			// Hold the mutex only while touching the queue.
+			std::lock_guard<std::mutex> guard(m_mutex);
+
+			m_queue.emplace_back(std::move(job));
+			pending_jobs = m_queue.size();
+		}
+
+		if (pending_jobs > 1)
+			m_has_work.notify_one();
+	}
+
+	// Blocks until every queued job has finished. The calling thread helps out by running
+	// queued jobs itself, then waits for jobs still executing on worker threads.
+	void job_pool::wait_for_all()
+	{
+		std::unique_lock<std::mutex> lock(m_mutex);
+
+		// Drain the job queue on the calling thread.
+		while (!m_queue.empty())
+		{
+			// Jobs are taken from the back of the queue (most recently added first).
+			std::function<void()> job(m_queue.back());
+			m_queue.pop_back();
+
+			// Run the job without holding the mutex so workers can keep pulling from the queue.
+			lock.unlock();
+
+			job();
+
+			lock.lock();
+		}
+
+		// The queue is empty, now wait for all active jobs to finish up.
+		m_no_more_jobs.wait(lock, [this]{ return !m_num_active_jobs; } );
+	}
+
+	// Worker thread main loop: sleeps until a job is queued or the kill flag is set, runs one
+	// job at a time outside the mutex, and signals m_no_more_jobs once the queue is empty and no
+	// job is active. index is only used for the startup debug message.
+	void job_pool::job_thread(uint32_t index)
+	{
+		debug_printf("job_pool::job_thread: starting %u\n", index);
+		
+		while (true)
+		{
+			std::unique_lock<std::mutex> lock(m_mutex);
+
+			// Wait for any jobs to be issued.
+			m_has_work.wait(lock, [this] { return m_kill_flag || m_queue.size(); } );
+
+			// Check to see if we're supposed to exit.
+			if (m_kill_flag)
+				break;
+
+			// Get the job and execute it.
+			std::function<void()> job(m_queue.back());
+			m_queue.pop_back();
+
+			// Counted while the mutex is held, so wait_for_all() cannot see an empty queue and a
+			// zero active count while this job is still pending.
+			++m_num_active_jobs;
+
+			lock.unlock();
+
+			job();
+
+			lock.lock();
+
+			--m_num_active_jobs;
+
+			// Now check if there are no more jobs remaining. 
+			const bool all_done = m_queue.empty() && !m_num_active_jobs;
+			
+			lock.unlock();
+
+			// Notify after releasing the mutex.
+			if (all_done)
+				m_no_more_jobs.notify_all();
+		}
+
+		debug_printf("job_pool::job_thread: exiting\n");
+	}
+
+	// .TGA image loading
+
+	// Fixed-size header found at the very start of a .TGA file. Declared under pack(1) so that
+	// read_tga() can overlay it directly on the raw file bytes.
+	#pragma pack(push)
+	#pragma pack(1)
+	struct tga_header
+	{
+		uint8_t			m_id_len;		// size in bytes of the image ID field following the header (read_tga() skips it)
+		uint8_t			m_cmap;			// color map flag: read_tga() only accepts 0 or 1
+		uint8_t			m_type;			// image type; values above 8 are treated by read_tga() as RLE variants of (type - 8)
+		packed_uint<2>	m_cmap_first;	// first color map index; read_tga() rejects non-zero values
+		packed_uint<2> m_cmap_len;		// number of color map entries
+		uint8_t			m_cmap_bpp;		// bits per color map entry
+		packed_uint<2> m_x_org;			// not used by read_tga()
+		packed_uint<2> m_y_org;			// not used by read_tga()
+		packed_uint<2> m_width;			// image width in pixels
+		packed_uint<2> m_height;		// image height in pixels
+		uint8_t			m_depth;		// bits per pixel
+		uint8_t			m_desc;			// descriptor: bit 4 = X flip, bit 5 = rows stored top-down, bits 6-7 must be 0 for read_tga()
+	};
+	#pragma pack(pop)
+
+	// Largest width or height (in pixels) that read_tga() accepts.
+	const uint32_t MAX_TGA_IMAGE_SIZE = 16384;
+
+	// Base image types understood by read_tga(). The header's type field holds one of these
+	// values, or the value plus 8 when the pixel data is RLE compressed.
+	enum tga_image_type
+	{
+		cITPalettized = 1,
+		cITRGB = 2,
+		cITGrayscale = 3
+	};
+
+	// Decodes a .TGA image held in memory into a newly malloc()'ed 8-bit-per-channel buffer.
+	//
+	//   pBuf, buf_size  - the complete file contents.
+	//   width, height   - receive the image dimensions in pixels.
+	//   n_chans         - receives the number of channels per output pixel (3 = RGB, 4 = RGBA).
+	//
+	// Returns a pointer to height rows of width * n_chans bytes each (tightly packed, R first,
+	// rows and columns re-ordered according to the header's flip bits), or nullptr if the file is
+	// unsupported, malformed, truncated, or memory could not be allocated. The caller releases the
+	// buffer with free().
+	//
+	// The three outputs are zeroed on entry, but may already hold header values when a later
+	// failure returns nullptr, so only the return value indicates success.
+	//
+	// Supported: palettized (8-bit indices), RGB (15/16/24/32-bit) and grayscale (8/16-bit) images,
+	// each either raw or RLE compressed.
+	uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans)
+	{
+		width = 0;
+		height = 0;
+		n_chans = 0;
+
+		if (buf_size <= sizeof(tga_header))
+			return nullptr;
+
+		const tga_header &hdr = *reinterpret_cast<const tga_header *>(pBuf);
+
+		if ((!hdr.m_width) || (!hdr.m_height) || (hdr.m_width > MAX_TGA_IMAGE_SIZE) || (hdr.m_height > MAX_TGA_IMAGE_SIZE))
+			return nullptr;
+
+		// The top two descriptor bits must be clear.
+		if (hdr.m_desc >> 6)
+			return nullptr;
+
+		// Simple validation
+		if ((hdr.m_cmap != 0) && (hdr.m_cmap != 1))
+			return nullptr;
+		
+		if (hdr.m_cmap)
+		{
+			if ((hdr.m_cmap_bpp == 0) || (hdr.m_cmap_bpp > 32))
+				return nullptr;
+
+			// Nobody implements CMapFirst correctly, so we're not supporting it. Never seen it used, either.
+			if (hdr.m_cmap_first != 0)
+				return nullptr;
+		}
+
+		const bool x_flipped = (hdr.m_desc & 0x10) != 0;
+		// Bit 5 clear means the file stores its rows bottom-up, so they are reversed on output.
+		const bool y_flipped = (hdr.m_desc & 0x20) == 0;
+
+		// Types above 8 are the RLE compressed counterparts of (type - 8).
+		bool rle_flag = false;
+		int file_image_type = hdr.m_type;
+		if (file_image_type > 8)
+		{
+			file_image_type -= 8;
+			rle_flag = true;
+		}
+
+		const tga_image_type image_type = static_cast<tga_image_type>(file_image_type);
+
+		switch (file_image_type)
+		{
+		case cITRGB:
+			if (hdr.m_depth == 8)
+				return nullptr;
+			break;
+		case cITPalettized:
+			if ((hdr.m_depth != 8) || (hdr.m_cmap != 1) || (hdr.m_cmap_len == 0))
+				return nullptr;
+			break;
+		case cITGrayscale:
+			if ((hdr.m_cmap != 0) || (hdr.m_cmap_len != 0))
+				return nullptr;
+			if ((hdr.m_depth != 8) && (hdr.m_depth != 16))
+				return nullptr;
+			break;
+		default:
+			return nullptr;
+		}
+
+		uint32_t tga_bytes_per_pixel = 0;
+
+		switch (hdr.m_depth)
+		{
+		case 32:
+			tga_bytes_per_pixel = 4;
+			n_chans = 4;
+			break;
+		case 24:
+			tga_bytes_per_pixel = 3;
+			n_chans = 3;
+			break;
+		case 16:
+		case 15:
+			tga_bytes_per_pixel = 2;
+			// For compatibility with stb_image_write.h
+			n_chans = ((file_image_type == cITGrayscale) && (hdr.m_depth == 16)) ? 4 : 3;
+			break;
+		case 8:
+			tga_bytes_per_pixel = 1;
+			// For palettized RGBA support, which both FreeImage and stb_image support.
+			n_chans = ((file_image_type == cITPalettized) && (hdr.m_cmap_bpp == 32)) ? 4 : 3;
+			break;
+		default:
+			return nullptr;
+		}
+
+		const uint8_t *pSrc = pBuf + sizeof(tga_header);
+		uint32_t bytes_remaining = buf_size - sizeof(tga_header);
+
+		// Skip the optional image ID field.
+		if (hdr.m_id_len)
+		{
+			if (bytes_remaining < hdr.m_id_len)
+				return nullptr;
+			pSrc += hdr.m_id_len;
+			// The skipped bytes are consumed: the budget must shrink, otherwise the bounds checks
+			// further down would permit reads past the end of pBuf on truncated files.
+			bytes_remaining -= hdr.m_id_len;
+		}
+
+		// Entries the file does not define stay opaque black.
+		color_rgba pal[256];
+		for (uint32_t i = 0; i < 256; i++)
+			pal[i].set(0, 0, 0, 255);
+
+		if ((hdr.m_cmap) && (hdr.m_cmap_len))
+		{
+			if (image_type == cITPalettized)
+			{
+				// Note I cannot find any files using 32bpp palettes in the wild (never seen any in ~30 years).
+				if ( ((hdr.m_cmap_bpp != 32) && (hdr.m_cmap_bpp != 24) && (hdr.m_cmap_bpp != 15) && (hdr.m_cmap_bpp != 16)) || (hdr.m_cmap_len > 256) )
+					return nullptr;
+
+				if (hdr.m_cmap_bpp == 32)
+				{
+					const uint32_t pal_size = hdr.m_cmap_len * 4;
+					if (bytes_remaining < pal_size)
+						return nullptr;
+
+					// Entries are stored as B, G, R, A.
+					for (uint32_t i = 0; i < hdr.m_cmap_len; i++)
+					{
+						pal[i].r = pSrc[i * 4 + 2];
+						pal[i].g = pSrc[i * 4 + 1];
+						pal[i].b = pSrc[i * 4 + 0];
+						pal[i].a = pSrc[i * 4 + 3];
+					}
+
+					bytes_remaining -= pal_size;
+					pSrc += pal_size;
+				}
+				else if (hdr.m_cmap_bpp == 24)
+				{
+					const uint32_t pal_size = hdr.m_cmap_len * 3;
+					if (bytes_remaining < pal_size)
+						return nullptr;
+
+					// Entries are stored as B, G, R.
+					for (uint32_t i = 0; i < hdr.m_cmap_len; i++)
+					{
+						pal[i].r = pSrc[i * 3 + 2];
+						pal[i].g = pSrc[i * 3 + 1];
+						pal[i].b = pSrc[i * 3 + 0];
+						pal[i].a = 255;
+					}
+
+					bytes_remaining -= pal_size;
+					pSrc += pal_size;
+				}
+				else
+				{
+					// 15/16-bit entries: little-endian words holding three 5-bit fields.
+					const uint32_t pal_size = hdr.m_cmap_len * 2;
+					if (bytes_remaining < pal_size)
+						return nullptr;
+
+					for (uint32_t i = 0; i < hdr.m_cmap_len; i++)
+					{
+						const uint32_t v = pSrc[i * 2 + 0] | (pSrc[i * 2 + 1] << 8);
+
+						// Expand each 5-bit field to 8 bits with rounding.
+						pal[i].r = (((v >> 10) & 31) * 255 + 15) / 31;
+						pal[i].g = (((v >> 5) & 31) * 255 + 15) / 31;
+						pal[i].b = ((v & 31) * 255 + 15) / 31;
+						pal[i].a = 255;
+					}
+
+					bytes_remaining -= pal_size;
+					pSrc += pal_size;
+				}
+			}
+			else
+			{
+				// A color map is present but the image is not palettized: skip over it.
+				// Entry sizes are rounded up to whole bytes (a 15-bit entry occupies 2 bytes).
+				const uint32_t bytes_to_skip = ((hdr.m_cmap_bpp + 7) >> 3) * hdr.m_cmap_len;
+				if (bytes_remaining < bytes_to_skip)
+					return nullptr;
+				pSrc += bytes_to_skip;
+				// Consumed bytes shrink the budget (see the image ID skip above).
+				bytes_remaining -= bytes_to_skip;
+			}
+		}
+		
+		width = hdr.m_width;
+		height = hdr.m_height;
+
+		const uint32_t source_pitch = width * tga_bytes_per_pixel;
+		const uint32_t dest_pitch = width * n_chans;
+		
+		uint8_t *pImage = (uint8_t *)malloc(dest_pitch * height);
+		if (!pImage)
+			return nullptr;
+
+		// Scratch buffer holding one decompressed scanline (RLE files only).
+		std::vector<uint8_t> input_line_buf;
+		if (rle_flag)
+			input_line_buf.resize(source_pitch);
+
+		// RLE state lives outside the row loop because a packet may continue on the next scanline.
+		int run_type = 0, run_remaining = 0;
+		uint8_t run_pixel[4];
+		memset(run_pixel, 0, sizeof(run_pixel));
+
+		for (int y = 0; y < height; y++)
+		{
+			const uint8_t *pLine_data;
+
+			if (rle_flag)
+			{
+				int pixels_remaining = width;
+				uint8_t *pDst = &input_line_buf[0];
+
+				do 
+				{
+					if (!run_remaining)
+					{
+						// Start of a new packet: the top bit selects run vs. raw, the low 7 bits hold (count - 1).
+						if (bytes_remaining < 1)
+						{
+							free(pImage);
+							return nullptr;
+						}
+
+						int v = *pSrc++;
+						bytes_remaining--;
+
+						run_type = v & 0x80;
+						run_remaining = (v & 0x7F) + 1;
+
+						if (run_type)
+						{
+							// A run packet carries the single pixel value to repeat.
+							if (bytes_remaining < tga_bytes_per_pixel)
+							{
+								free(pImage);
+								return nullptr;
+							}
+
+							memcpy(run_pixel, pSrc, tga_bytes_per_pixel);
+							pSrc += tga_bytes_per_pixel;
+							bytes_remaining -= tga_bytes_per_pixel;
+						}
+					}
+
+					// Emit as much of the current packet as fits on this scanline.
+					const uint32_t n = basisu::minimum<uint32_t>(pixels_remaining, run_remaining);
+					pixels_remaining -= n;
+					run_remaining -= n;
+
+					if (run_type)
+					{
+						for (uint32_t i = 0; i < n; i++)
+							for (uint32_t j = 0; j < tga_bytes_per_pixel; j++)
+								*pDst++ = run_pixel[j];
+					}
+					else
+					{
+						const uint32_t bytes_wanted = n * tga_bytes_per_pixel;
+
+						if (bytes_remaining < bytes_wanted)
+						{
+							free(pImage);
+							return nullptr;
+						}
+
+						memcpy(pDst, pSrc, bytes_wanted);
+						pDst += bytes_wanted;
+
+						pSrc += bytes_wanted;
+						bytes_remaining -= bytes_wanted;
+					}
+
+				} while (pixels_remaining);
+
+				assert((pDst - &input_line_buf[0]) == width * tga_bytes_per_pixel);
+
+				pLine_data = &input_line_buf[0];
+			}
+			else
+			{
+				// Raw image: read the scanline straight from the input buffer.
+				if (bytes_remaining < source_pitch)
+				{
+					free(pImage);
+					return nullptr;
+				}
+
+				pLine_data = pSrc;
+				bytes_remaining -= source_pitch;
+				pSrc += source_pitch;
+			}
+
+			// Convert to 24bpp RGB or 32bpp RGBA.
+			// Flips are applied by choosing the destination row and, for X, walking the row backwards.
+			uint8_t *pDst = pImage + (y_flipped ? (height - 1 - y) : y) * dest_pitch + (x_flipped ? (width - 1) * n_chans : 0);
+			const int dst_stride = x_flipped ? -((int)n_chans) : n_chans;
+
+			switch (hdr.m_depth)
+			{
+			case 32:
+				assert(tga_bytes_per_pixel == 4 && n_chans == 4);
+				for (int i = 0; i < width; i++, pLine_data += 4, pDst += dst_stride)
+				{
+					pDst[0] = pLine_data[2];
+					pDst[1] = pLine_data[1];
+					pDst[2] = pLine_data[0];
+					pDst[3] = pLine_data[3];
+				}
+				break;
+			case 24:
+				assert(tga_bytes_per_pixel == 3 && n_chans == 3);
+				for (int i = 0; i < width; i++, pLine_data += 3, pDst += dst_stride)
+				{
+					pDst[0] = pLine_data[2];
+					pDst[1] = pLine_data[1];
+					pDst[2] = pLine_data[0];
+				}
+				break;
+			case 16:
+			case 15:
+				if (image_type == cITRGB)
+				{
+					assert(tga_bytes_per_pixel == 2 && n_chans == 3);
+					for (int i = 0; i < width; i++, pLine_data += 2, pDst += dst_stride)
+					{
+						const uint32_t v = pLine_data[0] | (pLine_data[1] << 8);
+						pDst[0] = (((v >> 10) & 31) * 255 + 15) / 31;
+						pDst[1] = (((v >> 5) & 31) * 255 + 15) / 31;
+						pDst[2] = ((v & 31) * 255 + 15) / 31;
+					}
+				}
+				else
+				{
+					// 16-bit grayscale: first byte is luma (replicated to RGB), second byte is alpha.
+					assert(image_type == cITGrayscale && tga_bytes_per_pixel == 2 && n_chans == 4);
+					for (int i = 0; i < width; i++, pLine_data += 2, pDst += dst_stride)
+					{
+						pDst[0] = pLine_data[0];
+						pDst[1] = pLine_data[0];
+						pDst[2] = pLine_data[0];
+						pDst[3] = pLine_data[1];
+					}
+				}
+				break;
+			case 8:
+				assert(tga_bytes_per_pixel == 1);
+				if (image_type == cITPalettized)
+				{
+					if (hdr.m_cmap_bpp == 32)
+					{
+						assert(n_chans == 4);
+						for (int i = 0; i < width; i++, pLine_data++, pDst += dst_stride)
+						{
+							const uint32_t c = *pLine_data;
+							pDst[0] = pal[c].r;
+							pDst[1] = pal[c].g;
+							pDst[2] = pal[c].b;
+							pDst[3] = pal[c].a;
+						}
+					}
+					else
+					{
+						assert(n_chans == 3);
+						for (int i = 0; i < width; i++, pLine_data++, pDst += dst_stride)
+						{
+							const uint32_t c = *pLine_data;
+							pDst[0] = pal[c].r;
+							pDst[1] = pal[c].g;
+							pDst[2] = pal[c].b;
+						}
+					}
+				}
+				else
+				{
+					// 8-bit grayscale: replicate the value into R, G and B.
+					assert(n_chans == 3);
+					for (int i = 0; i < width; i++, pLine_data++, pDst += dst_stride)
+					{
+						const uint8_t c = *pLine_data;
+						pDst[0] = c;
+						pDst[1] = c;
+						pDst[2] = c;
+					}
+				}
+				break;
+			default:
+				assert(0);
+				break;
+			}
+		} // y
+
+		return pImage;
+	}
+
+	// Loads the named file into memory and decodes it with the in-memory read_tga() overload.
+	// Returns nullptr (with all three outputs zeroed) when the file cannot be read, is empty,
+	// or is too large for a 32-bit size; otherwise returns whatever the in-memory overload returns.
+	uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans)
+	{
+		n_chans = 0;
+		height = 0;
+		width = 0;
+
+		uint8_vec tga_bytes;
+		const bool loaded = read_file_to_vec(pFilename, tga_bytes);
+		if (!loaded)
+			return nullptr;
+
+		const auto num_bytes = tga_bytes.size();
+		if ((num_bytes == 0) || (num_bytes > UINT32_MAX))
+			return nullptr;
+		
+		return read_tga(&tga_bytes[0], (uint32_t)num_bytes, width, height, n_chans);
+	}
+
+	// Draws printf-style formatted text into the image using the built-in 8x8 debug font.
+	//   x_ofs, y_ofs     - position of the first glyph.
+	//   scale_x, scale_y - size in pixels of each font bit.
+	//   fg               - color used for set font bits.
+	//   pBG              - color used for clear font bits; if null those pixels are left untouched.
+	//   alpha_only       - selects fill_box_alpha() instead of fill_box() for drawing.
+	// Characters outside 32..127 are drawn as '.'. When the next glyph would extend past m_width,
+	// the cursor returns to the starting X offset and moves down by one glyph row.
+	void image::debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t scale_x, uint32_t scale_y, const color_rgba& fg, const color_rgba* pBG, bool alpha_only, const char* pFmt, ...)
+	{
+		char text[2048];
+
+		va_list va;
+		va_start(va, pFmt);
+#ifdef _WIN32		
+		vsprintf_s(text, sizeof(text), pFmt, va);
+#else
+		vsnprintf(text, sizeof(text), pFmt, va);
+#endif
+		va_end(va);
+
+		const uint32_t left_margin = x_ofs;
+		const uint32_t glyph_w = 8 * scale_x;
+		const uint32_t glyph_h = 8 * scale_y;
+
+		for (const char* pCur = text; *pCur; ++pCur)
+		{
+			uint8_t ch = *pCur;
+			if ((ch < 32) || (ch > 127))
+				ch = '.';
+
+			// The font table starts at character code 32.
+			const uint8_t* pGlyph = &g_debug_font8x8_basic[ch - 32][0];
+
+			for (uint32_t row = 0; row < 8; row++)
+			{
+				const uint32_t row_bits = pGlyph[row];
+
+				for (uint32_t col = 0; col < 8; col++)
+				{
+					const bool lit = (row_bits & (1 << col)) != 0;
+
+					const color_rgba* pColor = lit ? &fg : pBG;
+					if (!pColor)
+						continue;
+
+					const uint32_t px = x_ofs + col * scale_x;
+					const uint32_t py = y_ofs + row * scale_y;
+
+					if (alpha_only)
+						fill_box_alpha(px, py, scale_x, scale_y, *pColor);
+					else
+						fill_box(px, py, scale_x, scale_y, *pColor);
+				}
+			}
+
+			x_ofs += glyph_w;
+			if ((x_ofs + glyph_w) > m_width)
+			{
+				x_ofs = left_margin;
+				y_ofs += glyph_h;
+			}
+		}
+	}
+		
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_enc.h b/thirdparty/basis_universal/encoder/basisu_enc.h
new file mode 100644
index 0000000000..05c95cbc3b
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_enc.h
@@ -0,0 +1,3127 @@
+// basisu_enc.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu.h"
+#include "../transcoder/basisu_transcoder_internal.h"
+
+#include <mutex>
+#include <atomic>
+#include <condition_variable>
+#include <functional>
+#include <thread>
+#include <unordered_map>
+#include <ostream>
+
+#if !defined(_WIN32) || defined(__MINGW32__)
+#include <libgen.h>
+#endif
+
+// This module is really just a huge grab bag of classes and helper functions needed by the encoder.
+
+// If BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE is 1, quality in perceptual mode will be slightly greater, but at a large increase in encoding CPU time.
+#define BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE (0)
+
+namespace basisu
+{
+	extern uint8_t g_hamming_dist[256];
+	extern const uint8_t g_debug_font8x8_basic[127 - 32 + 1][8];
+
+	// Encoder library initialization.
+	// This function MUST be called before encoding anything!
+	void basisu_encoder_init();
+
+	// basisu_kernels_sse.cpp - will be a no-op and g_cpu_supports_sse41 will always be false unless compiled with BASISU_SUPPORT_SSE=1
+	extern void detect_sse41();
+
+#if BASISU_SUPPORT_SSE
+	extern bool g_cpu_supports_sse41;
+#else
+	const bool g_cpu_supports_sse41 = false;
+#endif
+
+	void error_printf(const char *pFmt, ...);
+
+	// Helpers
+
+	// Clamps i to [0, 255]: negative values give 0, values above 255 give 255.
+	inline uint8_t clamp255(int32_t i)
+	{
+		// Any bit set above the low 8 means i is out of range.
+		if (i & 0xFFFFFF00U)
+		{
+			// (i >> 31) is all ones for negative i and zero otherwise, so the complement
+			// truncates to 0 for negatives and 255 for too-large values.
+			return (uint8_t)(~(i >> 31));
+		}
+
+		return (uint8_t)i;
+	}
+
+	// Returns value limited to the range [low, high]; the lower bound is tested first.
+	inline int32_t clampi(int32_t value, int32_t low, int32_t high) 
+	{
+		if (value < low)
+			return low;
+
+		if (value > high)
+			return high;
+
+		return value;
+	}
+
+	// Multiplies two 8-bit quantities as fractions of 255: returns v * a / 255, rounded,
+	// using shifts instead of a division. Intended for v and a in [0, 255].
+	inline uint8_t mul_8(uint32_t v, uint32_t a)
+	{
+		const uint32_t biased = v * a + 128;
+		const uint32_t corrected = biased + (biased >> 8);
+		return (uint8_t)(corrected >> 8);
+	}
+
+	// Reads codesize (<= 64) bits from pBuf starting at bit position bit_offset, least significant
+	// bit first, and advances bit_offset past them. No bounds checking is done on pBuf.
+	inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)
+	{
+		assert(codesize <= 64);
+
+		uint64_t result = 0;
+
+		for (uint32_t num_read = 0; num_read < codesize; )
+		{
+			// Take as many bits as are wanted and still available in the current byte.
+			const uint32_t ofs_in_byte = bit_offset & 7;
+			const uint32_t chunk = minimum<int>(codesize - num_read, 8 - ofs_in_byte);
+
+			const uint32_t shifted = pBuf[bit_offset >> 3] >> ofs_in_byte;
+			const uint32_t chunk_bits = shifted & ((1 << chunk) - 1);
+
+			result |= ((uint64_t)(chunk_bits) << num_read);
+
+			bit_offset += chunk;
+			num_read += chunk;
+		}
+
+		return result;
+	}
+
+	// 32-bit variant of read_bits(): reads codesize (<= 32) bits from pBuf starting at bit position
+	// bit_offset, least significant bit first, and advances bit_offset past them.
+	inline uint32_t read_bits32(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize)
+	{
+		assert(codesize <= 32);
+
+		uint32_t result = 0;
+
+		for (uint32_t num_read = 0; num_read < codesize; )
+		{
+			// Take as many bits as are wanted and still available in the current byte.
+			const uint32_t ofs_in_byte = bit_offset & 7;
+			const uint32_t chunk = minimum<int>(codesize - num_read, 8 - ofs_in_byte);
+
+			const uint32_t shifted = pBuf[bit_offset >> 3] >> ofs_in_byte;
+			const uint32_t chunk_bits = shifted & ((1 << chunk) - 1);
+
+			result |= (chunk_bits << num_read);
+
+			bit_offset += chunk;
+			num_read += chunk;
+		}
+
+		return result;
+	}
+				
+	// Hashing
+	
+	// 32-bit integer mixing function: six add/xor/shift rounds with fixed constants.
+	inline uint32_t bitmix32c(uint32_t v) 
+	{
+		v += 0x7ed55d16 + (v << 12);
+		v ^= 0xc761c23c ^ (v >> 19);
+		v += 0x165667b1 + (v << 5);
+		v = (v << 9) ^ (v + 0xd3a2646c);
+		v += 0xfd7046c5 + (v << 3);
+		v ^= 0xb55a4f09 ^ (v >> 16);
+		return v;
+	}
+
+	// 32-bit integer mixing function built from subtract/xor/shift steps (no constants).
+	inline uint32_t bitmix32(uint32_t v) 
+	{
+		v = v - (v << 6);
+		v = v ^ (v >> 17);
+		v = v - (v << 9);
+		v = v ^ (v << 4);
+		v = v - (v << 3);
+		v = v ^ (v << 10);
+		v = v ^ (v >> 15);
+		return v;
+	}
+
+	// Scrambles a 32-bit seed with a short xor-shift / multiply sequence.
+	inline uint32_t wang_hash(uint32_t seed)
+	{
+		seed = (seed >> 16) ^ (seed ^ 61);
+		seed = seed * 9;
+		seed ^= (seed >> 4);
+		seed = seed * 0x27d4eb2d;
+		seed ^= (seed >> 15);
+		return seed;
+	}
+
+	uint32_t hash_hsieh(const uint8_t* pBuf, size_t len);
+
+	// Hash functor that feeds the raw object bytes of a key to hash_hsieh().
+	// All sizeof(Key) bytes take part in the hash, including any padding bytes.
+	template <typename Key>
+	struct bit_hasher
+	{
+		std::size_t operator()(const Key& k) const
+		{
+			const uint8_t* pKey_bytes = reinterpret_cast<const uint8_t*>(&k);
+			return hash_hsieh(pKey_bytes, sizeof(Key));
+		}
+	};
+
+	// Single-pass accumulator for the count, mean, variance, minimum and maximum of a stream of
+	// doubles. Mean and variance are updated incrementally from a running mean and a running sum
+	// of squared deviations, so the samples themselves are never stored.
+	class running_stat
+	{
+	public:
+		// Every field is initialized (including m_min/m_max) so the getters are well defined
+		// before the first push().
+		running_stat() :
+			m_n(0),
+			m_old_m(0), m_new_m(0), m_old_s(0), m_new_s(0),
+			m_min(0), m_max(0)
+		{
+		}
+
+		// Discards all samples. Getters return 0 again until the next push().
+		void clear()
+		{
+			m_n = 0;
+			m_old_m = m_new_m = 0.0;
+			m_old_s = m_new_s = 0.0;
+			m_min = m_max = 0.0;
+		}
+
+		// Adds one sample.
+		void push(double x)
+		{
+			m_n++;
+			if (m_n == 1)
+			{
+				// First sample: it is the mean, the minimum and the maximum.
+				m_old_m = m_new_m = x;
+				m_old_s = 0.0;
+				m_min = x;
+				m_max = x;
+			}
+			else
+			{
+				m_new_m = m_old_m + (x - m_old_m) / m_n;
+				m_new_s = m_old_s + (x - m_old_m) * (x - m_new_m);
+				m_old_m = m_new_m;
+				m_old_s = m_new_s;
+				m_min = basisu::minimum(x, m_min);
+				m_max = basisu::maximum(x, m_max);
+			}
+		}
+
+		// Number of samples pushed since construction or the last clear().
+		uint32_t get_num() const
+		{
+			return m_n;
+		}
+
+		// Mean of the samples, or 0 when there are none.
+		double get_mean() const
+		{
+			return (m_n > 0) ? m_new_m : 0.0;
+		}
+
+		// Sample variance (divides by n - 1), or 0 with fewer than two samples.
+		double get_variance() const
+		{
+			return ((m_n > 1) ? m_new_s / (m_n - 1) : 0.0);
+		}
+
+		double get_std_dev() const
+		{
+			return sqrt(get_variance());
+		}
+
+		// Smallest sample seen, or 0 when there are none.
+		double get_min() const
+		{
+			return m_min;
+		}
+
+		// Largest sample seen, or 0 when there are none.
+		double get_max() const
+		{
+			return m_max;
+		}
+
+	private:
+		uint32_t m_n;
+		double m_old_m, m_new_m, m_old_s, m_new_s, m_min, m_max;
+	};
+
+	// Linear algebra
+
+	// Fixed-size N-component vector of T with component-wise arithmetic.
+	// The default constructor leaves the components uninitialized. Constructors and set() overloads
+	// that take fewer values than N zero the remaining components; values beyond N are ignored.
+	template <uint32_t N, typename T>
+	class vec
+	{
+	protected:
+		T m_v[N];
+
+	public:
+		enum { num_elements = N };
+
+		inline vec() { }
+		inline vec(eZero) { set_zero();  }
+
+		explicit inline vec(T val) { set(val); }
+		inline vec(T v0, T v1) { set(v0, v1); }
+		inline vec(T v0, T v1, T v2) { set(v0, v1, v2); }
+		inline vec(T v0, T v1, T v2, T v3) { set(v0, v1, v2, v3); }
+		inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; }
+		template <uint32_t OtherN, typename OtherT> inline vec(const vec<OtherN, OtherT> &other) { set(other); }
+
+		inline T operator[](uint32_t i) const { assert(i < N); return m_v[i]; }
+		inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; }
+
+		inline T getX() const { return m_v[0]; }
+		inline T getY() const { static_assert(N >= 2, "N too small"); return m_v[1]; }
+		inline T getZ() const { static_assert(N >= 3, "N too small"); return m_v[2]; }
+		inline T getW() const { static_assert(N >= 4, "N too small"); return m_v[3]; }
+
+		inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false;	return true; }
+		// Lexicographic ordering, component 0 first.
+		inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; }
+
+		inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; }
+
+		// Copies (with conversion) from a vector of another size/type: shared components are
+		// converted with static_cast, any extra components of this vector are zeroed.
+		template <uint32_t OtherN, typename OtherT>
+		inline vec &set(const vec<OtherN, OtherT> &other)
+		{
+			uint32_t i;
+			if ((const void *)(&other) == (const void *)(this))
+				return *this;
+			const uint32_t m = minimum(OtherN, N);
+			for (i = 0; i < m; i++)
+				m_v[i] = static_cast<T>(other[i]);
+			for (; i < N; i++)
+				m_v[i] = 0;
+			return *this;
+		}
+
+		inline vec &set_component(uint32_t index, T val) { assert(index < N); m_v[index] = val; return *this; }
+		inline vec &set(T val) { for (uint32_t i = 0; i < N; i++) m_v[i] = val; return *this; }
+		// Zeroes components [s, e).
+		inline void clear_elements(uint32_t s, uint32_t e) { assert(e <= N); for (uint32_t i = s; i < e; i++) m_v[i] = 0; }
+
+		inline vec &set(T v0, T v1)
+		{
+			m_v[0] = v0;
+			if (N >= 2)
+			{
+				m_v[1] = v1;
+				clear_elements(2, N);
+			}
+			return *this;
+		}
+
+		inline vec &set(T v0, T v1, T v2)
+		{
+			m_v[0] = v0;
+			if (N >= 2)
+			{
+				m_v[1] = v1;
+				if (N >= 3)
+				{
+					m_v[2] = v2;
+					clear_elements(3, N);
+				}
+			}
+			return *this;
+		}
+
+		inline vec &set(T v0, T v1, T v2, T v3)
+		{
+			m_v[0] = v0;
+			if (N >= 2)
+			{
+				m_v[1] = v1;
+				if (N >= 3)
+				{
+					m_v[2] = v2;
+
+					if (N >= 4)
+					{
+						m_v[3] = v3;
+						// Components 0..3 were just written, so zeroing starts at index 4
+						// (starting at 5 would leave component 4 unset when N >= 5).
+						clear_elements(4, N);
+					}
+				}
+			}
+			return *this;
+		}
+
+		inline vec &operator=(const vec &rhs) { if (this != &rhs) for (uint32_t i = 0; i < N; i++) m_v[i] = rhs.m_v[i]; return *this; }
+		template <uint32_t OtherN, typename OtherT> inline vec &operator=(const vec<OtherN, OtherT> &rhs) { set(rhs); return *this; }
+
+		inline const T *get_ptr() const { return reinterpret_cast<const T *>(&m_v[0]); }
+		inline T *get_ptr() { return reinterpret_cast<T *>(&m_v[0]); }
+		
+		inline vec operator- () const { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = -m_v[i]; return res; }
+		inline vec operator+ () const { return *this; }
+		inline vec &operator+= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] += other.m_v[i]; return *this; }
+		inline vec &operator-= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] -= other.m_v[i]; return *this; }
+		inline vec &operator/= (const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] /= other.m_v[i]; return *this; }
+		inline vec &operator*=(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] *= other.m_v[i]; return *this; }
+		inline vec &operator/= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] /= s; return *this; }
+		inline vec &operator*= (T s) { for (uint32_t i = 0; i < N; i++) m_v[i] *= s; return *this; }
+		
+		friend inline vec operator+(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] + rhs.m_v[i]; return res; }
+		friend inline vec operator-(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] - rhs.m_v[i]; return res; }
+		friend inline vec operator*(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] * val; return res; }
+		friend inline vec operator*(T val, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = val * rhs.m_v[i]; return res; }
+		friend inline vec operator/(const vec &lhs, T val) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / val; return res; }
+		friend inline vec operator/(const vec &lhs, const vec &rhs) { vec res; for (uint32_t i = 0; i < N; i++) res.m_v[i] = lhs.m_v[i] / rhs.m_v[i]; return res; }
+		
+		static inline T dot_product(const vec &lhs, const vec &rhs) { T res = lhs.m_v[0] * rhs.m_v[0]; for (uint32_t i = 1; i < N; i++) res += lhs.m_v[i] * rhs.m_v[i]; return res; }
+
+		inline T dot(const vec &rhs) const { return dot_product(*this, rhs); }
+
+		// Note: norm() is the SQUARED length (dot product with itself); length() takes the square root.
+		inline T norm() const { return dot_product(*this, *this); }
+		inline T length() const { return sqrt(norm()); }
+
+		inline T squared_distance(const vec &other) const { T d2 = 0; for (uint32_t i = 0; i < N; i++) { T d = m_v[i] - other.m_v[i]; d2 += d * d; } return d2; }
+		inline double squared_distance_d(const vec& other) const { double d2 = 0; for (uint32_t i = 0; i < N; i++) { double d = (double)m_v[i] - (double)other.m_v[i]; d2 += d * d; } return d2; }
+
+		inline T distance(const vec &other) const { return static_cast<T>(sqrt(squared_distance(other))); }
+		inline double distance_d(const vec& other) const { return sqrt(squared_distance_d(other)); }
+
+		// Scales the vector to unit length; a zero-length vector is left unchanged.
+		inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len);	return *this; }
+
+		inline vec &clamp(T l, T h)
+		{
+			for (uint32_t i = 0; i < N; i++)
+				m_v[i] = basisu::clamp(m_v[i], l, h);
+			return *this;
+		}
+
+		static vec component_min(const vec& a, const vec& b)
+		{
+			vec res;
+			for (uint32_t i = 0; i < N; i++)
+				res[i] = minimum(a[i], b[i]);
+			return res;
+		}
+
+		static vec component_max(const vec& a, const vec& b)
+		{
+			vec res;
+			for (uint32_t i = 0; i < N; i++)
+				res[i] = maximum(a[i], b[i]);
+			return res;
+		}
+	};
+
+	typedef vec<4, double> vec4D;
+	typedef vec<3, double> vec3D;
+	typedef vec<2, double> vec2D;
+	typedef vec<1, double> vec1D;
+
+	typedef vec<4, float> vec4F;
+	typedef vec<3, float> vec3F;
+	typedef vec<2, float> vec2F;
+	typedef vec<1, float> vec1F;
+		
+	// Rows x Cols matrix of T stored as an array of row vectors.
+	// The default constructor leaves the elements uninitialized.
+	template <uint32_t Rows, uint32_t Cols, typename T>
+	class matrix
+	{
+	public:
+		typedef vec<Rows, T> col_vec;
+		typedef vec<Cols, T> row_vec;
+
+		typedef T scalar_type;
+
+		enum { rows = Rows, cols = Cols };
+
+	protected:
+		row_vec m_r[Rows];
+
+	public:
+		inline matrix() {}
+
+		inline matrix(eZero) { set_zero(); }
+
+		inline matrix(const matrix &other)
+		{
+			for (uint32_t r = 0; r < Rows; r++)
+				m_r[r] = other.m_r[r];
+		}
+
+		inline matrix &operator=(const matrix &rhs)
+		{
+			if (this == &rhs)
+				return *this;
+
+			for (uint32_t r = 0; r < Rows; r++)
+				m_r[r] = rhs.m_r[r];
+
+			return *this;
+		}
+
+		// Element access by (row, column); indices are checked with assert() only.
+		inline T operator()(uint32_t r, uint32_t c) const
+		{
+			assert((r < Rows) && (c < Cols));
+			return m_r[r][c];
+		}
+
+		inline T &operator()(uint32_t r, uint32_t c)
+		{
+			assert((r < Rows) && (c < Cols));
+			return m_r[r][c];
+		}
+
+		// Row access.
+		inline const row_vec &operator[](uint32_t r) const
+		{
+			assert(r < Rows);
+			return m_r[r];
+		}
+
+		inline row_vec &operator[](uint32_t r)
+		{
+			assert(r < Rows);
+			return m_r[r];
+		}
+
+		inline matrix &set_zero()
+		{
+			for (uint32_t r = 0; r < Rows; r++)
+				m_r[r].set_zero();
+			return *this;
+		}
+
+		// Zeroes the matrix, then writes 1 on the main diagonal (as far as it exists when the
+		// matrix is not square).
+		inline matrix &set_identity()
+		{
+			set_zero();
+
+			const uint32_t diag_len = (Rows < Cols) ? Rows : Cols;
+			for (uint32_t d = 0; d < diag_len; d++)
+				m_r[d][d] = 1.0f;
+
+			return *this;
+		}
+	};
+
+	// Estimates the dominant axis of the NxN matrix cmatrix by repeatedly multiplying a trial
+	// vector by the matrix (power iteration) and returns that axis normalized to unit length.
+	// At most 8 iterations are run; the loop stops early once the change between estimates is small.
+	// cmatrix is only read. Presumably a covariance matrix, going by the function name.
+	template<uint32_t N, typename VectorType>
+	inline VectorType compute_pca_from_covar(matrix<N, N, float> &cmatrix)
+	{
+		// Starting guess: all ones for N == 1, otherwise components spread evenly from .75 to 1.25.
+		VectorType axis;
+		if (N == 1)
+			axis.set(1.0f);
+		else
+		{
+			for (uint32_t i = 0; i < N; i++)
+				axis[i] = lerp(.75f, 1.25f, i * (1.0f / maximum<int>(N - 1, 1)));
+		}
+
+		VectorType prev_axis(axis);
+
+		// Power iterations
+		for (uint32_t power_iter = 0; power_iter < 8; power_iter++)
+		{
+			VectorType trial_axis;
+			double max_sum = 0;
+
+			// trial_axis = cmatrix * axis, accumulated in double precision.
+			for (uint32_t i = 0; i < N; i++)
+			{
+				double sum = 0;
+				for (uint32_t j = 0; j < N; j++)
+					sum += cmatrix[i][j] * axis[j];
+
+				trial_axis[i] = static_cast<float>(sum);
+
+				max_sum = maximum(fabs(sum), max_sum);
+			}
+
+			// Rescale so the largest component has magnitude 1, keeping the values in a sane range.
+			if (max_sum != 0.0f)
+				trial_axis *= static_cast<float>(1.0f / max_sum);
+
+			// NOTE(review): prev_axis is refreshed with the pre-update axis only after the delta is
+			// taken, so from the second iteration on the delta is measured against the estimate
+			// from two iterations back - confirm this is intended.
+			VectorType delta_axis(prev_axis - trial_axis);
+
+			prev_axis = axis;
+			axis = trial_axis;
+
+			// norm() is the squared length of the delta.
+			if (delta_axis.norm() < .0024f)
+				break;
+		}
+
+		return axis.normalize_in_place();
+	}
+
+	// Fills pIndices[0..num_indices) with the indices 0..num_indices-1 ordered so that the keys
+	// they refer to are ascending. pKeys itself is not modified; the sort is not stable.
+	template<typename T> inline void indirect_sort(uint32_t num_indices, uint32_t* pIndices, const T* pKeys)
+	{
+		uint32_t* const pEnd = pIndices + num_indices;
+
+		// Start from the identity permutation.
+		uint32_t next_index = 0;
+		for (uint32_t* pCur = pIndices; pCur != pEnd; ++pCur)
+			*pCur = next_index++;
+
+		const auto key_less = [pKeys](uint32_t lhs, uint32_t rhs) { return pKeys[lhs] < pKeys[rhs]; };
+
+		std::sort(pIndices, pEnd, key_less);
+	}
+	
+	// Very simple job pool with no dependencies.
+	// Jobs are plain std::function<void()> objects. Queued jobs are taken from the back of the
+	// queue (most recently added first) by the worker threads, and by the calling thread inside
+	// wait_for_all().
+	class job_pool
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(job_pool);
+
+	public:
+		// num_threads is the TOTAL number of job pool threads, including the calling thread! So 2=1 new thread, 3=2 new threads, etc.
+		job_pool(uint32_t num_threads);
+		// Signals the workers to exit and joins them.
+		~job_pool();
+				
+		// Queues a job (copying or moving it into the pool).
+		void add_job(const std::function<void()>& job);
+		void add_job(std::function<void()>&& job);
+
+		// Runs any still-queued jobs on the calling thread, then blocks until no job is executing.
+		void wait_for_all();
+
+		// Worker threads plus the calling thread.
+		size_t get_total_threads() const { return 1 + m_threads.size(); }
+		
+	private:
+		std::vector<std::thread> m_threads;				// the worker threads
+		std::vector<std::function<void()> > m_queue;	// pending jobs, accessed under m_mutex
+		
+		std::mutex m_mutex;								// guards m_queue and m_num_active_jobs
+		std::condition_variable m_has_work;				// workers wait on this for a job or the kill flag
+		std::condition_variable m_no_more_jobs;			// wait_for_all() waits on this
+		
+		uint32_t m_num_active_jobs;						// jobs currently executing on worker threads
+		
+		std::atomic<bool> m_kill_flag;					// set by the destructor to make the workers exit
+
+		// Worker thread entry point; index is only used for debug output.
+		void job_thread(uint32_t index);
+	};
+
+	// Simple 32-bit color class
+
+	// RGBA color with four signed 16-bit components, addressable either by name (r, g, b, a)
+	// or by index through m_comps. The default constructor leaves the components uninitialized.
+	class color_rgba_i16
+	{
+	public:
+		union
+		{
+			int16_t m_comps[4];
+
+			struct
+			{
+				int16_t r;
+				int16_t g;
+				int16_t b;
+				int16_t a;
+			};
+		};
+
+		inline color_rgba_i16()
+		{
+			static_assert(sizeof(*this) == sizeof(int16_t)*4, "sizeof(*this) == sizeof(int16_t)*4");
+		}
+
+		inline color_rgba_i16(int sr, int sg, int sb, int sa)
+		{
+			set(sr, sg, sb, sa);
+		}
+
+		// Stores the four values, each clamped to the int16_t range.
+		inline color_rgba_i16 &set(int sr, int sg, int sb, int sa)
+		{
+			const int src[4] = { sr, sg, sb, sa };
+
+			for (uint32_t i = 0; i < 4; i++)
+				m_comps[i] = (int16_t)clamp<int>(src[i], INT16_MIN, INT16_MAX);
+
+			return *this;
+		}
+	};
+				
+	class color_rgba
+	{
+	public:
+		union
+		{
+			uint8_t m_comps[4];
+
+			struct
+			{
+				uint8_t r;
+				uint8_t g;
+				uint8_t b;
+				uint8_t a;
+			};
+		};
+
+		inline color_rgba()
+		{
+			static_assert(sizeof(*this) == 4, "sizeof(*this) != 4");
+			static_assert(sizeof(*this) == sizeof(basist::color32), "sizeof(*this) != sizeof(basist::color32)");
+		}
+
+		// Not too hot about this idea.
+		inline color_rgba(const basist::color32& other) :
+			r(other.r),
+			g(other.g),
+			b(other.b),
+			a(other.a)
+		{
+		}
+
+		color_rgba& operator= (const basist::color32& rhs)
+		{
+			r = rhs.r;
+			g = rhs.g;
+			b = rhs.b;
+			a = rhs.a;
+			return *this;
+		}
+
+		inline color_rgba(int y)
+		{
+			set(y);
+		}
+
+		inline color_rgba(int y, int na)
+		{
+			set(y, na);
+		}
+
+		inline color_rgba(int sr, int sg, int sb, int sa)
+		{
+			set(sr, sg, sb, sa);
+		}
+
+		inline color_rgba(eNoClamp, int sr, int sg, int sb, int sa)
+		{
+			set_noclamp_rgba((uint8_t)sr, (uint8_t)sg, (uint8_t)sb, (uint8_t)sa);
+		}
+
+		inline color_rgba& set_noclamp_y(int y)
+		{
+			m_comps[0] = (uint8_t)y;
+			m_comps[1] = (uint8_t)y;
+			m_comps[2] = (uint8_t)y;
+			m_comps[3] = (uint8_t)255;
+			return *this;
+		}
+
+		inline color_rgba &set_noclamp_rgba(int sr, int sg, int sb, int sa)
+		{
+			m_comps[0] = (uint8_t)sr;
+			m_comps[1] = (uint8_t)sg;
+			m_comps[2] = (uint8_t)sb;
+			m_comps[3] = (uint8_t)sa;
+			return *this;
+		}
+
+		inline color_rgba &set(int y)
+		{
+			m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
+			m_comps[1] = m_comps[0];
+			m_comps[2] = m_comps[0];
+			m_comps[3] = 255;
+			return *this;
+		}
+
+		inline color_rgba &set(int y, int na)
+		{
+			m_comps[0] = static_cast<uint8_t>(clamp<int>(y, 0, 255));
+			m_comps[1] = m_comps[0];
+			m_comps[2] = m_comps[0];
+			m_comps[3] = static_cast<uint8_t>(clamp<int>(na, 0, 255));
+			return *this;
+		}
+
+		inline color_rgba &set(int sr, int sg, int sb, int sa)
+		{
+			m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
+			m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
+			m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
+			m_comps[3] = static_cast<uint8_t>(clamp<int>(sa, 0, 255));
+			return *this;
+		}
+
+		inline color_rgba &set_rgb(int sr, int sg, int sb)
+		{
+			m_comps[0] = static_cast<uint8_t>(clamp<int>(sr, 0, 255));
+			m_comps[1] = static_cast<uint8_t>(clamp<int>(sg, 0, 255));
+			m_comps[2] = static_cast<uint8_t>(clamp<int>(sb, 0, 255));
+			return *this;
+		}
+
+		inline color_rgba &set_rgb(const color_rgba &other)
+		{
+			r = other.r;
+			g = other.g;
+			b = other.b;
+			return *this;
+		}
+
+		inline const uint8_t &operator[] (uint32_t index) const { assert(index < 4); return m_comps[index]; }
+		inline uint8_t &operator[] (uint32_t index) { assert(index < 4); return m_comps[index]; }
+		
+		inline void clear()
+		{
+			m_comps[0] = 0;
+			m_comps[1] = 0;
+			m_comps[2] = 0;
+			m_comps[3] = 0;
+		}
+
+		// Two colors are equal when all four components match.
+		inline bool operator== (const color_rgba &rhs) const
+		{
+			return (m_comps[0] == rhs.m_comps[0]) &&
+				(m_comps[1] == rhs.m_comps[1]) &&
+				(m_comps[2] == rhs.m_comps[2]) &&
+				(m_comps[3] == rhs.m_comps[3]);
+		}
+
+		// Negation of operator==.
+		inline bool operator!= (const color_rgba &rhs) const
+		{
+			const bool same = (*this == rhs);
+			return !same;
+		}
+
+		// Lexicographic ordering over components 0..3: the first component that differs decides.
+		inline bool operator<(const color_rgba &rhs) const
+		{
+			for (uint32_t c = 0; c < 4; c++)
+			{
+				const uint8_t mine = m_comps[c];
+				const uint8_t theirs = rhs.m_comps[c];
+				if (mine != theirs)
+					return mine < theirs;
+			}
+
+			// All components are equal.
+			return false;
+		}
+
+		// Weighted sum of components 0..2 in 16.16 fixed point (weights sum to 65536), rounded to nearest.
+		// The weights are approximately .299/.587/.114.
+		inline int get_601_luma() const
+		{
+			const unsigned int weighted = 19595U * m_comps[0] + 38470U * m_comps[1] + 7471U * m_comps[2];
+			return (weighted + 32768U) >> 16U;
+		}
+
+		// Same scheme as get_601_luma() with weights of approximately .2127/.7152/.0722.
+		inline int get_709_luma() const
+		{
+			const unsigned int weighted = 13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2];
+			return (weighted + 32768U) >> 16U;
+		}
+
+		// Selects between the two luma formulas above.
+		inline int get_luma(bool luma_601) const
+		{
+			if (luma_601)
+				return get_601_luma();
+			return get_709_luma();
+		}
+
+		// Builds a basist::color32 from this color's r, g, b and a members.
+		inline basist::color32 get_color32() const
+		{
+			const basist::color32 converted(r, g, b, a);
+			return converted;
+		}
+
+		// Per-component minimum of two colors.
+		static color_rgba comp_min(const color_rgba& a, const color_rgba& b)
+		{
+			return color_rgba(
+				basisu::minimum(a[0], b[0]),
+				basisu::minimum(a[1], b[1]),
+				basisu::minimum(a[2], b[2]),
+				basisu::minimum(a[3], b[3]));
+		}
+
+		// Per-component maximum of two colors.
+		static color_rgba comp_max(const color_rgba& a, const color_rgba& b)
+		{
+			return color_rgba(
+				basisu::maximum(a[0], b[0]),
+				basisu::maximum(a[1], b[1]),
+				basisu::maximum(a[2], b[2]),
+				basisu::maximum(a[3], b[3]));
+		}
+	};
+
+	// Array of color_rgba values.
+	typedef basisu::vector<color_rgba> color_rgba_vec;
+
+	// Frequently used constant colors. The arguments are presumably in (r, g, b, a) order - confirm
+	// against the color_rgba constructor. g_black_trans_color differs from g_black_color only in its
+	// last component (0 instead of 255).
+	const color_rgba g_black_color(0, 0, 0, 255);
+	const color_rgba g_black_trans_color(0, 0, 0, 0);
+	const color_rgba g_white_color(255, 255, 255, 255);
+
+	// Squared Euclidean distance between two RGB triples.
+	inline int color_distance(int r0, int g0, int b0, int r1, int g1, int b1)
+	{
+		const int delta_r = r0 - r1;
+		const int delta_g = g0 - g1;
+		const int delta_b = b0 - b1;
+		return (delta_r * delta_r) + (delta_g * delta_g) + (delta_b * delta_b);
+	}
+
+	// Squared Euclidean distance between two RGBA quadruples.
+	inline int color_distance(int r0, int g0, int b0, int a0, int r1, int g1, int b1, int a1)
+	{
+		const int delta_r = r0 - r1;
+		const int delta_g = g0 - g1;
+		const int delta_b = b0 - b1;
+		const int delta_a = a0 - a1;
+		return (delta_r * delta_r) + (delta_g * delta_g) + (delta_b * delta_b) + (delta_a * delta_a);
+	}
+
+	// Squared distance between two colors; the a member takes part only when `alpha` is true.
+	inline int color_distance(const color_rgba &c0, const color_rgba &c1, bool alpha)
+	{
+		return alpha
+			? color_distance(c0.r, c0.g, c0.b, c0.a, c1.r, c1.g, c1.b, c1.a)
+			: color_distance(c0.r, c0.g, c0.b, c1.r, c1.g, c1.b);
+	}
+		
+	// TODO: Allow user to control channel weightings.
+	// Distance between two colors.
+	//   perceptual - when false this is the plain squared distance of color_distance(e1, e2, alpha);
+	//                when true a luma/chroma weighted metric is used (one of three compile-time variants).
+	//   alpha      - when true the a members contribute to the result.
+	// All perceptual variants derive a luma delta plus two chroma deltas (r minus luma, b minus luma)
+	// and weight the luma term most heavily.
+	inline uint32_t color_distance(bool perceptual, const color_rgba &e1, const color_rgba &e2, bool alpha)
+	{
+		if (perceptual)
+		{
+#if BASISU_USE_HIGH_PRECISION_COLOR_DISTANCE
+			// Floating point variant.
+			const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f;
+			const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f;
+
+			const float cr1 = e1.r - l1;
+			const float cr2 = e2.r - l2;
+
+			const float cb1 = e1.b - l1;
+			const float cb2 = e2.b - l2;
+
+			const float dl = l1 - l2;
+			const float dcr = cr1 - cr2;
+			const float dcb = cb1 - cb2;
+
+			uint32_t d = static_cast<uint32_t>(32.0f*4.0f*dl*dl + 32.0f*2.0f*(.5f / (1.0f - .2126f))*(.5f / (1.0f - .2126f))*dcr*dcr + 32.0f*.25f*(.5f / (1.0f - .0722f))*(.5f / (1.0f - .0722f))*dcb*dcb);
+
+			if (alpha)
+			{
+				int da = static_cast<int>(e1.a) - static_cast<int>(e2.a);
+				d += static_cast<uint32_t>(128.0f*da*da);
+			}
+
+			return d;
+#elif 1
+			// Integer variant with 7 fractional bits (weights 27 + 92 + 9 == 128).
+			const int dr = e1.r - e2.r;
+			const int dg = e1.g - e2.g;
+			const int db = e1.b - e2.b;
+
+			// The deltas are widened to 64 bits before squaring: |delta_cb| can reach 60690, whose square
+			// exceeds INT_MAX, so squaring in int would be signed overflow. Every square still fits in
+			// 32 unsigned bits, so the casts below are lossless and the results are unchanged.
+			const int64_t delta_l = dr * 27 + dg * 92 + db * 9;
+			const int64_t delta_cr = dr * 128 - delta_l;
+			const int64_t delta_cb = db * 128 - delta_l;
+
+			uint32_t id = ((uint32_t)(delta_l * delta_l) >> 7U) +
+				((((uint32_t)(delta_cr * delta_cr) >> 7U) * 26U) >> 7U) +
+				((((uint32_t)(delta_cb * delta_cb) >> 7U) * 3U) >> 7U);
+
+			if (alpha)
+			{
+				// Multiply instead of shifting: the difference may be negative, and left-shifting a
+				// negative value is not portable.
+				const int da = (e1.a - e2.a) * 128;
+				id += ((uint32_t)(da * da) >> 7U);
+			}
+
+			return id;
+#else
+			// 64-bit integer variant; not compiled while the branch above is "#elif 1".
+			int dr = e1.r - e2.r;
+			int dg = e1.g - e2.g;
+			int db = e1.b - e2.b;
+
+			int64_t delta_l = dr * 27 + dg * 92 + db * 9;
+			int64_t delta_cr = dr * 128 - delta_l;
+			int64_t delta_cb = db * 128 - delta_l;
+
+			int64_t id = ((delta_l * delta_l) * 128) +
+				((delta_cr * delta_cr) * 26) +
+				((delta_cb * delta_cb) * 3);
+
+			if (alpha)
+			{
+				int64_t da = (e1.a - e2.a);
+				id += (da * da) * 128;
+			}
+
+			int d = (id + 8192) >> 14;
+
+			return d;
+#endif
+		}
+		else
+			return color_distance(e1, e2, alpha);
+	}
+
+	// Squared distance using only the r and a members of each color.
+	static inline uint32_t color_distance_la(const color_rgba& a, const color_rgba& b)
+	{
+		const int delta_l = a.r - b.r;
+		const int delta_a = a.a - b.a;
+		const int sum_sq = (delta_l * delta_l) + (delta_a * delta_a);
+		return sum_sq;
+	}
+
+	// String helpers
+
+	// Position of the last occurrence of c in filename, or -1 when c does not occur.
+	inline int string_find_right(const std::string& filename, char c)
+	{
+		const size_t pos = filename.rfind(c);
+		if (pos == std::string::npos)
+			return -1;
+		return (int)pos;
+	}
+
+	// Returns the text after the last '.' of filename (without the dot). An empty string is returned
+	// when there is no '.' or when the last '.' lies before the last path separator.
+	inline std::string string_get_extension(const std::string &filename)
+	{
+		int last_sep = -1;
+#ifdef _WIN32
+		last_sep = string_find_right(filename, '\\');
+#endif
+		// '/' is only consulted when no backslash was found (or when not building for Windows).
+		if (last_sep < 0)
+			last_sep = string_find_right(filename, '/');
+
+		const int last_dot = string_find_right(filename, '.');
+		if (last_dot <= last_sep)
+			return "";
+
+		return filename.substr(last_dot + 1);
+	}
+
+	// Truncates filename at its last '.', provided that dot is not located before the last path
+	// separator. Returns false (leaving filename untouched) when there is no such dot.
+	inline bool string_remove_extension(std::string &filename)
+	{
+		int last_sep = -1;
+#ifdef _WIN32
+		last_sep = string_find_right(filename, '\\');
+#endif
+		if (last_sep < 0)
+			last_sep = string_find_right(filename, '/');
+
+		const int last_dot = string_find_right(filename, '.');
+		const bool has_extension = (last_dot >= 0) && (last_dot >= last_sep);
+		if (!has_extension)
+			return false;
+
+		filename.resize(last_dot);
+		return true;
+	}
+
+	// printf-style formatting into a std::string.
+	// The result is produced in a fixed 2048 byte buffer, so longer output is truncated.
+	// If formatting fails an empty string is returned.
+	inline std::string string_format(const char* pFmt, ...)
+	{
+		char buf[2048];
+		// Guarantee a valid (empty) C string even if the formatter writes nothing.
+		buf[0] = '\0';
+
+		va_list args;
+		va_start(args, pFmt);
+#ifdef _WIN32
+		// _TRUNCATE makes oversized output truncate (still null terminated) instead of invoking the
+		// CRT invalid-parameter handler as vsprintf_s does, matching the vsnprintf path below.
+		// Its -1 return value on truncation is therefore not treated as an error here.
+		_vsnprintf_s(buf, sizeof(buf), _TRUNCATE, pFmt, args);
+#else
+		// A negative return signals an output error, in which case the buffer contents are not usable.
+		if (vsnprintf(buf, sizeof(buf), pFmt, args) < 0)
+			buf[0] = '\0';
+#endif
+		va_end(args);
+
+		return std::string(buf);
+	}
+
+	// Returns a copy of s with every byte passed through tolower().
+	inline std::string string_tolower(const std::string& s)
+	{
+		std::string result(s);
+		for (size_t i = 0; i < result.size(); i++)
+		{
+			// tolower() requires a value representable as unsigned char (or EOF); a plain char may be
+			// negative for bytes >= 0x80, so go through unsigned char first.
+			result[i] = (char)tolower((unsigned char)result[i]);
+		}
+		return result;
+	}
+
+	// Bounded string copy that always null terminates the destination (when dst_len is non-zero).
+	//   pDst    - destination buffer
+	//   dst_len - size of the destination buffer in bytes, including room for the terminator
+	//   pSrc    - null terminated source string
+	// A source that does not fit is truncated to dst_len - 1 characters. Returns pDst.
+	inline char *strcpy_safe(char *pDst, size_t dst_len, const char *pSrc)
+	{
+		assert(pDst && pSrc && dst_len);
+		if (dst_len == 0)
+			return pDst;
+
+		const size_t bytes_needed = strlen(pSrc) + 1;
+		if (bytes_needed > dst_len)
+		{
+			// Truncating copy: keep as many characters as fit and terminate manually.
+			const size_t keep = dst_len - 1;
+			if (keep > 0)
+				memcpy(pDst, pSrc, keep);
+			pDst[keep] = '\0';
+			return pDst;
+		}
+
+		// Everything fits, terminator included.
+		memcpy(pDst, pSrc, bytes_needed);
+		return pDst;
+	}
+
+	// True when s is non-empty and its final character is c.
+	inline bool string_ends_with(const std::string& s, char c)
+	{
+		if (s.empty())
+			return false;
+		return s.back() == c;
+	}
+
+	// Splits path p into its components. Any output pointer may be NULL, in which case that
+	// component is not produced. Returns false if the underlying platform call fails.
+	//   pDrive    - drive component (always set to empty on the non-MSVC path)
+	//   pDir      - directory component
+	//   pFilename - filename without extension
+	//   pExt      - extension; on the non-MSVC path it includes the leading '.' when non-empty
+	inline bool string_split_path(const char *p, std::string *pDrive, std::string *pDir, std::string *pFilename, std::string *pExt)
+	{
+#ifdef _MSC_VER
+		char drive_buf[_MAX_DRIVE] = { 0 };
+		char dir_buf[_MAX_DIR] = { 0 };
+		char fname_buf[_MAX_FNAME] = { 0 };
+		char ext_buf[_MAX_EXT] = { 0 };
+
+		// Only request the components the caller asked for (NULL buffer / zero size otherwise).
+		errno_t error = _splitpath_s(p, 
+			pDrive ? drive_buf : NULL, pDrive ? _MAX_DRIVE : 0,
+			pDir ? dir_buf : NULL, pDir ? _MAX_DIR : 0,
+			pFilename ? fname_buf : NULL, pFilename ? _MAX_FNAME : 0,
+			pExt ? ext_buf : NULL, pExt ? _MAX_EXT : 0);
+		if (error != 0)
+			return false;
+
+		if (pDrive) *pDrive = drive_buf;
+		if (pDir) *pDir = dir_buf;
+		if (pFilename) *pFilename = fname_buf;
+		if (pExt) *pExt = ext_buf;
+		return true;
+#else
+		// dirname()/basename() each get their own private copy of the path to work on.
+		// Paths longer than the 1024 byte buffers are truncated by strcpy_safe().
+		char dirtmp[1024], nametmp[1024];
+		strcpy_safe(dirtmp, sizeof(dirtmp), p);
+		strcpy_safe(nametmp, sizeof(nametmp), p);
+
+		if (pDrive)
+			pDrive->resize(0);
+
+		const char *pDirName = dirname(dirtmp);
+		const char* pBaseName = basename(nametmp);
+		if ((!pDirName) || (!pBaseName))
+			return false;
+
+		if (pDir)
+		{
+			// Ensure a non-empty directory ends with a '/'.
+			*pDir = pDirName;
+			if ((pDir->size()) && (pDir->back() != '/'))
+				*pDir += "/";
+		}
+				
+		if (pFilename)
+		{
+			*pFilename = pBaseName;
+			string_remove_extension(*pFilename);
+		}
+
+		if (pExt)
+		{
+			// string_get_extension() returns the extension without its dot, so the dot is added back.
+			*pExt = pBaseName;
+			*pExt = string_get_extension(*pExt);
+			if (pExt->size())
+				*pExt = "." + *pExt;
+		}
+
+		return true;
+#endif
+	}
+
+	// True for '/', and additionally for '\\' when building for Windows.
+	inline bool is_path_separator(char c)
+	{
+		if (c == '/')
+			return true;
+#ifdef _WIN32
+		if (c == '\\')
+			return true;
+#endif
+		return false;
+	}
+		
+	// True for ':' when building for Windows; always false elsewhere.
+	inline bool is_drive_separator(char c)
+	{
+		bool result = false;
+#ifdef _WIN32
+		result = (c == ':');
+#else
+		(void)c;
+#endif
+		return result;
+	}
+
+	// Stores p followed by q into dst, inserting BASISU_PATH_SEPERATOR_CHAR between them when p is
+	// non-empty and neither the end of p nor the start of q is already a path separator.
+	// The result is built in a temporary first, so p or q may point into dst.
+	inline void string_combine_path(std::string &dst, const char *p, const char *q)
+	{
+		std::string combined(p);
+
+		const bool need_separator = !combined.empty() &&
+			!is_path_separator(q[0]) &&
+			!is_path_separator(combined.back());
+		if (need_separator)
+			combined.push_back(BASISU_PATH_SEPERATOR_CHAR);
+
+		combined.append(q);
+		dst.swap(combined);
+	}
+
+	// Three-part variant: joins p and q, then joins that result with r, storing the outcome in dst.
+	inline void string_combine_path(std::string &dst, const char *p, const char *q, const char *r)
+	{
+		std::string first_two;
+		string_combine_path(first_two, p, q);
+		string_combine_path(dst, first_two.c_str(), r);
+	}
+		
+	// Joins p, q and r into dst, then appends pExt. A '.' is inserted before the extension when
+	// pExt is non-empty, does not start with '.', and the joined path does not already end in '.'.
+	inline void string_combine_path_and_extension(std::string &dst, const char *p, const char *q, const char *r, const char *pExt)
+	{
+		string_combine_path(dst, p, q, r);
+
+		const bool ext_lacks_dot = (pExt[0] != '\0') && (pExt[0] != '.');
+		if (ext_lacks_dot && !string_ends_with(dst, '.'))
+			dst.push_back('.');
+
+		dst += pExt;
+	}
+
+	// Extracts the drive and directory parts of p and stores their combination in path.
+	// Returns false (path untouched) when the path could not be split.
+	inline bool string_get_pathname(const char *p, std::string &path)
+	{
+		std::string drive_part, dir_part;
+		const bool split_ok = string_split_path(p, &drive_part, &dir_part, NULL, NULL);
+		if (split_ok)
+			string_combine_path(path, drive_part.c_str(), dir_part.c_str());
+		return split_ok;
+	}
+
+	// Stores the filename of p, extension included, in filename. Returns false when the path could not be split.
+	inline bool string_get_filename(const char *p, std::string &filename)
+	{
+		std::string ext_part;
+		if (string_split_path(p, nullptr, nullptr, &filename, &ext_part))
+		{
+			filename += ext_part;
+			return true;
+		}
+		return false;
+	}
+
+	// Small convenience wrapper around a std::mt19937 engine.
+	class rand
+	{
+	public:
+		// Leaves the engine in its default-constructed state.
+		rand()
+		{
+		}
+
+		rand(uint32_t s)
+		{
+			seed(s);
+		}
+
+		void seed(uint32_t s)
+		{
+			m_mt.seed(s);
+		}
+
+		// between [l,h]
+		int irand(int l, int h)
+		{
+			std::uniform_int_distribution<int> dist(l, h);
+			return dist(m_mt);
+		}
+
+		// Draws from the full int range and reinterprets the value as unsigned.
+		uint32_t urand32()
+		{
+			const int v = irand(INT32_MIN, INT32_MAX);
+			return static_cast<uint32_t>(v);
+		}
+
+		bool bit()
+		{
+			return irand(0, 1) == 1;
+		}
+
+		// Low 8 bits of a urand32() draw.
+		uint8_t byte()
+		{
+			const uint32_t v = urand32();
+			return static_cast<uint8_t>(v);
+		}
+
+		// between [l,h)
+		float frand(float l, float h)
+		{
+			std::uniform_real_distribution<float> dist(l, h);
+			return dist(m_mt);
+		}
+
+		float gaussian(float mean, float stddev)
+		{
+			std::normal_distribution<float> dist(mean, stddev);
+			return dist(m_mt);
+		}
+
+	private:
+		std::mt19937 m_mt;
+	};
+
+	// Binary max-heap of (index, priority) entries: the entry with the largest priority is on top.
+	// Entries are stored 1-based in m_heap (slot 0 is never used), so the parent of slot k is k >> 1
+	// and its children are 2k and 2k + 1.
+	class priority_queue
+	{
+	public:
+		priority_queue() :
+			m_size(0)
+		{
+		}
+
+		// Removes all entries and releases the heap's contents.
+		void clear()
+		{
+			m_heap.clear();
+			m_size = 0;
+		}
+
+		// Sizes the heap for max_entries entries and seeds it with a single entry.
+		void init(uint32_t max_entries, uint32_t first_index, float first_priority)
+		{
+			// Slot 0 is unused and slot 1 is written right below, so at least two slots are needed
+			// even when the caller passes max_entries == 0.
+			const uint32_t capacity = (max_entries < 1) ? 1 : max_entries;
+			m_heap.resize(capacity + 1);
+			m_heap[1].m_index = first_index;
+			m_heap[1].m_priority = first_priority;
+			m_size = 1;
+		}
+
+		// Number of entries currently in the heap.
+		inline uint32_t size() const { return m_size; }
+
+		// Index/priority of the top (largest priority) entry. The heap must not be empty.
+		inline uint32_t get_top_index() const { assert(m_size > 0); return m_heap[1].m_index; }
+		inline float get_top_priority() const { assert(m_size > 0); return m_heap[1].m_priority; }
+
+		// Removes the top entry. The heap must not be empty.
+		inline void delete_top()
+		{
+			assert(m_size > 0);
+			// Move the last entry to the root, then sift it down to restore heap order.
+			m_heap[1] = m_heap[m_size];
+			m_size--;
+			if (m_size)
+				down_heap(1);
+		}
+
+		// Inserts a new entry, growing the storage if needed.
+		inline void add_heap(uint32_t index, float priority)
+		{
+			m_size++;
+
+			uint32_t k = m_size;
+
+			if (m_size >= m_heap.size())
+				m_heap.resize(m_size + 1);
+
+			// Sift up: pull parents down into the hole until one with a strictly larger priority
+			// (or the root) is reached.
+			for (;;)
+			{
+				uint32_t parent_index = k >> 1;
+				if ((!parent_index) || (m_heap[parent_index].m_priority > priority))
+					break;
+				m_heap[k] = m_heap[parent_index];
+				k = parent_index;
+			}
+
+			m_heap[k].m_index = index;
+			m_heap[k].m_priority = priority;
+		}
+
+	private:
+		struct entry
+		{
+			uint32_t m_index;
+			float m_priority;
+		};
+
+		basisu::vector<entry> m_heap;
+		uint32_t m_size;
+
+		// Push down entry at index
+		inline void down_heap(uint32_t heap_index)
+		{
+			uint32_t orig_index = m_heap[heap_index].m_index;
+			const float orig_priority = m_heap[heap_index].m_priority;
+
+			uint32_t child_index;
+			while ((child_index = (heap_index << 1)) <= m_size)
+			{
+				// Pick the larger of the two children (when a right child exists).
+				if ((child_index < m_size) && (m_heap[child_index].m_priority < m_heap[child_index + 1].m_priority)) ++child_index;
+				if (orig_priority > m_heap[child_index].m_priority)
+					break;
+				m_heap[heap_index] = m_heap[child_index];
+				heap_index = child_index;
+			}
+
+			m_heap[heap_index].m_index = orig_index;
+			m_heap[heap_index].m_priority = orig_priority;
+		}
+	};
+
+	// Tree structured vector quantization (TSVQ)
+
+	// Builds a binary tree over a set of weighted training vectors by repeatedly splitting the leaf
+	// with the largest variance in two. The leaves of the finished tree form the codebook.
+	// Typical use: add_training_vec() for every input, generate(), then one of the retrieve() overloads.
+	template <typename TrainingVectorType>
+	class tree_vector_quant
+	{
+	public:
+		typedef TrainingVectorType training_vec_type;
+		typedef std::pair<TrainingVectorType, uint64_t> training_vec_with_weight;
+		typedef basisu::vector< training_vec_with_weight > array_of_weighted_training_vecs;
+
+		tree_vector_quant() :
+			m_next_codebook_index(0)
+		{
+		}
+
+		// Discards all training vectors and the generated tree.
+		void clear()
+		{
+			clear_vector(m_training_vecs);
+			clear_vector(m_nodes);
+			m_next_codebook_index = 0;
+		}
+
+		// Adds one training vector together with its weight.
+		void add_training_vec(const TrainingVectorType &v, uint64_t weight) { m_training_vecs.push_back(std::make_pair(v, weight)); }
+
+		size_t get_total_training_vecs() const { return m_training_vecs.size(); }
+		const array_of_weighted_training_vecs &get_training_vecs() const	{ return m_training_vecs; }
+				array_of_weighted_training_vecs &get_training_vecs()			{ return m_training_vecs; }
+
+		// Appends one entry per leaf node to codebook; each entry lists the training vector indices in that leaf.
+		void retrieve(basisu::vector< basisu::vector<uint32_t> > &codebook) const
+		{
+			for (uint32_t i = 0; i < m_nodes.size(); i++)
+			{
+				const tsvq_node &n = m_nodes[i];
+				if (!n.is_leaf())
+					continue;
+
+				codebook.resize(codebook.size() + 1);
+				codebook.back() = n.m_training_vecs;
+			}
+		}
+
+		// Appends one entry per leaf node to codebook; each entry is that leaf's origin (weighted centroid).
+		void retrieve(basisu::vector<TrainingVectorType> &codebook) const
+		{
+			for (uint32_t i = 0; i < m_nodes.size(); i++)
+			{
+				const tsvq_node &n = m_nodes[i];
+				if (!n.is_leaf())
+					continue;
+
+				codebook.resize(codebook.size() + 1);
+				codebook.back() = n.m_origin;
+			}
+		}
+
+		// Replaces codebook with a coarser clustering of the same tree: the tree is walked from the root
+		// and descent stops at leaves and at internal nodes whose split order (m_codebook_index) is too
+		// late for max_clusters. Each stopping node contributes its training vector indices as one entry.
+		void retrieve(uint32_t max_clusters, basisu::vector<uint_vec> &codebook) const
+		{
+			uint_vec node_stack;
+			node_stack.reserve(512);
+
+			codebook.resize(0);
+			codebook.reserve(max_clusters);
+
+			// Nothing to walk if generate() has not built a tree.
+			if (m_nodes.empty())
+				return;
+
+			uint32_t node_index = 0;
+
+			while (true)
+			{
+				const tsvq_node& cur = m_nodes[node_index];
+
+				if (cur.is_leaf() || ((2 + cur.m_codebook_index) > (int)max_clusters))
+				{
+					codebook.resize(codebook.size() + 1);
+					codebook.back() = cur.m_training_vecs;
+
+					if (node_stack.empty())
+						break;
+
+					node_index = node_stack.back();
+					node_stack.pop_back();
+					continue;
+				}
+
+				// Visit the left child next and remember the right child for later.
+				node_stack.push_back(cur.m_right_index);
+				node_index = cur.m_left_index;
+			}
+		}
+
+		// Builds the tree, splitting until there are max_size leaves or no splittable leaf remains.
+		// Returns false when no training vectors have been added.
+		bool generate(uint32_t max_size)
+		{
+			if (!m_training_vecs.size())
+				return false;
+
+			m_next_codebook_index = 0;
+
+			clear_vector(m_nodes);
+			m_nodes.reserve(max_size * 2 + 1);
+
+			m_nodes.push_back(prepare_root());
+
+			// Leaves that may still be split, ordered by variance (largest first).
+			priority_queue var_heap;
+			var_heap.init(max_size, 0, m_nodes[0].m_var);
+
+			basisu::vector<uint32_t> l_children, r_children;
+
+			// Now split the worst nodes
+			l_children.reserve(m_training_vecs.size() + 1);
+			r_children.reserve(m_training_vecs.size() + 1);
+
+			uint32_t total_leaf_nodes = 1;
+
+			while ((var_heap.size()) && (total_leaf_nodes < max_size))
+			{
+				const uint32_t node_index = var_heap.get_top_index();
+				const tsvq_node &node = m_nodes[node_index];
+
+				assert(node.m_var == var_heap.get_top_priority());
+				assert(node.is_leaf());
+
+				var_heap.delete_top();
+
+				if (node.m_training_vecs.size() > 1)
+				{
+					if (split_node(node_index, var_heap, l_children, r_children))
+					{
+						// This removes one leaf node (making an internal node) and replaces it with two new leaves, so +1 total.
+						total_leaf_nodes += 1;
+					}
+				}
+			}
+
+			return true;
+		}
+
+	private:
+		class tsvq_node
+		{
+		public:
+			inline tsvq_node() : m_var(0.0f), m_weight(0), m_origin(cZero), m_left_index(-1), m_right_index(-1), m_codebook_index(-1) { }
+
+			// vecs is erased
+			inline void set(const TrainingVectorType &org, uint64_t weight, float var, basisu::vector<uint32_t> &vecs) { m_origin = org; m_weight = weight; m_var = var; m_training_vecs.swap(vecs); }
+
+			// A node is a leaf until split_node() assigns its child indices.
+			inline bool is_leaf() const { return m_left_index < 0; }
+
+			float m_var;									// weighted sum of squared distances to m_origin
+			uint64_t m_weight;								// total weight of the node's training vectors
+			TrainingVectorType m_origin;					// weighted centroid
+			int32_t m_left_index, m_right_index;			// child node indices, -1 for a leaf
+			basisu::vector<uint32_t> m_training_vecs;		// indices into the quantizer's training vector array
+			int m_codebook_index;							// order in which this node was split, -1 if never split
+		};
+
+		typedef basisu::vector<tsvq_node> tsvq_node_vec;
+		tsvq_node_vec m_nodes;
+
+		array_of_weighted_training_vecs m_training_vecs;
+
+		uint32_t m_next_codebook_index;
+
+		// Creates the root node, which holds every training vector.
+		tsvq_node prepare_root() const
+		{
+			double ttsum = 0.0f;
+
+			// Prepare root node containing all training vectors
+			tsvq_node root;
+			root.m_training_vecs.reserve(m_training_vecs.size());
+
+			for (uint32_t i = 0; i < m_training_vecs.size(); i++)
+			{
+				const TrainingVectorType &v = m_training_vecs[i].first;
+				const uint64_t weight = m_training_vecs[i].second;
+
+				root.m_training_vecs.push_back(i);
+
+				root.m_origin += (v * static_cast<float>(weight));
+				root.m_weight += weight;
+
+				ttsum += v.dot(v) * weight;
+			}
+
+			// sum(w * v.v) - |sum(w * v)|^2 / sum(w); m_origin still holds the weighted sum here.
+			root.m_var = static_cast<float>(ttsum - (root.m_origin.dot(root.m_origin) / root.m_weight));
+
+			root.m_origin *= (1.0f / root.m_weight);
+
+			return root;
+		}
+
+		// Splits leaf node_index into two new leaves appended to m_nodes, queueing each child on var_heap
+		// if it can be split further. l_children/r_children are scratch buffers owned by the caller.
+		// Returns false (tree unchanged) when no usable split was found.
+		bool split_node(uint32_t node_index, priority_queue &var_heap, basisu::vector<uint32_t> &l_children, basisu::vector<uint32_t> &r_children)
+		{
+			TrainingVectorType l_child_org, r_child_org;
+			uint64_t l_weight = 0, r_weight = 0;
+			float l_var = 0.0f, r_var = 0.0f;
+
+			// Compute initial left/right child origins
+			if (!prep_split(m_nodes[node_index], l_child_org, r_child_org))
+				return false;
+
+			// Use k-means iterations to refine these children vectors
+			if (!refine_split(m_nodes[node_index], l_child_org, l_weight, l_var, l_children, r_child_org, r_weight, r_var, r_children))
+				return false;
+
+			// Create children
+			const uint32_t l_child_index = (uint32_t)m_nodes.size(), r_child_index = (uint32_t)m_nodes.size() + 1;
+
+			m_nodes[node_index].m_left_index = l_child_index;
+			m_nodes[node_index].m_right_index = r_child_index;
+
+			m_nodes[node_index].m_codebook_index = m_next_codebook_index;
+			m_next_codebook_index++;
+
+			m_nodes.resize(m_nodes.size() + 2);
+
+			tsvq_node &l_child = m_nodes[l_child_index], &r_child = m_nodes[r_child_index];
+
+			l_child.set(l_child_org, l_weight, l_var, l_children);
+			r_child.set(r_child_org, r_weight, r_var, r_children);
+
+			// A computed variance of <= 0 can come from rounding. If the child really contains distinct
+			// vectors, give it a tiny positive variance so it remains eligible for splitting.
+			if ((l_child.m_var <= 0.0f) && (l_child.m_training_vecs.size() > 1))
+			{
+				TrainingVectorType v(m_training_vecs[l_child.m_training_vecs[0]].first);
+
+				for (uint32_t i = 1; i < l_child.m_training_vecs.size(); i++)
+				{
+					if (!(v == m_training_vecs[l_child.m_training_vecs[i]].first))
+					{
+						l_child.m_var = 1e-4f;
+						break;
+					}
+				}
+			}
+
+			if ((r_child.m_var <= 0.0f) && (r_child.m_training_vecs.size() > 1))
+			{
+				TrainingVectorType v(m_training_vecs[r_child.m_training_vecs[0]].first);
+
+				for (uint32_t i = 1; i < r_child.m_training_vecs.size(); i++)
+				{
+					if (!(v == m_training_vecs[r_child.m_training_vecs[i]].first))
+					{
+						r_child.m_var = 1e-4f;
+						break;
+					}
+				}
+			}
+
+			if ((l_child.m_var > 0.0f) && (l_child.m_training_vecs.size() > 1))
+				var_heap.add_heap(l_child_index, l_child.m_var);
+
+			if ((r_child.m_var > 0.0f) && (r_child.m_training_vecs.size() > 1))
+				var_heap.add_heap(r_child_index, r_child.m_var);
+
+			return true;
+		}
+
+		// Returns the axis along which to split node: the result of compute_pca_from_covar() on the
+		// weighted covariance matrix of the node's training vectors.
+		TrainingVectorType compute_split_axis(const tsvq_node &node) const
+		{
+			const uint32_t N = TrainingVectorType::num_elements;
+
+			matrix<N, N, float> cmatrix(cZero);
+
+			// Compute covariance matrix from weighted input vectors
+			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+			{
+				const TrainingVectorType v(m_training_vecs[node.m_training_vecs[i]].first - node.m_origin);
+				const TrainingVectorType w(static_cast<float>(m_training_vecs[node.m_training_vecs[i]].second) * v);
+
+				// Only the upper triangle is accumulated; the matrix is symmetric.
+				for (uint32_t x = 0; x < N; x++)
+					for (uint32_t y = x; y < N; y++)
+						cmatrix[x][y] = cmatrix[x][y] + v[x] * w[y];
+			}
+
+			const float renorm_scale = 1.0f / node.m_weight;
+
+			for (uint32_t x = 0; x < N; x++)
+				for (uint32_t y = x; y < N; y++)
+					cmatrix[x][y] *= renorm_scale;
+
+			// Diagonal flip
+			for (uint32_t x = 0; x < (N - 1); x++)
+				for (uint32_t y = x + 1; y < N; y++)
+					cmatrix[y][x] = cmatrix[x][y];
+
+			return compute_pca_from_covar<N, TrainingVectorType>(cmatrix);
+		}
+
+		// Chooses initial left/right centroids for splitting node.
+		// Returns false when the node's vectors have no extent along any axis.
+		bool prep_split(const tsvq_node &node, TrainingVectorType &l_child_result, TrainingVectorType &r_child_result) const
+		{
+			//const uint32_t N = TrainingVectorType::num_elements;
+
+			// With exactly two vectors, each one becomes a centroid.
+			if (2 == node.m_training_vecs.size())
+			{
+				l_child_result = m_training_vecs[node.m_training_vecs[0]].first;
+				r_child_result = m_training_vecs[node.m_training_vecs[1]].first;
+				return true;
+			}
+
+			TrainingVectorType axis(compute_split_axis(node)), l_child(0.0f), r_child(0.0f);
+			double l_weight = 0.0f, r_weight = 0.0f;
+
+			// Compute initial left/right children
+			for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+			{
+				const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;
+
+				const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;
+
+				// Side of the plane through the node origin, perpendicular to the split axis.
+				double t = (v - node.m_origin).dot(axis);
+				if (t >= 0.0f)
+				{
+					r_child += v * weight;
+					r_weight += weight;
+				}
+				else
+				{
+					l_child += v * weight;
+					l_weight += weight;
+				}
+			}
+
+			if ((l_weight > 0.0f) && (r_weight > 0.0f))
+			{
+				l_child_result = l_child * static_cast<float>(1.0f / l_weight);
+				r_child_result = r_child * static_cast<float>(1.0f / r_weight);
+			}
+			else
+			{
+				// The axis split put everything on one side. Fall back to the bounding box of the vectors.
+				TrainingVectorType l(1e+20f);
+				TrainingVectorType h(-1e+20f);
+				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+				{
+					const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
+
+					l = TrainingVectorType::component_min(l, v);
+					h = TrainingVectorType::component_max(h, v);
+				}
+
+				TrainingVectorType r(h - l);
+
+				float largest_axis_v = 0.0f;
+				int largest_axis_index = -1;
+				for (uint32_t i = 0; i < TrainingVectorType::num_elements; i++)
+				{
+					if (r[i] > largest_axis_v)
+					{
+						largest_axis_v = r[i];
+						largest_axis_index = i;
+					}
+				}
+
+				if (largest_axis_index < 0)
+					return false;
+
+				basisu::vector<float> keys(node.m_training_vecs.size());
+				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+					keys[i] = m_training_vecs[node.m_training_vecs[i]].first[largest_axis_index];
+
+				// NOTE(review): the sorted order in `indices` is never consulted by the loop below, which
+				// halves the vectors in their stored order instead. Presumably the loop was meant to go
+				// through indices[i] - confirm before changing, as that would alter generated codebooks.
+				uint_vec indices(node.m_training_vecs.size());
+				indirect_sort((uint32_t)node.m_training_vecs.size(), &indices[0], &keys[0]);
+
+				l_child.set_zero();
+				l_weight = 0;
+
+				r_child.set_zero();
+				r_weight = 0;
+
+				const uint32_t half_index = (uint32_t)node.m_training_vecs.size() / 2;
+				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+				{
+					const float weight = (float)m_training_vecs[node.m_training_vecs[i]].second;
+
+					const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
+
+					if (i < half_index)
+					{
+						l_child += v * weight;
+						l_weight += weight;
+					}
+					else
+					{
+						r_child += v * weight;
+						r_weight += weight;
+					}
+				}
+
+				if ((l_weight > 0.0f) && (r_weight > 0.0f))
+				{
+					l_child_result = l_child * static_cast<float>(1.0f / l_weight);
+					r_child_result = r_child * static_cast<float>(1.0f / r_weight);
+				}
+				else
+				{
+					// Last resort: use the corners of the bounding box.
+					l_child_result = l;
+					r_child_result = h;
+				}
+			}
+
+			return true;
+		}
+
+		// Refines the two centroids with up to cMaxIters k-means iterations over node's vectors.
+		// On success l_child/r_child hold the final centroids, l_weight/r_weight and l_var/r_var the
+		// matching totals, and l_children/r_children the training vector indices assigned to each side.
+		// Returns false when the vectors cannot be separated into two non-empty groups.
+		bool refine_split(const tsvq_node &node,
+			TrainingVectorType &l_child, uint64_t &l_weight, float &l_var, basisu::vector<uint32_t> &l_children,
+			TrainingVectorType &r_child, uint64_t &r_weight, float &r_var, basisu::vector<uint32_t> &r_children) const
+		{
+			l_children.reserve(node.m_training_vecs.size());
+			r_children.reserve(node.m_training_vecs.size());
+
+			float prev_total_variance = 1e+10f;
+
+			// Refine left/right children locations using k-means iterations
+			const uint32_t cMaxIters = 6;
+			for (uint32_t iter = 0; iter < cMaxIters; iter++)
+			{
+				l_children.resize(0);
+				r_children.resize(0);
+
+				TrainingVectorType new_l_child(cZero), new_r_child(cZero);
+
+				double l_ttsum = 0.0f, r_ttsum = 0.0f;
+
+				l_weight = 0;
+				r_weight = 0;
+
+				// Assign every vector to the nearer centroid (ties go right).
+				for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+				{
+					const TrainingVectorType &v = m_training_vecs[node.m_training_vecs[i]].first;
+					const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;
+
+					double left_dist2 = l_child.squared_distance_d(v), right_dist2 = r_child.squared_distance_d(v);
+
+					if (left_dist2 >= right_dist2)
+					{
+						new_r_child += (v * static_cast<float>(weight));
+						r_weight += weight;
+
+						r_ttsum += weight * v.dot(v);
+						r_children.push_back(node.m_training_vecs[i]);
+					}
+					else
+					{
+						new_l_child += (v * static_cast<float>(weight));
+						l_weight += weight;
+
+						l_ttsum += weight * v.dot(v);
+						l_children.push_back(node.m_training_vecs[i]);
+					}
+				}
+
+				if ((!l_weight) || (!r_weight))
+				{
+					// One side ended up empty. Throw the failed assignment away completely before
+					// partitioning again; without this reset the vectors would be accumulated (and
+					// listed in the child arrays) a second time on top of the first pass.
+					l_children.resize(0);
+					new_l_child.set_zero();
+					l_ttsum = 0.0f;
+					l_weight = 0;
+
+					r_children.resize(0);
+					new_r_child.set_zero();
+					r_ttsum = 0.0f;
+					r_weight = 0;
+
+					// Put the first vector and all vectors equal to it on the right, everything else on the left.
+					TrainingVectorType firstVec;
+					for (uint32_t i = 0; i < node.m_training_vecs.size(); i++)
+					{
+						const TrainingVectorType& v = m_training_vecs[node.m_training_vecs[i]].first;
+						const uint64_t weight = m_training_vecs[node.m_training_vecs[i]].second;
+
+						if ((!i) || (v == firstVec))
+						{
+							firstVec = v;
+
+							new_r_child += (v * static_cast<float>(weight));
+							r_weight += weight;
+
+							r_ttsum += weight * v.dot(v);
+							r_children.push_back(node.m_training_vecs[i]);
+						}
+						else
+						{
+							new_l_child += (v * static_cast<float>(weight));
+							l_weight += weight;
+
+							l_ttsum += weight * v.dot(v);
+							l_children.push_back(node.m_training_vecs[i]);
+						}
+					}
+
+					// Both weights are divisors below, so neither side may be empty.
+					if ((!l_weight) || (!r_weight))
+						return false;
+				}
+
+				// Same variance formula as prepare_root(): new_*_child still hold weighted sums here.
+				l_var = static_cast<float>(l_ttsum - (new_l_child.dot(new_l_child) / l_weight));
+				r_var = static_cast<float>(r_ttsum - (new_r_child.dot(new_r_child) / r_weight));
+
+				new_l_child *= (1.0f / l_weight);
+				new_r_child *= (1.0f / r_weight);
+
+				l_child = new_l_child;
+				r_child = new_r_child;
+
+				float total_var = l_var + r_var;
+				const float cGiveupVariance = .00001f;
+				if (total_var < cGiveupVariance)
+					break;
+
+				// Check to see if the variance has settled
+				const float cVarianceDeltaThresh = .00125f;
+				if (((prev_total_variance - total_var) / total_var) < cVarianceDeltaThresh)
+					break;
+
+				prev_total_variance = total_var;
+			}
+
+			return true;
+		}
+	};
+
+	// A set of indices together with their combined weight.
+	// Neither member is given an initial value by this struct itself.
+	struct weighted_block_group
+	{
+		uint64_t m_total_weight;	// sum of the weights of everything listed in m_indices
+		uint_vec m_indices;
+	};
+
+	// Generates a codebook (and optionally a coarser parent codebook) from the training vectors in q,
+	// optionally spreading the work over several jobs.
+	//   q                        - quantizer already loaded with training vectors
+	//   max_codebook_size        - requested number of codebook entries
+	//   max_parent_codebook_size - requested number of parent codebook entries, 0 to skip the parent codebook
+	//   codebook/parent_codebook - outputs (cleared first); each entry lists training vector indices of q
+	//   max_threads              - maximum number of jobs, internally capped at 16
+	//   limit_clusterizers       - when true each job is limited to its share of max_codebook_size,
+	//                              otherwise a job may create one cluster per training vector it owns
+	//   pJob_pool                - pool the jobs are submitted to (not used when built for __EMSCRIPTEN__)
+	// Returns false when any quantizer fails to generate.
+	template<typename Quantizer>
+	bool generate_hierarchical_codebook_threaded_internal(Quantizer& q,
+		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
+		basisu::vector<uint_vec>& codebook,
+		basisu::vector<uint_vec>& parent_codebook,
+		uint32_t max_threads, bool limit_clusterizers, job_pool *pJob_pool)
+	{
+		codebook.resize(0);
+		parent_codebook.resize(0);
+
+		// Small problems are handled directly on the calling thread.
+		if ((max_threads <= 1) || (q.get_training_vecs().size() < 256) || (max_codebook_size < max_threads * 16))
+		{
+			if (!q.generate(max_codebook_size))
+				return false;
+
+			q.retrieve(codebook);
+
+			if (max_parent_codebook_size)
+				q.retrieve(max_parent_codebook_size, parent_codebook);
+
+			return true;
+		}
+
+		const uint32_t cMaxThreads = 16;
+		if (max_threads > cMaxThreads)
+			max_threads = cMaxThreads;
+
+		// First pass: partition the training vectors into max_threads coarse clusters, one per job.
+		if (!q.generate(max_threads))
+			return false;
+
+		basisu::vector<uint_vec> initial_codebook;
+
+		q.retrieve(initial_codebook);
+
+		// Fewer coarse clusters than jobs: the coarse clusters are used as the final codebook.
+		if (initial_codebook.size() < max_threads)
+		{
+			codebook = initial_codebook;
+
+			if (max_parent_codebook_size)
+				q.retrieve(max_parent_codebook_size, parent_codebook);
+
+			return true;
+		}
+
+		// Per-job state, indexed by thread_iter. Each job only touches its own slot.
+		Quantizer quantizers[cMaxThreads];
+		
+		bool success_flags[cMaxThreads];
+		clear_obj(success_flags);
+
+		basisu::vector<uint_vec> local_clusters[cMaxThreads];
+		basisu::vector<uint_vec> local_parent_clusters[cMaxThreads];
+
+		for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
+		{
+			// When built for __EMSCRIPTEN__ the job wrapper is compiled out and the body below runs inline.
+#ifndef __EMSCRIPTEN__
+			pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] {
+#endif
+
+				Quantizer& lq = quantizers[thread_iter];
+				uint_vec& cluster_indices = initial_codebook[thread_iter];
+
+				// Maps an index within this job's quantizer back to an index into q's training vectors.
+				uint_vec local_to_global(cluster_indices.size());
+
+				for (uint32_t i = 0; i < cluster_indices.size(); i++)
+				{
+					const uint32_t global_training_vec_index = cluster_indices[i];
+					local_to_global[i] = global_training_vec_index;
+
+					lq.add_training_vec(q.get_training_vecs()[global_training_vec_index].first, q.get_training_vecs()[global_training_vec_index].second);
+				}
+
+				const uint32_t max_clusters = limit_clusterizers ? ((max_codebook_size + max_threads - 1) / max_threads) : (uint32_t)lq.get_total_training_vecs();
+
+				success_flags[thread_iter] = lq.generate(max_clusters);
+
+				if (success_flags[thread_iter])
+				{
+					lq.retrieve(local_clusters[thread_iter]);
+
+					// Rewrite the job-local indices as global ones.
+					for (uint32_t i = 0; i < local_clusters[thread_iter].size(); i++)
+					{
+						for (uint32_t j = 0; j < local_clusters[thread_iter][i].size(); j++)
+							local_clusters[thread_iter][i][j] = local_to_global[local_clusters[thread_iter][i][j]];
+					}
+
+					if (max_parent_codebook_size)
+					{
+						lq.retrieve((max_parent_codebook_size + max_threads - 1) / max_threads, local_parent_clusters[thread_iter]);
+
+						for (uint32_t i = 0; i < local_parent_clusters[thread_iter].size(); i++)
+						{
+							for (uint32_t j = 0; j < local_parent_clusters[thread_iter][i].size(); j++)
+								local_parent_clusters[thread_iter][i][j] = local_to_global[local_parent_clusters[thread_iter][i][j]];
+						}
+					}
+				}
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+
+		} // thread_iter
+
+		// The jobs capture locals of this function by reference, so they must all finish before continuing.
+#ifndef __EMSCRIPTEN__
+		pJob_pool->wait_for_all();
+#endif
+
+		uint32_t total_clusters = 0, total_parent_clusters = 0;
+
+		for (int thread_iter = 0; thread_iter < (int)max_threads; thread_iter++)
+		{
+			if (!success_flags[thread_iter])
+				return false;
+			total_clusters += (uint32_t)local_clusters[thread_iter].size();
+			total_parent_clusters += (uint32_t)local_parent_clusters[thread_iter].size();
+		}
+
+		codebook.reserve(total_clusters);
+		parent_codebook.reserve(total_parent_clusters);
+
+		// Concatenate the per-job results in job order; swap() moves each cluster without copying it.
+		for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++)
+		{
+			for (uint32_t j = 0; j < local_clusters[thread_iter].size(); j++)
+			{
+				codebook.resize(codebook.size() + 1);
+				codebook.back().swap(local_clusters[thread_iter][j]);
+			}
+
+			for (uint32_t j = 0; j < local_parent_clusters[thread_iter].size(); j++)
+			{
+				parent_codebook.resize(parent_codebook.size() + 1);
+				parent_codebook.back().swap(local_parent_clusters[thread_iter][j]);
+			}
+		}
+
+		return true;
+	}
+
+	template<typename Quantizer>
+	bool generate_hierarchical_codebook_threaded(Quantizer& q,
+		uint32_t max_codebook_size, uint32_t max_parent_codebook_size,
+		basisu::vector<uint_vec>& codebook,
+		basisu::vector<uint_vec>& parent_codebook,
+		uint32_t max_threads, job_pool *pJob_pool)
+	{
+		// Merges identical training vectors into weighted groups, clusters the groups, then expands each cluster back into training vector indices.
+		typedef std::unordered_map<typename Quantizer::training_vec_type, weighted_block_group,
+			bit_hasher<typename Quantizer::training_vec_type> > group_hash;
+
+		group_hash unique_vecs;
+		const auto &training_vecs = q.get_training_vecs();
+		weighted_block_group single_group;
+		single_group.m_indices.resize(1);
+
+		for (uint32_t vec_index = 0; vec_index < training_vecs.size(); vec_index++)
+		{
+			single_group.m_total_weight = training_vecs[vec_index].second;
+			single_group.m_indices[0] = vec_index;
+
+			auto insert_result = unique_vecs.insert(std::make_pair(training_vecs[vec_index].first, single_group));
+			if (insert_result.second)
+				continue;
+
+			weighted_block_group &existing_group = insert_result.first->second; // duplicate vector: fold it into the group already stored
+			existing_group.m_total_weight += single_group.m_total_weight;
+			existing_group.m_indices.push_back(vec_index);
+		}
+
+		debug_printf("generate_hierarchical_codebook_threaded: %u training vectors, %u unique training vectors\n", q.get_total_training_vecs(), (uint32_t)unique_vecs.size());
+
+		Quantizer group_quant;
+		basisu::vector<typename group_hash::const_iterator> unique_vec_iters;
+		unique_vec_iters.reserve(unique_vecs.size());
+
+		// Feed every unique vector to group_quant and remember its map iterator in the same order.
+		for (auto group_iter = unique_vecs.begin(); group_iter != unique_vecs.end(); ++group_iter)
+		{
+			group_quant.add_training_vec(group_iter->first, group_iter->second.m_total_weight);
+			unique_vec_iters.push_back(group_iter);
+		}
+
+		// The flag is set only when there are more unique vectors than requested codebook entries.
+		const bool limit_clusterizers = (unique_vecs.size() > max_codebook_size);
+
+		debug_printf("Limit clusterizers: %u\n", limit_clusterizers);
+
+		const uint32_t threads_to_use = (unique_vecs.size() < 65536*4) ? 1 : max_threads; // below this count the internal pass gets one thread
+		basisu::vector<uint_vec> group_codebook, group_parent_codebook;
+		const bool succeeded = generate_hierarchical_codebook_threaded_internal(group_quant,
+			max_codebook_size, max_parent_codebook_size,
+			group_codebook,
+			group_parent_codebook,
+			threads_to_use, limit_clusterizers, pJob_pool);
+
+		if (!succeeded)
+			return false;
+
+		codebook.resize(0);
+		for (uint32_t cluster_index = 0; cluster_index < group_codebook.size(); cluster_index++)
+		{
+			// Each cluster lists group indices; replace them with the training vector indices held by those groups.
+			codebook.resize(codebook.size() + 1);
+			uint_vec &cluster_out = codebook.back();
+			const uint_vec &cluster_groups = group_codebook[cluster_index];
+
+			for (uint32_t member = 0; member < cluster_groups.size(); member++)
+			{
+				const typename group_hash::const_iterator group_iter = unique_vec_iters[cluster_groups[member]];
+
+				append_vector(cluster_out, group_iter->second.m_indices);
+			}
+		}
+
+		parent_codebook.resize(0);
+		for (uint32_t cluster_index = 0; cluster_index < group_parent_codebook.size(); cluster_index++)
+		{
+			// Same expansion as above, applied to the parent clusters.
+			parent_codebook.resize(parent_codebook.size() + 1);
+			uint_vec &cluster_out = parent_codebook.back();
+			const uint_vec &cluster_groups = group_parent_codebook[cluster_index];
+
+			for (uint32_t member = 0; member < cluster_groups.size(); member++)
+			{
+				const typename group_hash::const_iterator group_iter = unique_vec_iters[cluster_groups[member]];
+
+				append_vector(cluster_out, group_iter->second.m_indices);
+			}
+		}
+
+		return true;
+	}
+
+	// Canonical Huffman coding
+
+	class histogram
+	{
+		basisu::vector<uint32_t> m_hist; // one 32-bit counter per bucket
+
+	public:
+		// Creates a histogram with `size` buckets (see init()).
+		histogram(uint32_t size = 0) { init(size); }
+
+		// Hands the bucket array to clear_vector().
+		void clear() { clear_vector(m_hist); }
+
+		// Empties the array before resizing, so every bucket is a freshly created element rather than a surviving old count.
+		void init(uint32_t size)
+		{
+			m_hist.resize(0);
+			m_hist.resize(size);
+		}
+
+		inline uint32_t size() const { return static_cast<uint32_t>(m_hist.size()); }
+
+		// Direct access to a bucket's counter.
+		inline const uint32_t &operator[] (uint32_t index) const { return m_hist[index]; }
+		inline uint32_t &operator[] (uint32_t index) { return m_hist[index]; }
+
+		// Adds one to bucket `index`.
+		inline void inc(uint32_t index)
+		{
+			m_hist[index] += 1;
+		}
+
+		// Sum of all buckets, accumulated in 64 bits.
+		uint64_t get_total() const
+		{
+			uint64_t sum = 0;
+			for (uint32_t bucket = 0; bucket < m_hist.size(); ++bucket)
+				sum += m_hist[bucket];
+			return sum;
+		}
+
+		// Sum over non-empty buckets of -count * log2(count / total); 0 when the total count is zero.
+		double get_entropy() const
+		{
+			const double total = static_cast<double>(get_total());
+			if (total == 0.0f)
+				return 0.0f;
+
+			const double inv_total = 1.0f / total;
+			const double neg_inv_log2 = -1.0f / log(2.0f);
+
+			double entropy_bits = 0.0f;
+			for (uint32_t bucket = 0; bucket < m_hist.size(); bucket++)
+			{
+				const uint32_t count = m_hist[bucket];
+				if (count)
+					entropy_bits += log(count * inv_total) * neg_inv_log2 * static_cast<double>(count);
+			}
+
+			return entropy_bits;
+		}
+	};
+		
+	struct sym_freq // element type of the arrays taken by canonical_huffman_radix_sort_syms() and canonical_huffman_calculate_minimum_redundancy()
+	{
+		uint32_t m_key; // presumably the symbol's frequency, used as the sort key -- TODO confirm in the .cpp
+		uint16_t m_sym_index; // 16 bits wide, so it can name at most 65536 distinct symbols
+	};
+
+	sym_freq *canonical_huffman_radix_sort_syms(uint32_t num_syms, sym_freq *pSyms0, sym_freq *pSyms1);
+	void canonical_huffman_calculate_minimum_redundancy(sym_freq *A, int num_syms);
+	void canonical_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size);
+	
+	class huffman_encoding_table
+	{
+	public:
+		// Holds one code and one code size per symbol; both tables are filled by the init() overloads.
+		huffman_encoding_table() { }
+
+		// Passes both tables to clear_vector().
+		void clear()
+		{
+			clear_vector(m_code_sizes);
+			clear_vector(m_codes);
+		}
+
+		// Builds the tables from a histogram's bucket counts by forwarding to the uint32_t frequency overload.
+		bool init(const histogram &h, uint32_t max_code_size = cHuffmanMaxSupportedCodeSize) { return init(h.size(), &h[0], max_code_size); }
+
+		bool init(uint32_t num_syms, const uint16_t *pFreq, uint32_t max_code_size);
+		bool init(uint32_t num_syms, const uint32_t *pSym_freq, uint32_t max_code_size);
+
+		inline const uint16_vec &get_codes() const { return m_codes; }
+		inline const uint8_vec &get_code_sizes() const { return m_code_sizes; }
+
+		// One past the highest symbol index whose code size is non-zero, or 0 when no symbol has a code.
+		uint32_t get_total_used_codes() const
+		{
+			for (uint32_t n = static_cast<uint32_t>(m_code_sizes.size()); n > 0; n--)
+				if (m_code_sizes[n - 1])
+					return n;
+
+			return 0;
+		}
+
+	private:
+		uint16_vec m_codes;
+		uint8_vec m_code_sizes;
+	};
+
+	class bitwise_coder // Packs variable-width bit fields, least significant bit first, into a growing byte vector.
+	{
+	public:
+		bitwise_coder() :
+			m_bit_buffer(0),
+			m_bit_buffer_size(0),
+			m_total_bits(0)
+		{
+		}
+
+		inline void clear() // hands m_bytes to clear_vector() and zeroes the pending bits and the bit counter
+		{
+			clear_vector(m_bytes);
+			m_bit_buffer = 0;
+			m_bit_buffer_size = 0;
+			m_total_bits = 0;
+		}
+
+		inline const uint8_vec &get_bytes() const { return m_bytes; } // completed bytes only; pending bits are not included until flush()
+
+		inline uint64_t get_total_bits() const { return m_total_bits; }
+		inline void clear_total_bits() { m_total_bits = 0; } // resets the counter only; already written bytes and pending bits are kept
+
+		inline void init(uint32_t reserve_size = 1024) // reserves reserve_size bytes, then starts from an empty stream
+		{
+			m_bytes.reserve(reserve_size);
+			m_bytes.resize(0);
+
+			m_bit_buffer = 0;
+			m_bit_buffer_size = 0;
+			m_total_bits = 0;
+		}
+
+		inline uint32_t flush() // writes out a pending partial byte; returns 8 if one was written, otherwise 0
+		{
+			if (m_bit_buffer_size)
+			{
+				m_total_bits += 8 - (m_bit_buffer_size & 7); // count the padding bits that complete the byte
+				append_byte(static_cast<uint8_t>(m_bit_buffer));
+
+				m_bit_buffer = 0;
+				m_bit_buffer_size = 0;
+				
+				return 8;
+			}
+
+			return 0;
+		}
+
+		inline uint32_t put_bits(uint32_t bits, uint32_t num_bits) // appends the low num_bits of `bits`; returns num_bits
+		{
+			assert(num_bits <= 32);
+			assert(bits < (1ULL << num_bits)); // the value must fit in num_bits
+
+			if (!num_bits)
+				return 0;
+
+			m_total_bits += num_bits;
+
+			uint64_t v = (static_cast<uint64_t>(bits) << m_bit_buffer_size) | m_bit_buffer; // new bits go above the bits still pending
+			m_bit_buffer_size += num_bits;
+
+			while (m_bit_buffer_size >= 8) // emit every completed byte, lowest byte first
+			{
+				append_byte(static_cast<uint8_t>(v));
+				v >>= 8;
+				m_bit_buffer_size -= 8;
+			}
+
+			m_bit_buffer = static_cast<uint8_t>(v); // fewer than 8 bits remain pending here
+			return num_bits;
+		}
+
+		inline uint32_t put_code(uint32_t sym, const huffman_encoding_table &tab) // writes sym's code from `tab`; returns its size in bits
+		{
+			uint32_t code = tab.get_codes()[sym];
+			uint32_t code_size = tab.get_code_sizes()[sym];
+			assert(code_size >= 1); // a size of 0 means the symbol has no code in this table
+			put_bits(code, code_size);
+			return code_size;
+		}
+
+		inline uint32_t put_truncated_binary(uint32_t v, uint32_t n) // writes v (0 <= v < n) using k or k+1 bits; returns the bits written
+		{
+			assert((n >= 2) && (v < n));
+
+			uint32_t k = floor_log2i(n); // presumably floor(log2(n)) -- helper is defined elsewhere
+			uint32_t u = (1 << (k + 1)) - n; // how many values get the short k-bit form
+
+			if (v < u)
+				return put_bits(v, k);
+			
+			uint32_t x = v + u; // remaining values are offset by u and take k+1 bits
+			assert((x >> 1) >= u);
+
+			put_bits(x >> 1, k); // upper k bits first
+			put_bits(x & 1, 1); // then the lowest bit
+			return k + 1;
+		}
+
+		inline uint32_t put_rice(uint32_t v, uint32_t m) // writes v as a unary quotient plus an m-bit remainder; returns the bits written
+		{
+			assert(m);
+			
+			const uint64_t start_bits = m_total_bits;
+
+			uint32_t q = v >> m, r = v & ((1 << m) - 1); // q = quotient, r = low m bits
+
+			// rice coding sanity check
+			assert(q <= 64);
+			
+			for (; q > 16; q -= 16) // long runs of one bits go out 16 at a time
+				put_bits(0xFFFF, 16);
+
+			put_bits((1 << q) - 1, q); // remaining q one bits
+			put_bits(r << 1, m + 1); // a zero bit ends the unary run, followed by the m remainder bits
+			
+			return (uint32_t)(m_total_bits - start_bits);
+		}
+
+		inline uint32_t put_vlc(uint32_t v, uint32_t chunk_bits) // writes v in chunk_bits-sized pieces, low piece first; returns the bits written
+		{
+			assert(chunk_bits);
+
+			const uint32_t chunk_size = 1 << chunk_bits; // also serves as the "more chunks follow" flag bit
+			const uint32_t chunk_mask = chunk_size - 1;
+					
+			uint32_t total_bits = 0;
+
+			for ( ; ; )
+			{
+				uint32_t next_v = v >> chunk_bits;
+								
+				total_bits += put_bits((v & chunk_mask) | (next_v ? chunk_size : 0), chunk_bits + 1); // payload plus continuation flag
+				if (!next_v)
+					break;
+
+				v = next_v;
+			}
+
+			return total_bits;
+		}
+
+		uint32_t emit_huffman_table(const huffman_encoding_table &tab); // defined out of line
+		
+	private:
+		uint8_vec m_bytes; // completed output bytes
+		uint32_t m_bit_buffer, m_bit_buffer_size; // pending bits (always fewer than 8 between calls) and their count
+		uint64_t m_total_bits; // bits written so far, including padding added by flush()
+
+		void append_byte(uint8_t c) // grows m_bytes by one element and stores c in it
+		{
+			m_bytes.resize(m_bytes.size() + 1);
+			m_bytes.back() = c;
+		}
+
+		static void end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len);
+		static void end_zero_run(uint16_vec &syms, uint32_t &run_size);
+	};
+
+	class huff2D // Groups several small symbols into one combined value and Huffman-codes the combined values.
+	{
+	public:
+		huff2D() : m_bits_per_sym(0), m_total_syms_per_group(0), m_cur_sym_bits(0), m_cur_num_syms(0), m_next_decoder_group_index(0), m_decode_syms_remaining(0) { } // zeroed so flush()/emit() never read indeterminate values before init()
+		huff2D(uint32_t bits_per_sym, uint32_t total_syms_per_group) { init(bits_per_sym, total_syms_per_group); }
+
+		inline const histogram &get_histogram() const { return m_histogram; }
+		inline const huffman_encoding_table &get_encoding_table() const { return m_encoding_table; }
+
+		inline void init(uint32_t bits_per_sym, uint32_t total_syms_per_group) // sets the group layout and sizes the histogram to 2^(bits_per_sym * total_syms_per_group) buckets
+		{
+			assert((bits_per_sym * total_syms_per_group) <= 16 && total_syms_per_group >= 1 && bits_per_sym >= 1);
+						
+			m_bits_per_sym = bits_per_sym;
+			m_total_syms_per_group = total_syms_per_group;
+			m_cur_sym_bits = 0;
+			m_cur_num_syms = 0;
+			m_decode_syms_remaining = 0;
+			m_next_decoder_group_index = 0;
+
+			m_histogram.init(1 << (bits_per_sym * total_syms_per_group));
+		}
+
+		inline void clear() // drops recorded groups and resets the cursors; group layout, histogram and encoding table are left alone
+		{
+			m_group_bits.clear();
+
+			m_cur_sym_bits = 0;
+			m_cur_num_syms = 0;
+			m_decode_syms_remaining = 0;
+			m_next_decoder_group_index = 0;
+		}
+
+		inline void emit(uint32_t sym) // adds sym to the group being built; a full group is recorded immediately
+		{
+			m_cur_sym_bits |= (sym << (m_cur_num_syms * m_bits_per_sym)); // the n-th symbol of a group sits n * m_bits_per_sym bits up
+			m_cur_num_syms++;
+
+			if (m_cur_num_syms == m_total_syms_per_group)
+				flush();
+		}
+
+		inline void flush() // records the (possibly partial) current group and counts it in the histogram; no-op when the group is empty
+		{
+			if (m_cur_num_syms)
+			{
+				m_group_bits.push_back(m_cur_sym_bits);
+				m_histogram.inc(m_cur_sym_bits);
+
+				m_cur_sym_bits = 0;
+				m_cur_num_syms = 0;
+			}
+		}
+
+		inline bool start_encoding(uint32_t code_size_limit = 16) // finishes any partial group, then builds the encoding table from the histogram
+		{
+			flush();
+
+			if (!m_encoding_table.init(m_histogram, code_size_limit))
+				return false;
+
+			m_decode_syms_remaining = 0;
+			m_next_decoder_group_index = 0;
+
+			return true;
+		}
+				
+		inline uint32_t emit_next_sym(bitwise_coder &c) // call once per symbol; writes one code at the start of each group and returns the bits written (0 otherwise)
+		{
+			uint32_t bits = 0;
+
+			if (!m_decode_syms_remaining)
+			{
+				bits = c.put_code(m_group_bits[m_next_decoder_group_index++], m_encoding_table);
+				m_decode_syms_remaining = m_total_syms_per_group;
+			}
+
+			m_decode_syms_remaining--;
+			return bits;
+		}
+
+		inline void emit_flush() // makes the next emit_next_sym() call start a new group
+		{
+			m_decode_syms_remaining = 0;
+		}
+
+	private:
+		uint_vec m_group_bits; // one combined value per recorded group, in emission order
+		huffman_encoding_table m_encoding_table;
+		histogram m_histogram;
+		uint32_t m_bits_per_sym, m_total_syms_per_group, m_cur_sym_bits, m_cur_num_syms, m_next_decoder_group_index, m_decode_syms_remaining;
+	};
+
+	bool huffman_test(int rand_seed);
+
+	// VQ index reordering
+	
+	class palette_index_reorderer
+	{
+	public:
+		palette_index_reorderer() { }
+
+		// Passes every internal table to clear_vector().
+		void clear()
+		{
+			clear_vector(m_hist);
+			clear_vector(m_total_count_to_picked);
+			clear_vector(m_entries_picked);
+			clear_vector(m_entries_to_do);
+			clear_vector(m_remap_table);
+		}
+
+		// Callback giving the distance between entries i and j, expected in [0,1].
+		typedef float(*pEntry_dist_func)(uint32_t i, uint32_t j, void *pCtx);
+
+		void init(uint32_t num_indices, const uint32_t *pIndices, uint32_t num_syms, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
+
+		// The resulting table maps old symbol indices to new ones.
+		inline const uint_vec &get_remap_table() const { return m_remap_table; }
+
+	private:
+		uint_vec m_hist, m_total_count_to_picked, m_entries_picked, m_entries_to_do, m_remap_table;
+
+		// m_hist is indexed as an n x n table; inc_hist only writes cells with row < column, so get_hist orders the pair before reading.
+		inline uint32_t get_hist(int i, int j, int n) const { const int row = (i > j) ? j : i, col = (i > j) ? i : j; return m_hist[row * n + col]; }
+		inline void inc_hist(int i, int j, int n) { if ((i < j) && (i != -1) && (j != -1)) { assert(((uint32_t)i < (uint32_t)n) && ((uint32_t)j < (uint32_t)n)); m_hist[i * n + j]++; } }
+
+		void prepare_hist(uint32_t num_syms, uint32_t num_indices, const uint32_t *pIndices);
+		void find_initial(uint32_t num_syms);
+		void find_next_entry(uint32_t &best_entry, double &best_count, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
+		float pick_side(uint32_t num_syms, uint32_t entry_to_move, pEntry_dist_func pDist_func, void *pCtx, float dist_func_weight);
+	};
+
+	// Simple 32-bit 2D image class
+
+	class image
+	{
+	public:
+		image() : 
+			m_width(0), m_height(0), m_pitch(0)
+		{
+		}
+
+		image(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) : 
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			resize(w, h, p);
+		}
+
+		image(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps) :
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			init(pImage, width, height, comps);
+		}
+
+		image(const image &other) :
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			*this = other;
+		}
+
+		image &swap(image &other)
+		{
+			std::swap(m_width, other.m_width);
+			std::swap(m_height, other.m_height);
+			std::swap(m_pitch, other.m_pitch);
+			m_pixels.swap(other.m_pixels);
+			return *this;
+		}
+
+		image &operator= (const image &rhs)
+		{
+			if (this != &rhs)
+			{
+				m_width = rhs.m_width;
+				m_height = rhs.m_height;
+				m_pitch = rhs.m_pitch;
+				m_pixels = rhs.m_pixels;
+			}
+			return *this;
+		}
+
+		image &clear()
+		{
+			m_width = 0; 
+			m_height = 0;
+			m_pitch = 0;
+			clear_vector(m_pixels);
+			return *this;
+		}
+
+		image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color)
+		{
+			return crop(w, h, p, background);
+		}
+
+		image &set_all(const color_rgba &c)
+		{
+			for (uint32_t i = 0; i < m_pixels.size(); i++)
+				m_pixels[i] = c;
+			return *this;
+		}
+
+		void init(const uint8_t *pImage, uint32_t width, uint32_t height, uint32_t comps)
+		{
+			assert(comps >= 1 && comps <= 4);
+
+			resize(width, height);
+
+			// The source is tightly packed: pixel (x, y) starts at byte (x + y * width) * comps.
+			for (uint32_t y = 0; y < height; y++)
+			{
+				for (uint32_t x = 0; x < width; x++)
+				{
+					const uint8_t *pSrc = &pImage[(x + y * width) * comps];
+					color_rgba &dst = (*this)(x, y);
+
+					switch (comps)
+					{
+					case 1: // single channel replicated to RGB, opaque alpha
+						dst.r = pSrc[0];
+						dst.g = pSrc[0];
+						dst.b = pSrc[0];
+						dst.a = 255;
+						break;
+
+					case 2: // first channel replicated to RGB, second channel is alpha
+						dst.r = pSrc[0];
+						dst.g = pSrc[0];
+						dst.b = pSrc[0];
+						dst.a = pSrc[1];
+						break;
+
+					default: // three or four channels: RGB copied, alpha copied only when present
+						dst.r = pSrc[0];
+						dst.g = pSrc[1];
+						dst.b = pSrc[2];
+						dst.a = (comps == 4) ? pSrc[3] : 255;
+						break;
+					}
+				}
+			}
+		}
+
+		image &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba &c)
+		{
+			for (uint32_t iy = 0; iy < h; iy++)
+				for (uint32_t ix = 0; ix < w; ix++)
+					set_clipped(x + ix, y + iy, c);
+			return *this;
+		}
+
+		image& fill_box_alpha(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const color_rgba& c)
+		{
+			for (uint32_t iy = 0; iy < h; iy++)
+				for (uint32_t ix = 0; ix < w; ix++)
+					set_clipped_alpha(x + ix, y + iy, c);
+			return *this;
+		}
+
+		image &crop_dup_borders(uint32_t w, uint32_t h)
+		{
+			const uint32_t orig_w = m_width, orig_h = m_height;
+
+			crop(w, h);
+
+			if (orig_w && orig_h)
+			{
+				if (m_width > orig_w)
+				{
+					for (uint32_t x = orig_w; x < m_width; x++)
+						for (uint32_t y = 0; y < m_height; y++)
+							set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
+				}
+
+				if (m_height > orig_h)
+				{
+					for (uint32_t y = orig_h; y < m_height; y++)
+						for (uint32_t x = 0; x < m_width; x++)
+							set_clipped(x, y, get_clamped(minimum(x, orig_w - 1U), minimum(y, orig_h - 1U)));
+				}
+			}
+			return *this;
+		}
+
+		image &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba &background = g_black_color)
+		{
+			if (p == UINT32_MAX)
+				p = w;
+
+			if ((w == m_width) && (m_height == h) && (m_pitch == p))
+				return *this;
+
+			if ((!w) || (!h) || (!p))
+			{
+				clear();
+				return *this;
+			}
+
+			color_rgba_vec cur_state;
+			cur_state.swap(m_pixels);
+
+			m_pixels.resize(p * h);
+			
+			for (uint32_t y = 0; y < h; y++)
+			{
+				for (uint32_t x = 0; x < w; x++)
+				{
+					if ((x < m_width) && (y < m_height))
+						m_pixels[x + y * p] = cur_state[x + y * m_pitch];
+					else
+						m_pixels[x + y * p] = background;
+				}
+			}
+
+			m_width = w;
+			m_height = h;
+			m_pitch = p;
+
+			return *this;
+		}
+
+		inline const color_rgba &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
+		inline color_rgba &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
+
+		inline const color_rgba &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
+		inline color_rgba &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
+
+		inline const color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const
+		{
+			x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
+			y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
+			return m_pixels[x + y * m_pitch];
+		}
+
+		inline color_rgba &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)
+		{
+			x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
+			y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
+			return m_pixels[x + y * m_pitch];
+		}
+		
+		inline image &set_clipped(int x, int y, const color_rgba &c) 
+		{
+			if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
+				(*this)(x, y) = c;
+			return *this;
+		}
+
+		inline image& set_clipped_alpha(int x, int y, const color_rgba& c)
+		{
+			if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
+				(*this)(x, y).m_comps[3] = c.m_comps[3];
+			return *this;
+		}
+
+		// Copies the src_w x src_h rectangle of src starting at (src_x, src_y) to (dst_x, dst_y) in this image, clipping against both images. Not fast, but it works.
+		image &blit(const image &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
+		{
+			for (int y = 0; y < src_h; y++)
+			{
+				const int sy = src_y + y;
+				if (sy < 0) // row lies above the source: skip until we are inside
+					continue;
+				else if (sy >= (int)src.get_height()) // past the last source row: nothing further to copy
+					break;
+
+				for (int x = 0; x < src_w; x++)
+				{
+					const int sx = src_x + x;
+					if (sx < 0) // column lies left of the source
+						continue;
+					else if (sx >= (int)src.get_width()) // horizontal clip must use the source width (the height was compared here before)
+						break;
+
+					set_clipped(dst_x + x, dst_y + y, src(sx, sy)); // set_clipped() drops writes that fall outside this image
+				}
+			}
+
+			return *this;
+		}
+
+		const image &extract_block_clamped(color_rgba *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const
+		{
+			for (uint32_t y = 0; y < h; y++)
+				for (uint32_t x = 0; x < w; x++)
+					*pDst++ = get_clamped(src_x + x, src_y + y);
+			return *this;
+		}
+
+		image &set_block_clipped(const color_rgba *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
+		{
+			for (uint32_t y = 0; y < h; y++)
+				for (uint32_t x = 0; x < w; x++)
+					set_clipped(dst_x + x, dst_y + y, *pSrc++);
+			return *this;
+		}
+
+		inline uint32_t get_width() const { return m_width; }
+		inline uint32_t get_height() const { return m_height; }
+		inline uint32_t get_pitch() const { return m_pitch; }
+		inline uint32_t get_total_pixels() const { return m_width * m_height; }
+
+		inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }
+		inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }
+		inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }
+
+		inline const color_rgba_vec &get_pixels() const { return m_pixels; }
+		inline color_rgba_vec &get_pixels() { return m_pixels; }
+
+		inline const color_rgba *get_ptr() const { return &m_pixels[0]; }
+		inline color_rgba *get_ptr() { return &m_pixels[0]; }
+
+		bool has_alpha() const
+		{
+			for (uint32_t y = 0; y < m_height; ++y)
+				for (uint32_t x = 0; x < m_width; ++x)
+					if ((*this)(x, y).a < 255)
+						return true;
+
+			return false;
+		}
+
+		image &set_alpha(uint8_t a)
+		{
+			for (uint32_t y = 0; y < m_height; ++y)
+				for (uint32_t x = 0; x < m_width; ++x)
+					(*this)(x, y).a = a;
+			return *this;
+		}
+
+		image &flip_y()
+		{
+			for (uint32_t y = 0; y < m_height / 2; ++y)
+				for (uint32_t x = 0; x < m_width; ++x)
+					std::swap((*this)(x, y), (*this)(x, m_height - 1 - y));
+			return *this;
+		}
+
+		// TODO: There are many ways to do this, not sure this is the best way.
+		image &renormalize_normal_map()
+		{
+			for (uint32_t y = 0; y < m_height; y++)
+			{
+				for (uint32_t x = 0; x < m_width; x++)
+				{
+					color_rgba &c = (*this)(x, y);
+					if ((c.r == 128) && (c.g == 128) && (c.b == 128))
+						continue;
+
+					vec3F v(c.r, c.g, c.b);
+					v = (v * (2.0f / 255.0f)) - vec3F(1.0f);
+					v.clamp(-1.0f, 1.0f);
+
+					float length = v.length();
+					const float cValidThresh = .077f;
+					if (length < cValidThresh)
+					{
+						c.set(128, 128, 128, c.a);
+					}
+					else if (fabs(length - 1.0f) > cValidThresh)
+					{
+						if (length)
+							v /= length;
+
+						for (uint32_t i = 0; i < 3; i++)
+							c[i] = static_cast<uint8_t>(clamp<float>(floor((v[i] + 1.0f) * 255.0f * .5f + .5f), 0.0f, 255.0f));
+
+						if ((c.g == 128) && (c.r == 128))
+						{
+							if (c.b < 128)
+								c.b = 0;
+							else
+								c.b = 255;
+						}
+					}
+				}
+			}
+			return *this;
+		}
+
+		void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...);
+				
+	private:
+		uint32_t m_width, m_height, m_pitch;  // all in pixels
+		color_rgba_vec m_pixels;
+	};
+
+	// Float images
+
+	typedef basisu::vector<vec4F> vec4F_vec;
+
+	class imagef
+	{
+	public:
+		imagef() : 
+			m_width(0), m_height(0), m_pitch(0)
+		{
+		}
+
+		imagef(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) : 
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			resize(w, h, p);
+		}
+
+		imagef(const imagef &other) :
+			m_width(0), m_height(0), m_pitch(0)
+		{
+			*this = other;
+		}
+
+		imagef &swap(imagef &other)
+		{
+			std::swap(m_width, other.m_width);
+			std::swap(m_height, other.m_height);
+			std::swap(m_pitch, other.m_pitch);
+			m_pixels.swap(other.m_pixels);
+			return *this;
+		}
+
+		imagef &operator= (const imagef &rhs)
+		{
+			if (this != &rhs)
+			{
+				m_width = rhs.m_width;
+				m_height = rhs.m_height;
+				m_pitch = rhs.m_pitch;
+				m_pixels = rhs.m_pixels;
+			}
+			return *this;
+		}
+
+		imagef &clear()
+		{
+			m_width = 0; 
+			m_height = 0;
+			m_pitch = 0;
+			clear_vector(m_pixels);
+			return *this;
+		}
+
+		imagef &set(const image &src, const vec4F &scale = vec4F(1), const vec4F &bias = vec4F(0))
+		{
+			const uint32_t width = src.get_width();
+			const uint32_t height = src.get_height();
+
+			resize(width, height);
+
+			for (int y = 0; y < (int)height; y++)
+			{
+				for (uint32_t x = 0; x < width; x++)
+				{
+					const color_rgba &src_pixel = src(x, y);
+					(*this)(x, y).set((float)src_pixel.r * scale[0] + bias[0], (float)src_pixel.g * scale[1] + bias[1], (float)src_pixel.b * scale[2] + bias[2], (float)src_pixel.a * scale[3] + bias[3]);
+				}
+			}
+
+			return *this;
+		}
+
+		imagef &resize(const imagef &other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))
+		{
+			return resize(other.get_width(), other.get_height(), p, background);
+		}
+
+		imagef &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1))
+		{
+			return crop(w, h, p, background);
+		}
+
+		imagef &set_all(const vec4F &c)
+		{
+			for (uint32_t i = 0; i < m_pixels.size(); i++)
+				m_pixels[i] = c;
+			return *this;
+		}
+
+		imagef &fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const vec4F &c)
+		{
+			for (uint32_t iy = 0; iy < h; iy++)
+				for (uint32_t ix = 0; ix < w; ix++)
+					set_clipped(x + ix, y + iy, c);
+			return *this;
+		}
+				
+		imagef &crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F &background = vec4F(0,0,0,1))
+		{
+			if (p == UINT32_MAX)
+				p = w;
+
+			if ((w == m_width) && (m_height == h) && (m_pitch == p))
+				return *this;
+
+			if ((!w) || (!h) || (!p))
+			{
+				clear();
+				return *this;
+			}
+
+			vec4F_vec cur_state;
+			cur_state.swap(m_pixels);
+
+			m_pixels.resize(p * h);
+			
+			for (uint32_t y = 0; y < h; y++)
+			{
+				for (uint32_t x = 0; x < w; x++)
+				{
+					if ((x < m_width) && (y < m_height))
+						m_pixels[x + y * p] = cur_state[x + y * m_pitch];
+					else
+						m_pixels[x + y * p] = background;
+				}
+			}
+
+			m_width = w;
+			m_height = h;
+			m_pitch = p;
+
+			return *this;
+		}
+
+		inline const vec4F &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
+		inline vec4F &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_pixels[x + y * m_pitch]; }
+
+		inline const vec4F &get_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
+		inline vec4F &get_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); }
+
+		inline const vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const
+		{
+			x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
+			y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
+			return m_pixels[x + y * m_pitch];
+		}
+
+		inline vec4F &get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v)
+		{
+			x = wrap_u ? posmod(x, m_width) : clamp<int>(x, 0, m_width - 1);
+			y = wrap_v ? posmod(y, m_height) : clamp<int>(y, 0, m_height - 1);
+			return m_pixels[x + y * m_pitch];
+		}
+		
+		inline imagef &set_clipped(int x, int y, const vec4F &c) 
+		{
+			if ((static_cast<uint32_t>(x) < m_width) && (static_cast<uint32_t>(y) < m_height))
+				(*this)(x, y) = c;
+			return *this;
+		}
+
+		// Copies the src_w x src_h rectangle of src starting at (src_x, src_y) to (dst_x, dst_y) in this image, clipping against both images. Not fast, but it works.
+		imagef &blit(const imagef &src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y)
+		{
+			for (int y = 0; y < src_h; y++)
+			{
+				const int sy = src_y + y;
+				if (sy < 0) // row lies above the source: skip until we are inside
+					continue;
+				else if (sy >= (int)src.get_height()) // past the last source row: nothing further to copy
+					break;
+
+				for (int x = 0; x < src_w; x++)
+				{
+					const int sx = src_x + x;
+					if (sx < 0) // column lies left of the source
+						continue;
+					else if (sx >= (int)src.get_width()) // horizontal clip must use the source width (the height was compared here before)
+						break;
+
+					set_clipped(dst_x + x, dst_y + y, src(sx, sy)); // set_clipped() drops writes that fall outside this image
+				}
+			}
+
+			return *this;
+		}
+
+		const imagef &extract_block_clamped(vec4F *pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const
+		{
+			for (uint32_t y = 0; y < h; y++)
+				for (uint32_t x = 0; x < w; x++)
+					*pDst++ = get_clamped(src_x + x, src_y + y);
+			return *this;
+		}
+
+		imagef &set_block_clipped(const vec4F *pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h)
+		{
+			for (uint32_t y = 0; y < h; y++)
+				for (uint32_t x = 0; x < w; x++)
+					set_clipped(dst_x + x, dst_y + y, *pSrc++);
+			return *this;
+		}
+
+		inline uint32_t get_width() const { return m_width; }
+		inline uint32_t get_height() const { return m_height; }
+		inline uint32_t get_pitch() const { return m_pitch; }
+		inline uint32_t get_total_pixels() const { return m_width * m_height; }
+
+		inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; }
+		inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; }
+		inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); }
+
+		inline const vec4F_vec &get_pixels() const { return m_pixels; }
+		inline vec4F_vec &get_pixels() { return m_pixels; }
+
+		inline const vec4F *get_ptr() const { return &m_pixels[0]; }
+		inline vec4F *get_ptr() { return &m_pixels[0]; }
+						
+	private:
+		uint32_t m_width, m_height, m_pitch;  // all in pixels
+		vec4F_vec m_pixels;
+	};
+
+	// Image metrics
+		
+	class image_metrics
+	{
+	public:
+		// TODO: Add ssim (within this class body m_ssim is only zeroed by clear(); print() does not report it)
+		float m_max, m_mean, m_mean_squared, m_rms, m_psnr, m_ssim;
+
+		image_metrics() { clear(); }
+
+		// Zeroes every metric.
+		void clear()
+		{
+			m_max = m_mean = m_mean_squared = 0;
+			m_rms = m_psnr = m_ssim = 0;
+		}
+
+		// Prints max, mean, RMS and PSNR on one line to stdout; a null prefix is printed as an empty string.
+		void print(const char *pPrefix = nullptr)
+		{
+			const char *pLabel = pPrefix ? pPrefix : "";
+			printf("%sMax: %3.0f Mean: %3.3f RMS: %3.3f PSNR: %2.3f dB\n", pLabel, m_max, m_mean, m_rms, m_psnr);
+		}
+
+		// Defined out of line.
+		void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false);
+	};
+
+	// Image saving/loading/resampling
+	
+	bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr);
+	bool load_png(const char* pFilename, image& img);
+	inline bool load_png(const std::string &filename, image &img) { return load_png(filename.c_str(), img); }
+
+	bool load_bmp(const char* pFilename, image& img);
+	inline bool load_bmp(const std::string &filename, image &img) { return load_bmp(filename.c_str(), img); }
+		
+	bool load_tga(const char* pFilename, image& img);
+	inline bool load_tga(const std::string &filename, image &img) { return load_tga(filename.c_str(), img); }
+
+	bool load_jpg(const char *pFilename, image& img);
+	inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); }
+	
+	// Currently loads .BMP, .PNG, or .TGA.
+	bool load_image(const char* pFilename, image& img);
+	inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); }
+
+	uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans);
+	uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans);
+		
+	enum
+	{
+		cImageSaveGrayscale = 1,
+		cImageSaveIgnoreAlpha = 2
+	};
+
+	bool save_png(const char* pFilename, const image& img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0);
+	inline bool save_png(const std::string &filename, const image &img, uint32_t image_save_flags = 0, uint32_t grayscale_comp = 0) { return save_png(filename.c_str(), img, image_save_flags, grayscale_comp); }
+	
+	bool read_file_to_vec(const char* pFilename, uint8_vec& data);
+	
+	bool write_data_to_file(const char* pFilename, const void* pData, size_t len);
+	
+	inline bool write_vec_to_file(const char* pFilename, const uint8_vec& v) {	return v.size() ? write_data_to_file(pFilename, &v[0], v.size()) : write_data_to_file(pFilename, "", 0); }
+
+	float linear_to_srgb(float l);
+	float srgb_to_linear(float s);
+
+	bool image_resample(const image &src, image &dst, bool srgb = false,
+		const char *pFilter = "lanczos4", float filter_scale = 1.0f, 
+		bool wrapping = false,
+		uint32_t first_comp = 0, uint32_t num_comps = 4);
+
+	// Timing
+			
+	typedef uint64_t timer_ticks;
+
+	class interval_timer // Start/stop timer; all non-inline members are defined out of line.
+	{
+	public:
+		interval_timer();
+
+		void start();
+		void stop();
+
+		double get_elapsed_secs() const;
+		inline double get_elapsed_ms() const { return 1000.0f* get_elapsed_secs(); } // get_elapsed_secs() scaled to milliseconds
+		
+		static void init();
+		static inline timer_ticks get_ticks_per_sec() { return g_freq; } // NOTE(review): g_freq is presumably set by init() -- confirm in the .cpp
+		static timer_ticks get_ticks();
+		static double ticks_to_secs(timer_ticks ticks);
+		static inline double ticks_to_ms(timer_ticks ticks) {	return ticks_to_secs(ticks) * 1000.0f; } // ticks_to_secs() scaled to milliseconds
+
+	private:
+		static timer_ticks g_init_ticks, g_freq; // shared by all timers
+		static double g_timer_freq;
+
+		timer_ticks m_start_time, m_stop_time;
+
+		bool m_started, m_stopped;
+	};
+
+	// 2D array
+
+	template<typename T>
+	class vector2D
+	{
+		typedef basisu::vector<T> TVec;
+
+		uint32_t m_width, m_height;
+		TVec m_values;
+
+	public:
+		vector2D() :
+			m_width(0),
+			m_height(0)
+		{
+		}
+
+		vector2D(uint32_t w, uint32_t h) :
+			m_width(0),
+			m_height(0)
+		{
+			resize(w, h);
+		}
+
+		vector2D(const vector2D &other)
+		{
+			*this = other;
+		}
+
+		vector2D &operator= (const vector2D &other)
+		{
+			if (this != &other)
+			{
+				m_width = other.m_width;
+				m_height = other.m_height;
+				m_values = other.m_values;
+			}
+			return *this;
+		}
+
+		inline bool operator== (const vector2D &rhs) const
+		{
+			return (m_width == rhs.m_width) && (m_height == rhs.m_height) && (m_values == rhs.m_values);
+		}
+
+		inline uint32_t size_in_bytes() const { return (uint32_t)m_values.size() * sizeof(m_values[0]); }
+
+		inline const T &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; }
+		inline T &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; }
+
+		inline const T &operator[] (uint32_t i) const { return m_values[i]; }
+		inline T &operator[] (uint32_t i) { return m_values[i]; }
+				
+		inline const T &at_clamped(int x, int y) const { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } // clamps to the last valid column/row (the old upper bounds were one past the end); the array must be non-empty
+		inline T &at_clamped(int x, int y) { return (*this)(clamp<int>(x, 0, m_width - 1), clamp<int>(y, 0, m_height - 1)); } // mutable overload with the same clamping
+
+		void clear()
+		{
+			m_width = 0;
+			m_height = 0;
+			m_values.clear();
+		}
+
+		void set_all(const T&val)
+		{
+			vector_set_all(m_values, val);
+		}
+
+		inline const T* get_ptr() const { return &m_values[0]; }
+		inline T* get_ptr() { return &m_values[0]; }
+
+		vector2D &resize(uint32_t new_width, uint32_t new_height)
+		{
+			// Nothing to do when the dimensions already match.
+			if ((new_width == m_width) && (new_height == m_height))
+				return *this;
+
+			// After the swap, m_values is the freshly built new_width x new_height array and prev_values holds the old contents.
+			TVec prev_values(new_width * new_height);
+			prev_values.swap(m_values);
+
+			const uint32_t copy_w = minimum(m_width, new_width);
+			const uint32_t copy_h = minimum(m_height, new_height);
+
+			for (uint32_t row = 0; row < copy_h; row++)
+			{
+				for (uint32_t col = 0; col < copy_w; col++)
+					m_values[col + row * new_width] = prev_values[col + row * m_width]; // keep the region common to the old and new sizes
+			}
+
+			m_width = new_width;
+			m_height = new_height;
+			return *this;
+		}
+	};
+
+	inline FILE *fopen_safe(const char *pFilename, const char *pMode)
+	{
+#ifndef _WIN32
+		return fopen(pFilename, pMode); // plain C runtime open everywhere except Windows
+#else
+		FILE *pHandle = nullptr; // returned unchanged if fopen_s() does not store a stream
+		fopen_s(&pHandle, pFilename, pMode);
+		return pHandle;
+#endif
+	}
+
+	void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed = 1);
+		
+} // namespace basisu
+
+
diff --git a/thirdparty/basis_universal/encoder/basisu_etc.cpp b/thirdparty/basis_universal/encoder/basisu_etc.cpp
new file mode 100644
index 0000000000..232e8965b0
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_etc.cpp
@@ -0,0 +1,1593 @@
+// basis_etc.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_etc.h"
+
+#if BASISU_SUPPORT_SSE
+#define CPPSPMD_NAME(a) a##_sse41
+#include "basisu_kernels_declares.h"
+#endif
+
+#define BASISU_DEBUG_ETC_ENCODER 0
+#define BASISU_DEBUG_ETC_ENCODER_DEEPER 0
+
+namespace basisu
+{
+	const int8_t g_etc2_eac_tables[16][8] = // 16 modifier tables, 8 signed entries each
+	{
+		{ -3, -6, -9, -15, 2, 5, 8, 14 }, { -3, -7, -10, -13, 2, 6, 9, 12 }, { -2, -5, -8, -13, 1, 4, 7, 12 }, { -2, -4, -6, -13, 1, 3, 5, 12 },
+		{ -3, -6, -8, -12, 2, 5, 7, 11 }, { -3, -7, -9, -11, 2, 6, 8, 10 }, { -4, -7, -8, -11, 3, 6, 7, 10 }, { -3, -5, -8, -11, 2, 4, 7, 10 },
+		{ -2, -6, -8, -10, 1, 5, 7, 9 }, { -2, -5, -8, -10, 1, 4, 7, 9 }, { -2, -4, -8, -10, 1, 3, 7, 9 }, { -2, -5, -7, -10, 1, 4, 6, 9 },
+		{ -3, -4, -7, -10, 2, 3, 6, 9 }, { -1, -2, -3, -10, 0, 1, 2, 9 }, { -4, -6, -8, -9, 3, 5, 7, 8 }, { -3, -5, -7, -9, 2, 4, 6, 8 }
+	};
+
+	const int8_t g_etc2_eac_tables8[16][8] = // same layout as g_etc2_eac_tables; every entry is the matching entry of that table multiplied by 8
+	{
+		{ -24, -48, -72, -120, 16, 40, 64, 112 }, { -24,-56,-80,-104,16,48,72,96 }, { -16,-40,-64,-104,8,32,56,96 }, { -16,-32,-48,-104,8,24,40,96 },
+		{ -24,-48,-64,-96,16,40,56,88 }, { -24,-56,-72,-88,16,48,64,80 }, { -32,-56,-64,-88,24,48,56,80 }, { -24,-40,-64,-88,16,32,56,80 },
+		{ -16,-48,-64,-80,8,40,56,72 }, { -16,-40,-64,-80,8,32,56,72 }, { -16,-32,-64,-80,8,24,56,72 }, { -16,-40,-56,-80,8,32,48,72 },
+		{ -24,-32,-56,-80,16,24,48,72 }, { -8,-16,-24,-80,0,8,16,72 }, { -32,-48,-64,-72,24,40,56,64 },	{ -24,-40,-56,-72,16,32,48,64 }
+	};
+		
+	// Given an ETC1 diff/inten_table/selector, and an 8-bit desired color, this table encodes the best packed_color in the low byte, and the abs error in the high byte.
+	static uint16_t g_etc1_inverse_lookup[2 * 8 * 4][256];      // [ diff/inten_table/selector][desired_color ]
+
+	// g_color8_to_etc_block_config[color][table_index] = Supplies for each 8-bit color value a list of packed ETC1 diff/intensity table/selectors/packed_colors that map to that color.
+	// To pack: diff | (inten << 1) | (selector << 4) | (packed_c << 8)
+	static const uint16_t g_etc1_color8_to_etc_block_config_0_255[2][33] =
+	{
+		{ 0x0000,  0x0010,  0x0002,  0x0012,  0x0004,  0x0014,  0x0006,  0x0016,  0x0008,  0x0018,  0x000A,  0x001A,  0x000C,  0x001C,  0x000E,  0x001E,		  0x0001,  0x0011,  0x0003,  0x0013,  0x0005,  0x0015,  0x0007,  0x0017,  0x0009,  0x0019,  0x000B,  0x001B,  0x000D,  0x001D,  0x000F,  0x001F, 0xFFFF },
+		{ 0x0F20,  0x0F30,  0x0E32,  0x0F22,  0x0E34,  0x0F24,  0x0D36,  0x0F26,  0x0C38,  0x0E28,  0x0B3A,  0x0E2A,  0x093C,  0x0E2C,  0x053E,  0x0D2E,		  0x1E31,  0x1F21,  0x1D33,  0x1F23,  0x1C35,  0x1E25,  0x1A37,  0x1E27,  0x1839,  0x1D29,  0x163B,  0x1C2B,  0x133D,  0x1B2D,  0x093F,  0x1A2F, 0xFFFF },
+	};
+
+	// Rows are 0xFFFF-terminated lists for color values 1..254 (row index = color - 1). Really only needs [254][11].
+	static const uint16_t g_etc1_color8_to_etc_block_config_1_to_254[254][12] =
+	{
+		{ 0x021C, 0x0D0D, 0xFFFF }, { 0x0020, 0x0021, 0x0A0B, 0x061F, 0xFFFF }, { 0x0113, 0x0217, 0xFFFF }, { 0x0116, 0x031E,		0x0B0E, 0x0405, 0xFFFF }, { 0x0022, 0x0204, 0x050A, 0x0023, 0xFFFF }, { 0x0111, 0x0319, 0x0809, 0x170F, 0xFFFF }, {
+		0x0303, 0x0215, 0x0607, 0xFFFF }, { 0x0030, 0x0114, 0x0408, 0x0031, 0x0201, 0x051D, 0xFFFF }, { 0x0100, 0x0024, 0x0306,		0x0025, 0x041B, 0x0E0D, 0xFFFF }, { 0x021A, 0x0121, 0x0B0B, 0x071F, 0xFFFF }, { 0x0213, 0x0317, 0xFFFF }, { 0x0112,
+		0x0505, 0xFFFF }, { 0x0026, 0x070C, 0x0123, 0x0027, 0xFFFF }, { 0x0211, 0x0909, 0xFFFF }, { 0x0110, 0x0315, 0x0707,		0x0419, 0x180F, 0xFFFF }, { 0x0218, 0x0131, 0x0301, 0x0403, 0x061D, 0xFFFF }, { 0x0032, 0x0202, 0x0033, 0x0125, 0x051B,
+		0x0F0D, 0xFFFF }, { 0x0028, 0x031C, 0x0221, 0x0029, 0xFFFF }, { 0x0120, 0x0313, 0x0C0B, 0x081F, 0xFFFF }, { 0x0605,		0x0417, 0xFFFF }, { 0x0216, 0x041E, 0x0C0E, 0x0223, 0x0127, 0xFFFF }, { 0x0122, 0x0304, 0x060A, 0x0311, 0x0A09, 0xFFFF
+		}, { 0x0519, 0x190F, 0xFFFF }, { 0x002A, 0x0231, 0x0503, 0x0415, 0x0807, 0x002B, 0x071D, 0xFFFF }, { 0x0130, 0x0214,		0x0508, 0x0401, 0x0133, 0x0225, 0x061B, 0xFFFF }, { 0x0200, 0x0124, 0x0406, 0x0321, 0x0129, 0x100D, 0xFFFF }, { 0x031A,
+		0x0D0B, 0x091F, 0xFFFF }, { 0x0413, 0x0705, 0x0517, 0xFFFF }, { 0x0212, 0x0034, 0x0323, 0x0035, 0x0227, 0xFFFF }, {		0x0126, 0x080C, 0x0B09, 0xFFFF }, { 0x0411, 0x0619, 0x1A0F, 0xFFFF }, { 0x0210, 0x0331, 0x0603, 0x0515, 0x0907, 0x012B,
+		0xFFFF }, { 0x0318, 0x002C, 0x0501, 0x0233, 0x0325, 0x071B, 0x002D, 0x081D, 0xFFFF }, { 0x0132, 0x0302, 0x0229, 0x110D,		0xFFFF }, { 0x0128, 0x041C, 0x0421, 0x0E0B, 0x0A1F, 0xFFFF }, { 0x0220, 0x0513, 0x0617, 0xFFFF }, { 0x0135, 0x0805,
+		0x0327, 0xFFFF }, { 0x0316, 0x051E, 0x0D0E, 0x0423, 0xFFFF }, { 0x0222, 0x0404, 0x070A, 0x0511, 0x0719, 0x0C09, 0x1B0F,		0xFFFF }, { 0x0703, 0x0615, 0x0A07, 0x022B, 0xFFFF }, { 0x012A, 0x0431, 0x0601, 0x0333, 0x012D, 0x091D, 0xFFFF }, {
+		0x0230, 0x0314, 0x0036, 0x0608, 0x0425, 0x0037, 0x0329, 0x081B, 0x120D, 0xFFFF }, { 0x0300, 0x0224, 0x0506, 0x0521,		0x0F0B, 0x0B1F, 0xFFFF }, { 0x041A, 0x0613, 0x0717, 0xFFFF }, { 0x0235, 0x0905, 0xFFFF }, { 0x0312, 0x0134, 0x0523,
+		0x0427, 0xFFFF }, { 0x0226, 0x090C, 0x002E, 0x0611, 0x0D09, 0x002F, 0xFFFF }, { 0x0715, 0x0B07, 0x0819, 0x032B, 0x1C0F,		0xFFFF }, { 0x0310, 0x0531, 0x0701, 0x0803, 0x022D, 0x0A1D, 0xFFFF }, { 0x0418, 0x012C, 0x0433, 0x0525, 0x0137, 0x091B,
+		0x130D, 0xFFFF }, { 0x0232, 0x0402, 0x0621, 0x0429, 0xFFFF }, { 0x0228, 0x051C, 0x0713, 0x100B, 0x0C1F, 0xFFFF }, {		0x0320, 0x0335, 0x0A05, 0x0817, 0xFFFF }, { 0x0623, 0x0527, 0xFFFF }, { 0x0416, 0x061E, 0x0E0E, 0x0711, 0x0E09, 0x012F,
+		0xFFFF }, { 0x0322, 0x0504, 0x080A, 0x0919, 0x1D0F, 0xFFFF }, { 0x0631, 0x0903, 0x0815, 0x0C07, 0x042B, 0x032D, 0x0B1D,		0xFFFF }, { 0x022A, 0x0801, 0x0533, 0x0625, 0x0237, 0x0A1B, 0xFFFF }, { 0x0330, 0x0414, 0x0136, 0x0708, 0x0721, 0x0529,
+		0x140D, 0xFFFF }, { 0x0400, 0x0324, 0x0606, 0x0038, 0x0039, 0x110B, 0x0D1F, 0xFFFF }, { 0x051A, 0x0813, 0x0B05, 0x0917,		0xFFFF }, { 0x0723, 0x0435, 0x0627, 0xFFFF }, { 0x0412, 0x0234, 0x0F09, 0x022F, 0xFFFF }, { 0x0326, 0x0A0C, 0x012E,
+		0x0811, 0x0A19, 0x1E0F, 0xFFFF }, { 0x0731, 0x0A03, 0x0915, 0x0D07, 0x052B, 0xFFFF }, { 0x0410, 0x0901, 0x0633, 0x0725,		0x0337, 0x0B1B, 0x042D, 0x0C1D, 0xFFFF }, { 0x0518, 0x022C, 0x0629, 0x150D, 0xFFFF }, { 0x0332, 0x0502, 0x0821, 0x0139,
+		0x120B, 0x0E1F, 0xFFFF }, { 0x0328, 0x061C, 0x0913, 0x0A17, 0xFFFF }, { 0x0420, 0x0535, 0x0C05, 0x0727, 0xFFFF }, {		0x0823, 0x032F, 0xFFFF }, { 0x0516, 0x071E, 0x0F0E, 0x0911, 0x0B19, 0x1009, 0x1F0F, 0xFFFF }, { 0x0422, 0x0604, 0x090A,
+		0x0B03, 0x0A15, 0x0E07, 0x062B, 0xFFFF }, { 0x0831, 0x0A01, 0x0733, 0x052D, 0x0D1D, 0xFFFF }, { 0x032A, 0x0825, 0x0437,		0x0729, 0x0C1B, 0x160D, 0xFFFF }, { 0x0430, 0x0514, 0x0236, 0x0808, 0x0921, 0x0239, 0x130B, 0x0F1F, 0xFFFF }, { 0x0500,
+		0x0424, 0x0706, 0x0138, 0x0A13, 0x0B17, 0xFFFF }, { 0x061A, 0x0635, 0x0D05, 0xFFFF }, { 0x0923, 0x0827, 0xFFFF }, {		0x0512, 0x0334, 0x003A, 0x0A11, 0x1109, 0x003B, 0x042F, 0xFFFF }, { 0x0426, 0x0B0C, 0x022E, 0x0B15, 0x0F07, 0x0C19,
+		0x072B, 0xFFFF }, { 0x0931, 0x0B01, 0x0C03, 0x062D, 0x0E1D, 0xFFFF }, { 0x0510, 0x0833, 0x0925, 0x0537, 0x0D1B, 0x170D,		0xFFFF }, { 0x0618, 0x032C, 0x0A21, 0x0339, 0x0829, 0xFFFF }, { 0x0432, 0x0602, 0x0B13, 0x140B, 0x101F, 0xFFFF }, {
+		0x0428, 0x071C, 0x0735, 0x0E05, 0x0C17, 0xFFFF }, { 0x0520, 0x0A23, 0x0927, 0xFFFF }, { 0x0B11, 0x1209, 0x013B, 0x052F,		0xFFFF }, { 0x0616, 0x081E, 0x0D19, 0xFFFF }, { 0x0522, 0x0704, 0x0A0A, 0x0A31, 0x0D03, 0x0C15, 0x1007, 0x082B, 0x072D,
+		0x0F1D, 0xFFFF }, { 0x0C01, 0x0933, 0x0A25, 0x0637, 0x0E1B, 0xFFFF }, { 0x042A, 0x0B21, 0x0929, 0x180D, 0xFFFF }, {		0x0530, 0x0614, 0x0336, 0x0908, 0x0439, 0x150B, 0x111F, 0xFFFF }, { 0x0600, 0x0524, 0x0806, 0x0238, 0x0C13, 0x0F05,
+		0x0D17, 0xFFFF }, { 0x071A, 0x0B23, 0x0835, 0x0A27, 0xFFFF }, { 0x1309, 0x023B, 0x062F, 0xFFFF }, { 0x0612, 0x0434,		0x013A, 0x0C11, 0x0E19, 0xFFFF }, { 0x0526, 0x0C0C, 0x032E, 0x0B31, 0x0E03, 0x0D15, 0x1107, 0x092B, 0xFFFF }, { 0x0D01,
+		0x0A33, 0x0B25, 0x0737, 0x0F1B, 0x082D, 0x101D, 0xFFFF }, { 0x0610, 0x0A29, 0x190D, 0xFFFF }, { 0x0718, 0x042C, 0x0C21,		0x0539, 0x160B, 0x121F, 0xFFFF }, { 0x0532, 0x0702, 0x0D13, 0x0E17, 0xFFFF }, { 0x0528, 0x081C, 0x0935, 0x1005, 0x0B27,
+		0xFFFF }, { 0x0620, 0x0C23, 0x033B, 0x072F, 0xFFFF }, { 0x0D11, 0x0F19, 0x1409, 0xFFFF }, { 0x0716, 0x003C, 0x091E,		0x0F03, 0x0E15, 0x1207, 0x0A2B, 0x003D, 0xFFFF }, { 0x0622, 0x0804, 0x0B0A, 0x0C31, 0x0E01, 0x0B33, 0x092D, 0x111D,
+		0xFFFF }, { 0x0C25, 0x0837, 0x0B29, 0x101B, 0x1A0D, 0xFFFF }, { 0x052A, 0x0D21, 0x0639, 0x170B, 0x131F, 0xFFFF }, {		0x0630, 0x0714, 0x0436, 0x0A08, 0x0E13, 0x0F17, 0xFFFF }, { 0x0700, 0x0624, 0x0906, 0x0338, 0x0A35, 0x1105, 0xFFFF }, {
+		0x081A, 0x0D23, 0x0C27, 0xFFFF }, { 0x0E11, 0x1509, 0x043B, 0x082F, 0xFFFF }, { 0x0712, 0x0534, 0x023A, 0x0F15, 0x1307,		0x1019, 0x0B2B, 0x013D, 0xFFFF }, { 0x0626, 0x0D0C, 0x042E, 0x0D31, 0x0F01, 0x1003, 0x0A2D, 0x121D, 0xFFFF }, { 0x0C33,
+		0x0D25, 0x0937, 0x111B, 0x1B0D, 0xFFFF }, { 0x0710, 0x0E21, 0x0739, 0x0C29, 0xFFFF }, { 0x0818, 0x052C, 0x0F13, 0x180B,		0x141F, 0xFFFF }, { 0x0632, 0x0802, 0x0B35, 0x1205, 0x1017, 0xFFFF }, { 0x0628, 0x091C, 0x0E23, 0x0D27, 0xFFFF }, {
+		0x0720, 0x0F11, 0x1609, 0x053B, 0x092F, 0xFFFF }, { 0x1119, 0x023D, 0xFFFF }, { 0x0816, 0x013C, 0x0A1E, 0x0E31, 0x1103,		0x1015, 0x1407, 0x0C2B, 0x0B2D, 0x131D, 0xFFFF }, { 0x0722, 0x0904, 0x0C0A, 0x1001, 0x0D33, 0x0E25, 0x0A37, 0x121B,
+		0xFFFF }, { 0x0F21, 0x0D29, 0x1C0D, 0xFFFF }, { 0x062A, 0x0839, 0x190B, 0x151F, 0xFFFF }, { 0x0730, 0x0814, 0x0536,		0x0B08, 0x1013, 0x1305, 0x1117, 0xFFFF }, { 0x0800, 0x0724, 0x0A06, 0x0438, 0x0F23, 0x0C35, 0x0E27, 0xFFFF }, { 0x091A,
+		0x1709, 0x063B, 0x0A2F, 0xFFFF }, { 0x1011, 0x1219, 0x033D, 0xFFFF }, { 0x0812, 0x0634, 0x033A, 0x0F31, 0x1203, 0x1115,		0x1507, 0x0D2B, 0xFFFF }, { 0x0726, 0x0E0C, 0x052E, 0x1101, 0x0E33, 0x0F25, 0x0B37, 0x131B, 0x0C2D, 0x141D, 0xFFFF }, {
+		0x0E29, 0x1D0D, 0xFFFF }, { 0x0810, 0x1021, 0x0939, 0x1A0B, 0x161F, 0xFFFF }, { 0x0918, 0x062C, 0x1113, 0x1217, 0xFFFF		}, { 0x0732, 0x0902, 0x0D35, 0x1405, 0x0F27, 0xFFFF }, { 0x0728, 0x0A1C, 0x1023, 0x073B, 0x0B2F, 0xFFFF }, { 0x0820,
+		0x1111, 0x1319, 0x1809, 0xFFFF }, { 0x1303, 0x1215, 0x1607, 0x0E2B, 0x043D, 0xFFFF }, { 0x0916, 0x023C, 0x0B1E, 0x1031,		0x1201, 0x0F33, 0x0D2D, 0x151D, 0xFFFF }, { 0x0822, 0x0A04, 0x0D0A, 0x1025, 0x0C37, 0x0F29, 0x141B, 0x1E0D, 0xFFFF }, {
+		0x1121, 0x0A39, 0x1B0B, 0x171F, 0xFFFF }, { 0x072A, 0x1213, 0x1317, 0xFFFF }, { 0x0830, 0x0914, 0x0636, 0x0C08, 0x0E35,		0x1505, 0xFFFF }, { 0x0900, 0x0824, 0x0B06, 0x0538, 0x1123, 0x1027, 0xFFFF }, { 0x0A1A, 0x1211, 0x1909, 0x083B, 0x0C2F,
+		0xFFFF }, { 0x1315, 0x1707, 0x1419, 0x0F2B, 0x053D, 0xFFFF }, { 0x0912, 0x0734, 0x043A, 0x1131, 0x1301, 0x1403, 0x0E2D,		0x161D, 0xFFFF }, { 0x0826, 0x0F0C, 0x062E, 0x1033, 0x1125, 0x0D37, 0x151B, 0x1F0D, 0xFFFF }, { 0x1221, 0x0B39, 0x1029,
+		0xFFFF }, { 0x0910, 0x1313, 0x1C0B, 0x181F, 0xFFFF }, { 0x0A18, 0x072C, 0x0F35, 0x1605, 0x1417, 0xFFFF }, { 0x0832,		0x0A02, 0x1223, 0x1127, 0xFFFF }, { 0x0828, 0x0B1C, 0x1311, 0x1A09, 0x093B, 0x0D2F, 0xFFFF }, { 0x0920, 0x1519, 0x063D,
+		0xFFFF }, { 0x1231, 0x1503, 0x1415, 0x1807, 0x102B, 0x0F2D, 0x171D, 0xFFFF }, { 0x0A16, 0x033C, 0x0C1E, 0x1401, 0x1133,		0x1225, 0x0E37, 0x161B, 0xFFFF }, { 0x0922, 0x0B04, 0x0E0A, 0x1321, 0x1129, 0xFFFF }, { 0x0C39, 0x1D0B, 0x191F, 0xFFFF
+		}, { 0x082A, 0x1413, 0x1705, 0x1517, 0xFFFF }, { 0x0930, 0x0A14, 0x0736, 0x0D08, 0x1323, 0x1035, 0x1227, 0xFFFF }, {		0x0A00, 0x0924, 0x0C06, 0x0638, 0x1B09, 0x0A3B, 0x0E2F, 0xFFFF }, { 0x0B1A, 0x1411, 0x1619, 0x073D, 0xFFFF }, { 0x1331,
+		0x1603, 0x1515, 0x1907, 0x112B, 0xFFFF }, { 0x0A12, 0x0834, 0x053A, 0x1501, 0x1233, 0x1325, 0x0F37, 0x171B, 0x102D,		0x181D, 0xFFFF }, { 0x0926, 0x072E, 0x1229, 0xFFFF }, { 0x1421, 0x0D39, 0x1E0B, 0x1A1F, 0xFFFF }, { 0x0A10, 0x1513,
+		0x1617, 0xFFFF }, { 0x0B18, 0x082C, 0x1135, 0x1805, 0x1327, 0xFFFF }, { 0x0932, 0x0B02, 0x1423, 0x0B3B, 0x0F2F, 0xFFFF		}, { 0x0928, 0x0C1C, 0x1511, 0x1719, 0x1C09, 0xFFFF }, { 0x0A20, 0x1703, 0x1615, 0x1A07, 0x122B, 0x083D, 0xFFFF }, {
+		0x1431, 0x1601, 0x1333, 0x112D, 0x191D, 0xFFFF }, { 0x0B16, 0x043C, 0x0D1E, 0x1425, 0x1037, 0x1329, 0x181B, 0xFFFF }, {		0x0A22, 0x0C04, 0x0F0A, 0x1521, 0x0E39, 0x1F0B, 0x1B1F, 0xFFFF }, { 0x1613, 0x1717, 0xFFFF }, { 0x092A, 0x1235, 0x1905,
+		0xFFFF }, { 0x0A30, 0x0B14, 0x0836, 0x0E08, 0x1523, 0x1427, 0xFFFF }, { 0x0B00, 0x0A24, 0x0D06, 0x0738, 0x1611, 0x1D09,		0x0C3B, 0x102F, 0xFFFF }, { 0x0C1A, 0x1715, 0x1B07, 0x1819, 0x132B, 0x093D, 0xFFFF }, { 0x1531, 0x1701, 0x1803, 0x122D,
+		0x1A1D, 0xFFFF }, { 0x0B12, 0x0934, 0x063A, 0x1433, 0x1525, 0x1137, 0x191B, 0xFFFF }, { 0x0A26, 0x003E, 0x082E, 0x1621,		0x0F39, 0x1429, 0x003F, 0xFFFF }, { 0x1713, 0x1C1F, 0xFFFF }, { 0x0B10, 0x1335, 0x1A05, 0x1817, 0xFFFF }, { 0x0C18,
+		0x092C, 0x1623, 0x1527, 0xFFFF }, { 0x0A32, 0x0C02, 0x1711, 0x1E09, 0x0D3B, 0x112F, 0xFFFF }, { 0x0A28, 0x0D1C, 0x1919,		0x0A3D, 0xFFFF }, { 0x0B20, 0x1631, 0x1903, 0x1815, 0x1C07, 0x142B, 0x132D, 0x1B1D, 0xFFFF }, { 0x1801, 0x1533, 0x1625,
+		0x1237, 0x1A1B, 0xFFFF }, { 0x0C16, 0x053C, 0x0E1E, 0x1721, 0x1529, 0x013F, 0xFFFF }, { 0x0B22, 0x0D04, 0x1039, 0x1D1F,		0xFFFF }, { 0x1813, 0x1B05, 0x1917, 0xFFFF }, { 0x0A2A, 0x1723, 0x1435, 0x1627, 0xFFFF }, { 0x0B30, 0x0C14, 0x0936,
+		0x0F08, 0x1F09, 0x0E3B, 0x122F, 0xFFFF }, { 0x0C00, 0x0B24, 0x0E06, 0x0838, 0x1811, 0x1A19, 0x0B3D, 0xFFFF }, { 0x0D1A,		0x1731, 0x1A03, 0x1915, 0x1D07, 0x152B, 0xFFFF }, { 0x1901, 0x1633, 0x1725, 0x1337, 0x1B1B, 0x142D, 0x1C1D, 0xFFFF }, {
+		0x0C12, 0x0A34, 0x073A, 0x1629, 0x023F, 0xFFFF }, { 0x0B26, 0x013E, 0x092E, 0x1821, 0x1139, 0x1E1F, 0xFFFF }, { 0x1913,		0x1A17, 0xFFFF }, { 0x0C10, 0x1535, 0x1C05, 0x1727, 0xFFFF }, { 0x0D18, 0x0A2C, 0x1823, 0x0F3B, 0x132F, 0xFFFF }, {
+		0x0B32, 0x0D02, 0x1911, 0x1B19, 0xFFFF }, { 0x0B28, 0x0E1C, 0x1B03, 0x1A15, 0x1E07, 0x162B, 0x0C3D, 0xFFFF }, { 0x0C20,		0x1831, 0x1A01, 0x1733, 0x152D, 0x1D1D, 0xFFFF }, { 0x1825, 0x1437, 0x1729, 0x1C1B, 0x033F, 0xFFFF }, { 0x0D16, 0x063C,
+		0x0F1E, 0x1921, 0x1239, 0x1F1F, 0xFFFF }, { 0x0C22, 0x0E04, 0x1A13, 0x1B17, 0xFFFF }, { 0x1635, 0x1D05, 0xFFFF }, {		0x0B2A, 0x1923, 0x1827, 0xFFFF }, { 0x0C30, 0x0D14, 0x0A36, 0x1A11, 0x103B, 0x142F, 0xFFFF }, { 0x0D00, 0x0C24, 0x0F06,
+		0x0938, 0x1B15, 0x1F07, 0x1C19, 0x172B, 0x0D3D, 0xFFFF }, { 0x0E1A, 0x1931, 0x1B01, 0x1C03, 0x162D, 0x1E1D, 0xFFFF }, {		0x1833, 0x1925, 0x1537, 0x1D1B, 0xFFFF }, { 0x0D12, 0x0B34, 0x083A, 0x1A21, 0x1339, 0x1829, 0x043F, 0xFFFF }, { 0x0C26,
+		0x023E, 0x0A2E, 0x1B13, 0xFFFF }, { 0x1735, 0x1E05, 0x1C17, 0xFFFF }, { 0x0D10, 0x1A23, 0x1927, 0xFFFF }, { 0x0E18,		0x0B2C, 0x1B11, 0x113B, 0x152F, 0xFFFF }, { 0x0C32, 0x0E02, 0x1D19, 0x0E3D, 0xFFFF }, { 0x0C28, 0x0F1C, 0x1A31, 0x1D03,
+		0x1C15, 0x182B, 0x172D, 0x1F1D, 0xFFFF }, { 0x0D20, 0x1C01, 0x1933, 0x1A25, 0x1637, 0x1E1B, 0xFFFF }, { 0x1B21, 0x1929,		0x053F, 0xFFFF }, { 0x0E16, 0x073C, 0x1439, 0xFFFF }, { 0x0D22, 0x0F04, 0x1C13, 0x1F05, 0x1D17, 0xFFFF }, { 0x1B23,
+		0x1835, 0x1A27, 0xFFFF }, { 0x0C2A, 0x123B, 0x162F, 0xFFFF }, { 0x0D30, 0x0E14, 0x0B36, 0x1C11, 0x1E19, 0x0F3D, 0xFFFF		}, { 0x0E00, 0x0D24, 0x0A38, 0x1B31, 0x1E03, 0x1D15, 0x192B, 0xFFFF }, { 0x0F1A, 0x1D01, 0x1A33, 0x1B25, 0x1737, 0x1F1B,
+		0x182D, 0xFFFF }, { 0x1A29, 0x063F, 0xFFFF }, { 0x0E12, 0x0C34, 0x093A, 0x1C21, 0x1539, 0xFFFF }, { 0x0D26, 0x033E,		0x0B2E, 0x1D13, 0x1E17, 0xFFFF }, { 0x1935, 0x1B27, 0xFFFF }, { 0x0E10, 0x1C23, 0x133B, 0x172F, 0xFFFF }, { 0x0F18,
+		0x0C2C, 0x1D11, 0x1F19, 0xFFFF }, { 0x0D32, 0x0F02, 0x1F03, 0x1E15, 0x1A2B, 0x103D, 0xFFFF }, { 0x0D28, 0x1C31, 0x1E01,		0x1B33, 0x192D, 0xFFFF }, { 0x0E20, 0x1C25, 0x1837, 0x1B29, 0x073F, 0xFFFF }, { 0x1D21, 0x1639, 0xFFFF }, { 0x0F16,
+		0x083C, 0x1E13, 0x1F17, 0xFFFF }, { 0x0E22, 0x1A35, 0xFFFF }, { 0x1D23, 0x1C27, 0xFFFF }, { 0x0D2A, 0x1E11, 0x143B,		0x182F, 0xFFFF }, { 0x0E30, 0x0F14, 0x0C36, 0x1F15, 0x1B2B, 0x113D, 0xFFFF }, { 0x0F00, 0x0E24, 0x0B38, 0x1D31, 0x1F01,
+		0x1A2D, 0xFFFF }, { 0x1C33, 0x1D25, 0x1937, 0xFFFF }, { 0x1E21, 0x1739, 0x1C29, 0x083F, 0xFFFF }, { 0x0F12, 0x0D34,		0x0A3A, 0x1F13, 0xFFFF }, { 0x0E26, 0x043E, 0x0C2E, 0x1B35, 0xFFFF }, { 0x1E23, 0x1D27, 0xFFFF }, { 0x0F10, 0x1F11,		0x153B, 0x192F, 0xFFFF }, { 0x0D2C, 0x123D, 0xFFFF },
+	};
+
+	// Decodes one ETC1 component: widens packed_c to 8 bits (5-bit input when diff is set, 4-bit
+	// otherwise), adds g_etc1_inten_tables[inten][selector] and clamps the sum to 0..255.
+	static uint32_t etc1_decode_value(uint32_t diff, uint32_t inten, uint32_t selector, uint32_t packed_c)
+	{
+		const uint32_t num_packed = diff ? 32 : 16;
+		BASISU_NOTE_UNUSED(num_packed);
+		assert((diff < 2) && (inten < 8) && (selector < 4) && (packed_c < num_packed));
+
+		// Bit replication fills the low bits of the 8-bit value with the top bits of packed_c.
+		const int base = static_cast<int>(diff
+			? ((packed_c >> 2) | (packed_c << 3))
+			: (packed_c | (packed_c << 4)));
+		return clamp<int>(base + g_etc1_inten_tables[inten][selector], 0, 255);
+	}
+
+	// Builds g_etc1_inverse_lookup. For every diff/inten/selector combination and every 8-bit
+	// target value it stores the closest packed base value (low byte) and its abs error (high byte).
+	void pack_etc1_solid_color_init()
+	{
+		for (uint32_t table_index = 0; table_index < 2 * 8 * 4; table_index++)
+		{
+			// table_index = diff + (inten << 1) + (selector << 4)
+			const uint32_t diff = table_index & 1;
+			const uint32_t inten = (table_index >> 1) & 7;
+			const uint32_t selector = table_index >> 4;
+			const uint32_t num_packed = diff ? 32 : 16;
+
+			for (uint32_t target = 0; target < 256; target++)
+			{
+				uint32_t lowest_err = UINT32_MAX, lowest_packed = 0;
+
+				// The scan stops as soon as an exact match (zero error) has been found.
+				for (uint32_t candidate = 0; (candidate < num_packed) && lowest_err; candidate++)
+				{
+					const int decoded = etc1_decode_value(diff, inten, selector, candidate);
+					const uint32_t err = (uint32_t)labs(decoded - static_cast<int>(target));
+					if (err >= lowest_err)
+						continue;
+
+					lowest_err = err;
+					lowest_packed = candidate;
+				}
+
+				assert(lowest_err <= 255);
+				g_etc1_inverse_lookup[table_index][target] = static_cast<uint16_t>(lowest_packed | (lowest_err << 8));
+			}
+		}
+	}
+
+	// Packs solid color blocks efficiently using a set of small precomputed tables.
+	// For random 888 inputs, MSE results are better than Erricson's ETC1 packer in "slow" mode ~9.5% of the time, is slightly worse only ~.01% of the time, and is equal the rest of the time.
+	uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor)
+	{
+		assert(g_etc1_inverse_lookup[0][255]);
+
+		static uint32_t s_next_comp[4] = { 1, 2, 0, 1 };
+
+		uint32_t best_error = UINT32_MAX, best_i = 0;
+		int best_x = 0, best_packed_c1 = 0, best_packed_c2 = 0;
+
+		// For each possible 8-bit value, there is a precomputed list of diff/inten/selector configurations that allow that 8-bit value to be encoded with no error.
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			const uint32_t c1 = pColor[s_next_comp[i]], c2 = pColor[s_next_comp[i + 1]];
+
+			const int delta_range = 1;
+			for (int delta = -delta_range; delta <= delta_range; delta++)
+			{
+				const int c_plus_delta = clamp<int>(pColor[i] + delta, 0, 255);
+
+				const uint16_t* pTable;
+				if (!c_plus_delta)
+					pTable = g_etc1_color8_to_etc_block_config_0_255[0];
+				else if (c_plus_delta == 255)
+					pTable = g_etc1_color8_to_etc_block_config_0_255[1];
+				else
+					pTable = g_etc1_color8_to_etc_block_config_1_to_254[c_plus_delta - 1];
+
+				do
+				{
+					const uint32_t x = *pTable++;
+
+#ifdef _DEBUG
+					const uint32_t diff = x & 1;
+					const uint32_t inten = (x >> 1) & 7;
+					const uint32_t selector = (x >> 4) & 3;
+					const uint32_t p0 = (x >> 8) & 255;
+					assert(etc1_decode_value(diff, inten, selector, p0) == (uint32_t)c_plus_delta);
+#endif
+
+					const uint16_t* pInverse_table = g_etc1_inverse_lookup[x & 0xFF];
+					uint16_t p1 = pInverse_table[c1];
+					uint16_t p2 = pInverse_table[c2];
+					const uint32_t trial_error = square(c_plus_delta - pColor[i]) + square(p1 >> 8) + square(p2 >> 8);
+					if (trial_error < best_error)
+					{
+						best_error = trial_error;
+						best_x = x;
+						best_packed_c1 = p1 & 0xFF;
+						best_packed_c2 = p2 & 0xFF;
+						best_i = i;
+						if (!best_error)
+							goto found_perfect_match;
+					}
+				} while (*pTable != 0xFFFF);
+			}
+		}
+	found_perfect_match:
+
+		const uint32_t diff = best_x & 1;
+		const uint32_t inten = (best_x >> 1) & 7;
+
+		block.m_bytes[3] = static_cast<uint8_t>(((inten | (inten << 3)) << 2) | (diff << 1));
+
+		const uint32_t etc1_selector = g_selector_index_to_etc1[(best_x >> 4) & 3];
+		*reinterpret_cast<uint16_t*>(&block.m_bytes[4]) = (etc1_selector & 2) ? 0xFFFF : 0;
+		*reinterpret_cast<uint16_t*>(&block.m_bytes[6]) = (etc1_selector & 1) ? 0xFFFF : 0;
+
+		const uint32_t best_packed_c0 = (best_x >> 8) & 255;
+		if (diff)
+		{
+			block.m_bytes[best_i] = static_cast<uint8_t>(best_packed_c0 << 3);
+			block.m_bytes[s_next_comp[best_i]] = static_cast<uint8_t>(best_packed_c1 << 3);
+			block.m_bytes[s_next_comp[best_i + 1]] = static_cast<uint8_t>(best_packed_c2 << 3);
+		}
+		else
+		{
+			block.m_bytes[best_i] = static_cast<uint8_t>(best_packed_c0 | (best_packed_c0 << 4));
+			block.m_bytes[s_next_comp[best_i]] = static_cast<uint8_t>(best_packed_c1 | (best_packed_c1 << 4));
+			block.m_bytes[s_next_comp[best_i + 1]] = static_cast<uint8_t>(best_packed_c2 | (best_packed_c2 << 4));
+		}
+
+		return best_error;
+	}
+	
+	const uint32_t BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE = 165;
+
+	static const struct { uint8_t m_v[4]; } g_cluster_fit_order_tab[BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE] =
+	{
+		{ { 0, 0, 0, 8 } },{ { 0, 5, 2, 1 } },{ { 0, 6, 1, 1 } },{ { 0, 7, 0, 1 } },{ { 0, 7, 1, 0 } },
+		{ { 0, 0, 8, 0 } },{ { 0, 0, 3, 5 } },{ { 0, 1, 7, 0 } },{ { 0, 0, 4, 4 } },{ { 0, 0, 2, 6 } },
+		{ { 0, 0, 7, 1 } },{ { 0, 0, 1, 7 } },{ { 0, 0, 5, 3 } },{ { 1, 6, 0, 1 } },{ { 0, 0, 6, 2 } },
+		{ { 0, 2, 6, 0 } },{ { 2, 4, 2, 0 } },{ { 0, 3, 5, 0 } },{ { 3, 3, 1, 1 } },{ { 4, 2, 0, 2 } },
+		{ { 1, 5, 2, 0 } },{ { 0, 5, 3, 0 } },{ { 0, 6, 2, 0 } },{ { 2, 4, 1, 1 } },{ { 5, 1, 0, 2 } },
+		{ { 6, 1, 1, 0 } },{ { 3, 3, 0, 2 } },{ { 6, 0, 0, 2 } },{ { 0, 8, 0, 0 } },{ { 6, 1, 0, 1 } },
+		{ { 0, 1, 6, 1 } },{ { 1, 6, 1, 0 } },{ { 4, 1, 3, 0 } },{ { 0, 2, 5, 1 } },{ { 5, 0, 3, 0 } },
+		{ { 5, 3, 0, 0 } },{ { 0, 1, 5, 2 } },{ { 0, 3, 4, 1 } },{ { 2, 5, 1, 0 } },{ { 1, 7, 0, 0 } },
+		{ { 0, 1, 4, 3 } },{ { 6, 0, 2, 0 } },{ { 0, 4, 4, 0 } },{ { 2, 6, 0, 0 } },{ { 0, 2, 4, 2 } },
+		{ { 0, 5, 1, 2 } },{ { 0, 6, 0, 2 } },{ { 3, 5, 0, 0 } },{ { 0, 4, 3, 1 } },{ { 3, 4, 1, 0 } },
+		{ { 4, 3, 1, 0 } },{ { 1, 5, 0, 2 } },{ { 0, 3, 3, 2 } },{ { 1, 4, 1, 2 } },{ { 0, 4, 2, 2 } },
+		{ { 2, 3, 3, 0 } },{ { 4, 4, 0, 0 } },{ { 1, 2, 4, 1 } },{ { 0, 5, 0, 3 } },{ { 0, 1, 3, 4 } },
+		{ { 1, 5, 1, 1 } },{ { 1, 4, 2, 1 } },{ { 1, 3, 2, 2 } },{ { 5, 2, 1, 0 } },{ { 1, 3, 3, 1 } },
+		{ { 0, 1, 2, 5 } },{ { 1, 1, 5, 1 } },{ { 0, 3, 2, 3 } },{ { 2, 5, 0, 1 } },{ { 3, 2, 2, 1 } },
+		{ { 2, 3, 0, 3 } },{ { 1, 4, 3, 0 } },{ { 2, 2, 1, 3 } },{ { 6, 2, 0, 0 } },{ { 1, 0, 6, 1 } },
+		{ { 3, 3, 2, 0 } },{ { 7, 1, 0, 0 } },{ { 3, 1, 4, 0 } },{ { 0, 2, 3, 3 } },{ { 0, 4, 1, 3 } },
+		{ { 0, 4, 0, 4 } },{ { 0, 1, 0, 7 } },{ { 2, 0, 5, 1 } },{ { 2, 0, 4, 2 } },{ { 3, 0, 2, 3 } },
+		{ { 2, 2, 4, 0 } },{ { 2, 2, 3, 1 } },{ { 4, 0, 3, 1 } },{ { 3, 2, 3, 0 } },{ { 2, 3, 2, 1 } },
+		{ { 1, 3, 4, 0 } },{ { 7, 0, 1, 0 } },{ { 3, 0, 4, 1 } },{ { 1, 0, 5, 2 } },{ { 8, 0, 0, 0 } },
+		{ { 3, 0, 1, 4 } },{ { 4, 1, 1, 2 } },{ { 4, 0, 2, 2 } },{ { 1, 2, 5, 0 } },{ { 4, 2, 1, 1 } },
+		{ { 3, 4, 0, 1 } },{ { 2, 0, 3, 3 } },{ { 5, 0, 1, 2 } },{ { 5, 0, 0, 3 } },{ { 2, 4, 0, 2 } },
+		{ { 2, 1, 4, 1 } },{ { 4, 0, 1, 3 } },{ { 2, 1, 5, 0 } },{ { 4, 2, 2, 0 } },{ { 4, 0, 4, 0 } },
+		{ { 1, 0, 4, 3 } },{ { 1, 4, 0, 3 } },{ { 3, 0, 3, 2 } },{ { 4, 3, 0, 1 } },{ { 0, 1, 1, 6 } },
+		{ { 1, 3, 1, 3 } },{ { 0, 2, 2, 4 } },{ { 2, 0, 2, 4 } },{ { 5, 1, 1, 1 } },{ { 3, 0, 5, 0 } },
+		{ { 2, 3, 1, 2 } },{ { 3, 0, 0, 5 } },{ { 0, 3, 1, 4 } },{ { 5, 0, 2, 1 } },{ { 2, 1, 3, 2 } },
+		{ { 2, 0, 6, 0 } },{ { 3, 1, 3, 1 } },{ { 5, 1, 2, 0 } },{ { 1, 0, 3, 4 } },{ { 1, 1, 6, 0 } },
+		{ { 4, 0, 0, 4 } },{ { 2, 0, 1, 5 } },{ { 0, 3, 0, 5 } },{ { 1, 3, 0, 4 } },{ { 4, 1, 2, 1 } },
+		{ { 1, 2, 3, 2 } },{ { 3, 1, 0, 4 } },{ { 5, 2, 0, 1 } },{ { 1, 2, 2, 3 } },{ { 3, 2, 1, 2 } },
+		{ { 2, 2, 2, 2 } },{ { 6, 0, 1, 1 } },{ { 1, 2, 1, 4 } },{ { 1, 1, 4, 2 } },{ { 3, 2, 0, 3 } },
+		{ { 1, 2, 0, 5 } },{ { 1, 0, 7, 0 } },{ { 3, 1, 2, 2 } },{ { 1, 0, 2, 5 } },{ { 2, 0, 0, 6 } },
+		{ { 2, 1, 1, 4 } },{ { 2, 2, 0, 4 } },{ { 1, 1, 3, 3 } },{ { 7, 0, 0, 1 } },{ { 1, 0, 0, 7 } },
+		{ { 2, 1, 2, 3 } },{ { 4, 1, 0, 3 } },{ { 3, 1, 1, 3 } },{ { 1, 1, 2, 4 } },{ { 2, 1, 0, 5 } },
+		{ { 1, 0, 1, 6 } },{ { 0, 2, 1, 5 } },{ { 0, 2, 0, 6 } },{ { 1, 1, 1, 5 } },{ { 1, 1, 0, 6 } }
+	};
+		
+	const int g_etc1_inten_tables[cETC1IntenModifierValues][cETC1SelectorValues] = // [intensity table index][selector index] -> signed modifier added to a base color component
+	{
+		{ -8,  -2,   2,   8 }, { -17,  -5,  5,  17 }, { -29,  -9,   9,  29 }, {  -42, -13, 13,  42 },
+		{ -60, -18, 18,  60 }, { -80, -24, 24,  80 }, { -106, -33, 33, 106 }, { -183, -47, 47, 183 }
+	};
+
+	const uint8_t g_etc1_to_selector_index[cETC1SelectorValues] = { 2, 3, 1, 0 }; // inverse permutation of g_selector_index_to_etc1
+	const uint8_t g_selector_index_to_etc1[cETC1SelectorValues] = { 3, 2, 0, 1 }; // inverse permutation of g_etc1_to_selector_index
+
+	// [flip][subblock][pixel_index]: coordinate pairs of the 8 pixels belonging to each subblock of a 4x4 block.
+	const etc_coord2 g_etc1_pixel_coords[2][2][8] =
+	{
+		{ // flip = 0: the subblocks split on the first coordinate (0-1 vs 2-3)
+		  {
+			 { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 },
+			 { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 }
+		  },
+		  {
+			 { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 },
+			 { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 }
+		  }
+		},
+		{ // flip = 1: the subblocks split on the second coordinate (0-1 vs 2-3)
+		  {
+			 { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 },
+			 { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 }
+		  },
+		  {
+			 { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 },
+			 { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 }
+		  },
+		}
+	};
+
+	// [flip][subblock][pixel_index]: the same pixels as g_etc1_pixel_coords, flattened as first + 4 * second coordinate.
+	const uint32_t g_etc1_pixel_indices[2][2][8] =
+	{
+		{ // flip = 0
+			{
+				0 + 4 * 0, 0 + 4 * 1, 0 + 4 * 2, 0 + 4 * 3,
+				1 + 4 * 0, 1 + 4 * 1, 1 + 4 * 2, 1 + 4 * 3
+			},
+			{
+				2 + 4 * 0, 2 + 4 * 1, 2 + 4 * 2, 2 + 4 * 3,
+				3 + 4 * 0, 3 + 4 * 1, 3 + 4 * 2, 3 + 4 * 3
+			}
+		},
+		{ // flip = 1
+			{
+				0 + 4 * 0, 1 + 4 * 0, 2 + 4 * 0, 3 + 4 * 0,
+				0 + 4 * 1, 1 + 4 * 1, 2 + 4 * 1, 3 + 4 * 1
+			},
+			{
+				0 + 4 * 2, 1 + 4 * 2, 2 + 4 * 2, 3 + 4 * 2,
+				0 + 4 * 3, 1 + 4 * 3, 2 + 4 * 3, 3 + 4 * 3
+			},
+		}
+	};
+
+	uint16_t etc_block::pack_color5(const color_rgba& color, bool scaled, uint32_t bias)
+	{
+		return pack_color5(color.r, color.g, color.b, scaled, bias); // forwards r, g, b to the per-component overload; color.a is not used
+	}
+
+	// Packs three components as (r << 10) | (g << 5) | b, with each value limited to 31.
+	// With 'scaled' set, the inputs are first rescaled via (v * 31 + bias) / 255.
+	uint16_t etc_block::pack_color5(uint32_t r, uint32_t g, uint32_t b, bool scaled, uint32_t bias)
+	{
+		uint32_t comps[3] = { r, g, b };
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			if (scaled)
+				comps[i] = (comps[i] * 31U + bias) / 255U;
+			comps[i] = minimum(comps[i], 31U);
+		}
+
+		return static_cast<uint16_t>((comps[0] << 10U) | (comps[1] << 5U) | comps[2]);
+	}
+
+	// Unpacks b from bits 0-4, g from bits 5-9 and r from bits 10-14 of packed_color5.
+	// With 'scaled' set, each 5-bit value v becomes (v << 3) | (v >> 2). Alpha is limited to 255.
+	color_rgba etc_block::unpack_color5(uint16_t packed_color5, bool scaled, uint32_t alpha)
+	{
+		uint32_t comps[3]; // r, g, b in that order
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			const uint32_t v5 = (packed_color5 >> (10U - 5U * i)) & 31U;
+			comps[i] = scaled ? ((v5 << 3U) | (v5 >> 2U)) : v5;
+		}
+
+		const uint32_t a = minimum(alpha, 255U);
+		return color_rgba(cNoClamp, comps[0], comps[1], comps[2], a);
+	}
+
+	void etc_block::unpack_color5(color_rgba& result, uint16_t packed_color5, bool scaled)
+	{
+		result = unpack_color5(packed_color5, scaled, 255); // value-returning overload with alpha fixed at 255
+	}
+
+	void etc_block::unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, bool scaled)
+	{
+		const color_rgba unpacked = unpack_color5(packed_color5, scaled, 0); // alpha (0) is discarded below
+		r = unpacked.r;
+		g = unpacked.g;
+		b = unpacked.b;
+	}
+
+	// Applies a packed signed delta to a packed 5-bit base color and stores the outcome in 'result'.
+	// Returns false if any component left 0..31 (it is then clamped into that range). With 'scaled'
+	// set, each 5-bit value v becomes (v << 3) | (v >> 2). Alpha is limited to 255.
+	bool etc_block::unpack_color5(color_rgba& result, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha)
+	{
+		color_rgba_i16 delta(unpack_delta3(packed_delta3));
+
+		const int raw_r = ((packed_color5 >> 10U) & 31U) + delta.r;
+		const int raw_g = ((packed_color5 >> 5U) & 31U) + delta.g;
+		const int raw_b = (packed_color5 & 31U) + delta.b;
+
+		// Clamping alters a value only when it was out of range, which is what flags failure.
+		int r = clamp<int>(raw_r, 0, 31);
+		int g = clamp<int>(raw_g, 0, 31);
+		int b = clamp<int>(raw_b, 0, 31);
+		const bool in_range = (r == raw_r) && (g == raw_g) && (b == raw_b);
+
+		if (scaled)
+		{
+			r = (r << 3U) | (r >> 2U);
+			g = (g << 3U) | (g >> 2U);
+			b = (b << 3U) | (b >> 2U);
+		}
+
+		result.set_noclamp_rgba(r, g, b, minimum(alpha, 255U));
+		return in_range;
+	}
+
+	bool etc_block::unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha)
+	{
+		color_rgba unpacked; // filled by the color_rgba overload; only r, g, b are handed back
+		const bool in_range = unpack_color5(unpacked, packed_color5, packed_delta3, scaled, alpha);
+		r = unpacked.r;
+		g = unpacked.g;
+		b = unpacked.b;
+		return in_range;
+	}
+
+	uint16_t etc_block::pack_delta3(const color_rgba_i16& color)
+	{
+		return pack_delta3(color.r, color.g, color.b); // forwards r, g, b to the per-component overload
+	}
+
+	// Packs three signed deltas as (r << 6) | (g << 3) | b; negative values are stored as value + 8.
+	uint16_t etc_block::pack_delta3(int r, int g, int b)
+	{
+		assert((r >= cETC1ColorDeltaMin) && (g >= cETC1ColorDeltaMin) && (b >= cETC1ColorDeltaMin));
+		assert((r <= cETC1ColorDeltaMax) && (g <= cETC1ColorDeltaMax) && (b <= cETC1ColorDeltaMax));
+		const int stored_r = (r < 0) ? (r + 8) : r;
+		const int stored_g = (g < 0) ? (g + 8) : g;
+		const int stored_b = (b < 0) ? (b + 8) : b;
+		return static_cast<uint16_t>((stored_r << 6) | (stored_g << 3) | stored_b);
+	}
+
+	// Unpacks r (bits 6-8), g (bits 3-5) and b (bits 0-2) from packed_delta3.
+	// Raw values 4..7 are mapped to -4..-1; the fourth constructor argument is 255.
+	color_rgba_i16 etc_block::unpack_delta3(uint16_t packed_delta3)
+	{
+		int r = (packed_delta3 >> 6) & 7, g = (packed_delta3 >> 3) & 7, b = packed_delta3 & 7;
+		r -= (r >= 4) ? 8 : 0;
+		g -= (g >= 4) ? 8 : 0;
+		b -= (b >= 4) ? 8 : 0;
+		return color_rgba_i16(r, g, b, 255);
+	}
+
+	// Reference-output flavour: writes the signed deltas (each in -4..3) to r, g and b.
+	void etc_block::unpack_delta3(int& r, int& g, int& b, uint16_t packed_delta3)
+	{
+		const int raw_r = (packed_delta3 >> 6) & 7, raw_g = (packed_delta3 >> 3) & 7, raw_b = packed_delta3 & 7;
+
+		r = (raw_r >= 4) ? (raw_r - 8) : raw_r;
+		g = (raw_g >= 4) ? (raw_g - 8) : raw_g;
+		b = (raw_b >= 4) ? (raw_b - 8) : raw_b;
+	}
+
+	uint16_t etc_block::pack_color4(const color_rgba& color, bool scaled, uint32_t bias)
+	{
+		return pack_color4(color.r, color.g, color.b, scaled, bias); // forwards r, g, b to the per-component overload; color.a is not used
+	}
+
+	// Packs three components as (r << 8) | (g << 4) | b, with each value limited to 15.
+	// With 'scaled' set, the inputs are first rescaled via (v * 15 + bias) / 255.
+	uint16_t etc_block::pack_color4(uint32_t r, uint32_t g, uint32_t b, bool scaled, uint32_t bias)
+	{
+		uint32_t comps[3] = { r, g, b };
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			if (scaled)
+				comps[i] = (comps[i] * 15U + bias) / 255U;
+			comps[i] = minimum(comps[i], 15U);
+		}
+
+		return static_cast<uint16_t>((comps[0] << 8U) | (comps[1] << 4U) | comps[2]);
+	}
+
+	// Unpacks b from bits 0-3, g from bits 4-7 and r from bits 8-11 of packed_color4.
+	// With 'scaled' set, each 4-bit value v becomes (v << 4) | v. Alpha is limited to 255.
+	color_rgba etc_block::unpack_color4(uint16_t packed_color4, bool scaled, uint32_t alpha)
+	{
+		uint32_t comps[3]; // r, g, b in that order
+
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			const uint32_t v4 = (packed_color4 >> (8U - 4U * i)) & 15U;
+			comps[i] = scaled ? ((v4 << 4U) | v4) : v4;
+		}
+
+		const uint32_t a = minimum(alpha, 255U);
+		return color_rgba(cNoClamp, comps[0], comps[1], comps[2], a);
+	}
+
+	void etc_block::unpack_color4(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color4, bool scaled)
+	{
+		const color_rgba unpacked = unpack_color4(packed_color4, scaled, 0); // alpha (0) is discarded below
+		r = unpacked.r;
+		g = unpacked.g;
+		b = unpacked.b;
+	}
+
+	// Writes four colors to pDst[0..3]: the base color (packed_color5 unpacked with scaling, i.e.
+	// 8 bits per component) plus each of the four modifiers of intensity table 'table_idx',
+	// added equally to r, g and b. pDst[s] uses g_etc1_inten_tables[table_idx][s]; the alpha
+	// argument is always 255.
+	void etc_block::get_diff_subblock_colors(color_rgba* pDst, uint16_t packed_color5, uint32_t table_idx)
+	{
+		assert(table_idx < cETC1IntenModifierValues);
+
+		uint32_t r, g, b;
+		unpack_color5(r, g, b, packed_color5, true);
+		const int base_r = static_cast<int>(r);
+		const int base_g = static_cast<int>(g);
+		const int base_b = static_cast<int>(b);
+
+		const int *pModifiers = g_etc1_inten_tables[table_idx];
+		for (uint32_t s = 0; s < 4; s++)
+		{
+			// The sums are handed to set() as-is; they can fall outside 0..255.
+			const int m = pModifiers[s];
+			pDst[s].set(base_r + m, base_g + m, base_b + m, 255);
+		}
+	}
+
+	// Same as the overload without a delta, except that the base color is packed_color5 with
+	// packed_delta3 applied. Writes four colors to pDst[0..3], where pDst[s] uses
+	// g_etc1_inten_tables[table_idx][s], and returns the success flag reported by
+	// unpack_color5() for the base + delta combination.
+	bool etc_block::get_diff_subblock_colors(color_rgba* pDst, uint16_t packed_color5, uint16_t packed_delta3, uint32_t table_idx)
+	{
+		assert(table_idx < cETC1IntenModifierValues);
+
+		uint32_t r, g, b;
+		const bool in_range = unpack_color5(r, g, b, packed_color5, packed_delta3, true);
+		const int base_r = static_cast<int>(r);
+		const int base_g = static_cast<int>(g);
+		const int base_b = static_cast<int>(b);
+
+		const int *pModifiers = g_etc1_inten_tables[table_idx];
+		for (uint32_t s = 0; s < 4; s++)
+		{
+			// The sums are handed to set() as-is; they can fall outside 0..255.
+			const int m = pModifiers[s];
+			pDst[s].set(base_r + m, base_g + m, base_b + m, 255);
+		}
+
+		return in_range;
+	}
+
+	void etc_block::get_abs_subblock_colors(color_rgba* pDst, uint16_t packed_color4, uint32_t table_idx)
+	{
+		// Fills pDst[0..3] with the four colors of a subblock whose base color is 'packed_color4':
+		// the base is unpacked in scaled form via unpack_color4(), then each entry of intensity table
+		// 'table_idx' is added to all three channels before the result is handed to set().
+		assert(table_idx < cETC1IntenModifierValues);
+		const int* pModifiers = g_etc1_inten_tables[table_idx];
+
+		uint32_t base_r, base_g, base_b;
+		unpack_color4(base_r, base_g, base_b, packed_color4, true);
+
+		const int ir = static_cast<int>(base_r);
+		const int ig = static_cast<int>(base_g);
+		const int ib = static_cast<int>(base_b);
+
+		// One output color per intensity-table entry; alpha is always passed as 255.
+		for (uint32_t i = 0; i < 4; i++)
+		{
+			const int y = pModifiers[i];
+			pDst[i].set(ir + y, ig + y, ib + y, 255);
+		}
+	}
+	// Decodes 'block' into 16 pixels at pDst, stored as four rows of four. Returns false, before writing any pixel, if the differential colors fail to unpack.
+	bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha)
+	{
+		const bool diff_flag = block.get_diff_bit();
+		const bool flip_flag = block.get_flip_bit();
+		const uint32_t table_index0 = block.get_inten_table(0);
+		const uint32_t table_index1 = block.get_inten_table(1);
+		// The four candidate colors of each of the two subblocks.
+		color_rgba subblock_colors0[4];
+		color_rgba subblock_colors1[4];
+		// With the diff bit set, subblock 1 is derived from base_color5 and delta_color3; otherwise each subblock has its own base4 color.
+		if (diff_flag)
+		{
+			const uint16_t base_color5 = block.get_base5_color();
+			const uint16_t delta_color3 = block.get_delta3_color();
+			etc_block::get_diff_subblock_colors(subblock_colors0, base_color5, table_index0);
+			// Only the delta-based unpack reports failure; the whole decode is abandoned in that case.
+			if (!etc_block::get_diff_subblock_colors(subblock_colors1, base_color5, delta_color3, table_index1))
+				return false;
+		}
+		else
+		{
+			const uint16_t base_color4_0 = block.get_base4_color(0);
+			etc_block::get_abs_subblock_colors(subblock_colors0, base_color4_0, table_index0);
+
+			const uint16_t base_color4_1 = block.get_base4_color(1);
+			etc_block::get_abs_subblock_colors(subblock_colors1, base_color4_1, table_index1);
+		}
+		// preserve_alpha writes through set_rgb() instead of whole-color assignment - presumably so pDst's existing alpha survives (confirm against color_rgba::set_rgb).
+		if (preserve_alpha)
+		{
+			if (flip_flag)
+			{
+				for (uint32_t y = 0; y < 2; y++) // rows 0-1 come from subblock 0
+				{
+					pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
+					pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
+					pDst[2].set_rgb(subblock_colors0[block.get_selector(2, y)]);
+					pDst[3].set_rgb(subblock_colors0[block.get_selector(3, y)]);
+					pDst += 4;
+				}
+
+				for (uint32_t y = 2; y < 4; y++) // rows 2-3 come from subblock 1
+				{
+					pDst[0].set_rgb(subblock_colors1[block.get_selector(0, y)]);
+					pDst[1].set_rgb(subblock_colors1[block.get_selector(1, y)]);
+					pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
+					pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
+					pDst += 4;
+				}
+			}
+			else
+			{
+				for (uint32_t y = 0; y < 4; y++) // columns 0-1 come from subblock 0, columns 2-3 from subblock 1
+				{
+					pDst[0].set_rgb(subblock_colors0[block.get_selector(0, y)]);
+					pDst[1].set_rgb(subblock_colors0[block.get_selector(1, y)]);
+					pDst[2].set_rgb(subblock_colors1[block.get_selector(2, y)]);
+					pDst[3].set_rgb(subblock_colors1[block.get_selector(3, y)]);
+					pDst += 4;
+				}
+			}
+		}
+		else
+		{
+			if (flip_flag)
+			{
+				// 0000
+				// 0000
+				// 1111
+				// 1111
+				for (uint32_t y = 0; y < 2; y++)
+				{
+					pDst[0] = subblock_colors0[block.get_selector(0, y)];
+					pDst[1] = subblock_colors0[block.get_selector(1, y)];
+					pDst[2] = subblock_colors0[block.get_selector(2, y)];
+					pDst[3] = subblock_colors0[block.get_selector(3, y)];
+					pDst += 4;
+				}
+
+				for (uint32_t y = 2; y < 4; y++)
+				{
+					pDst[0] = subblock_colors1[block.get_selector(0, y)];
+					pDst[1] = subblock_colors1[block.get_selector(1, y)];
+					pDst[2] = subblock_colors1[block.get_selector(2, y)];
+					pDst[3] = subblock_colors1[block.get_selector(3, y)];
+					pDst += 4;
+				}
+			}
+			else
+			{
+				// 0011
+				// 0011
+				// 0011
+				// 0011
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					pDst[0] = subblock_colors0[block.get_selector(0, y)];
+					pDst[1] = subblock_colors0[block.get_selector(1, y)];
+					pDst[2] = subblock_colors1[block.get_selector(2, y)];
+					pDst[3] = subblock_colors1[block.get_selector(3, y)];
+					pDst += 4;
+				}
+			}
+		}
+
+		return true;
+	}
+
+	inline int extend_6_to_8(uint32_t n) // Widens a 6-bit value (0..63) to 8 bits; n is not masked, so larger inputs give results above 255.
+	{
+		return (n << 2) | (n >> 4); // the two low bits are filled with n's top two bits, so 63 maps to 255
+	}
+
+	inline int extend_7_to_8(uint32_t n) // Widens a 7-bit value (0..127) to 8 bits; n is not masked, so larger inputs give results above 255.
+	{
+		return (n << 1) | (n >> 6); // the low bit is filled with n's top bit, so 127 maps to 255
+	}
+
+	inline int extend_4_to_8(uint32_t n) // Widens a 4-bit value (0..15) to 8 bits; n is not masked, so larger inputs give results above 255.
+	{
+		return (n << 4) | n; // the nibble is replicated into both halves of the byte, so 15 maps to 255
+	}
+		
+	uint64_t etc_block::evaluate_etc1_error(const color_rgba* pBlock_pixels, bool perceptual, int subblock_index) const
+	{
+		// Sums color_distance() between the caller's pixels and this block's decoded pixels.
+		// subblock_index < 0 covers all 16 pixels; otherwise only the 8 pixels that g_etc1_pixel_indices
+		// lists for that subblock under the block's flip bit are measured.
+		// NOTE(review): the result of unpack_etc1() is ignored here - confirm callers only pass decodable blocks.
+		color_rgba decoded[16];
+		unpack_etc1(*this, decoded);
+
+		uint64_t err_sum = 0;
+
+		if (subblock_index >= 0)
+		{
+			const bool flipped = get_flip_bit();
+			for (uint32_t k = 0; k < 8; k++)
+			{
+				const uint32_t pixel = g_etc1_pixel_indices[flipped][subblock_index][k];
+				err_sum += color_distance(perceptual, pBlock_pixels[pixel], decoded[pixel], false);
+			}
+			return err_sum;
+		}
+
+		for (uint32_t k = 0; k < 16; k++)
+			err_sum += color_distance(perceptual, pBlock_pixels[k], decoded[k], false);
+
+		return err_sum;
+	}
+
+	void etc_block::get_subblock_pixels(color_rgba* pPixels, int subblock_index) const
+	{
+		// subblock_index < 0: decodes the whole block straight into pPixels.
+		// Otherwise: decodes into a scratch block and copies the 8 pixels that g_etc1_pixel_indices lists for
+		// that subblock (under the block's flip bit) to pPixels[0..7], in table order.
+		if (subblock_index < 0)
+		{
+			unpack_etc1(*this, pPixels);
+			return;
+		}
+
+		color_rgba scratch[16];
+		unpack_etc1(*this, scratch);
+		const bool flipped = get_flip_bit();
+		for (uint32_t k = 0; k < 8; k++)
+		{
+			const uint32_t src = g_etc1_pixel_indices[flipped][subblock_index][k];
+			pPixels[k] = scratch[src];
+		}
+	}
+								
+	bool etc1_optimizer::compute()
+	{
+		// Runs the search configured by init() and publishes the best solution found to *m_pResult.
+		// Returns false without touching the results if forced selectors are used below cETCQualitySlow,
+		// or, after setting m_pResult->m_error to UINT32_MAX, if the search produced no valid solution.
+		assert(m_pResult->m_pSelectors);
+
+		const uint8_t* const pForced = m_pParams->m_pForce_selectors;
+		if (pForced)
+		{
+			assert(m_pParams->m_quality >= cETCQualitySlow);
+			if (m_pParams->m_quality < cETCQualitySlow)
+				return false;
+		}
+
+		const uint32_t num_pixels = m_pParams->m_num_src_pixels;
+
+		if (!m_pParams->m_cluster_fit)
+			compute_internal_neighborhood(m_br, m_bg, m_bb);
+		else
+		{
+			// The number of cluster-fit permutations tried grows with the quality level.
+			uint32_t perms = BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE;
+			if (m_pParams->m_quality == cETCQualityFast)
+				perms = 4;
+			else if (m_pParams->m_quality == cETCQualityMedium)
+				perms = 16;
+			else if (m_pParams->m_quality == cETCQualitySlow)
+				perms = 64;
+			compute_internal_cluster_fit(perms);
+		}
+
+		if (!m_best_solution.m_valid)
+		{
+			m_pResult->m_error = UINT32_MAX;
+			return false;
+		}
+
+		// Forced selectors, when supplied, are reported instead of the ones stored in the best solution.
+		const uint8_t* pSelectors = pForced ? pForced : &m_best_solution.m_selectors[0];
+
+#if defined(DEBUG) || defined(_DEBUG)
+		{
+			// Sanity check: recompute the error of the returned solution and compare it with the stored value.
+			color_rgba block_colors[4];
+			m_best_solution.m_coords.get_block_colors(block_colors);
+
+			// Only cETCQualityFast overrides the caller's metric with the non-perceptual one.
+			bool perceptual = m_pParams->m_perceptual;
+			if ((m_pParams->m_quality < cETCQualityMedium) && (m_pParams->m_quality == cETCQualityFast))
+				perceptual = false;
+
+			uint64_t actual_error = 0;
+			const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels;
+			for (uint32_t i = 0; i < num_pixels; i++)
+				actual_error += color_distance(perceptual, pSrc_pixels[i], block_colors[pSelectors[i]], false);
+
+			assert(actual_error == m_best_solution.m_error);
+		}
+#endif
+
+		m_pResult->m_error = m_best_solution.m_error;
+		m_pResult->m_block_color_unscaled = m_best_solution.m_coords.m_unscaled_color;
+		m_pResult->m_block_color4 = m_best_solution.m_coords.m_color4;
+		m_pResult->m_block_inten_table = m_best_solution.m_coords.m_inten_table;
+		memcpy(m_pResult->m_pSelectors, pSelectors, num_pixels);
+		m_pResult->m_n = num_pixels;
+		return true;
+	}
+
+	void etc1_optimizer::refine_solution(uint32_t max_refinement_trials)
+	{
+		// Tries to improve m_best_solution's block color while its selectors and intensity table are kept as they are.
+		//
+		// Rationale, for n pixels: pick the block color that makes the signed per-pixel errors sum to zero.
+		//   sum(pixel[i] - (block_color + inten_table[selector[i]])) = 0
+		//   sum(pixel[i]) - n*block_color - sum(inten_table[selector[i]]) = 0
+		//   block_color = sum(pixel[i])/n - sum(inten_table[selector[i]])/n
+		// So what this means:
+		//   optimal_block_color = avg_input - avg_inten_delta
+		// Unfortunately, optimal_block_color must then be quantized to 555 or 444, so it's not always possible to
+		// improve matters using this formula. It also assumes unclamped intensity deltas; the code accounts for clamping.
+		//
+		// At most 'max_refinement_trials' rounds are run. The loop ends early when every summed delta is zero or
+		// when evaluate_solution() returns false for the refined color.
+
+		const uint32_t num_pixels = m_pParams->m_num_src_pixels;
+
+		for (uint32_t trial = 0; trial < max_refinement_trials; trial++)
+		{
+			const int* pModifiers = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table];
+			const color_rgba base(m_best_solution.m_coords.get_scaled_color());
+			const uint8_t* pSel = &m_best_solution.m_selectors[0];
+
+			// Sum the delta actually applied to each pixel, taking clamping to [0, 255] into account.
+			int sum_r = 0, sum_g = 0, sum_b = 0;
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				const int yd = pModifiers[pSel[i]];
+				sum_r += clamp<int>(base.r + yd, 0, 255) - base.r;
+				sum_g += clamp<int>(base.g + yd, 0, 255) - base.g;
+				sum_b += clamp<int>(base.b + yd, 0, 255) - base.b;
+			}
+
+			if ((sum_r | sum_g | sum_b) == 0)
+				break;
+
+			const float avg_delta_r_f = static_cast<float>(sum_r) / num_pixels;
+			const float avg_delta_g_f = static_cast<float>(sum_g) / num_pixels;
+			const float avg_delta_b_f = static_cast<float>(sum_b) / num_pixels;
+
+			// Subtract the average delta from the average input color and quantize to the color lattice.
+			const int br1 = clamp<int>(static_cast<int32_t>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit);
+			const int bg1 = clamp<int>(static_cast<int32_t>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit);
+			const int bb1 = clamp<int>(static_cast<int32_t>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit);
+
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+			printf("Refinement trial %u, avg_delta %f %f %f\n", trial, avg_delta_r_f, avg_delta_g_f, avg_delta_b_f);
+#endif
+
+			// A rejected candidate ends the refinement.
+			const etc1_solution_coordinates refined(br1, bg1, bb1, 0, m_pParams->m_use_color4);
+			if (!evaluate_solution(refined, m_trial_solution, &m_best_solution))
+				break;
+		}
+	}
+
+	void etc1_optimizer::compute_internal_neighborhood(int scan_r, int scan_g, int scan_b)
+	{
+		// Scans a subset of the 3D (555 or 444) lattice centered on (scan_r, scan_g, scan_b), trying each lattice
+		// point reachable through m_pParams->m_pScan_deltas as a block color (blue outermost, red innermost).
+		// Each time a better solution is found, its block color is optionally refined via refine_solution().
+		if (m_best_solution.m_error == 0)
+			return;
+
+		const int num_deltas = m_pParams->m_scan_delta_size;
+		for (int bi = 0; bi < num_deltas; bi++)
+		{
+			const int db = m_pParams->m_pScan_deltas[bi];
+			const int cand_b = scan_b + db;
+			// The 'break' on overshoot presumably relies on the deltas being sorted ascending - TODO confirm.
+			if (cand_b < 0) continue;
+			if (cand_b > m_limit) break;
+
+			for (int gi = 0; gi < num_deltas; gi++)
+			{
+				const int dg = m_pParams->m_pScan_deltas[gi];
+				const int cand_g = scan_g + dg;
+				if (cand_g < 0) continue;
+				if (cand_g > m_limit) break;
+
+				for (int ri = 0; ri < num_deltas; ri++)
+				{
+					const int dr = m_pParams->m_pScan_deltas[ri];
+					const int cand_r = scan_r + dr;
+					if (cand_r < 0) continue;
+					if (cand_r > m_limit) break;
+
+					const etc1_solution_coordinates coords(cand_r, cand_g, cand_b, 0, m_pParams->m_use_color4);
+					if (!evaluate_solution(coords, m_trial_solution, &m_best_solution) || !m_pParams->m_refinement)
+						continue;
+
+					// The unshifted center gets 4 refinement trials (2 at cETCQualityFast); all other points get 2.
+					const bool at_center = (dr == 0) && (dg == 0) && (db == 0);
+					refine_solution(((m_pParams->m_quality != cETCQualityFast) && at_center) ? 4 : 2);
+				}
+			}
+		}
+	}
+
+	void etc1_optimizer::compute_internal_cluster_fit(uint32_t total_perms_to_try)
+	{
+		// Cluster-fit search: for each of the first 'total_perms_to_try' entries of g_cluster_fit_order_tab (a count of
+		// pixels per selector), derive the block color that would suit that selector distribution and evaluate it.
+		// NOTE(review): the average delta divides by a fixed 8, so each entry's counts presumably sum to 8 - confirm.
+		// First make sure the quantized average color (m_br, m_bg, m_bb) has been evaluated as the starting point.
+		bool seeded = m_best_solution.m_valid;
+		if (seeded)
+			seeded = (m_br == m_best_solution.m_coords.m_unscaled_color.r) && (m_bg == m_best_solution.m_coords.m_unscaled_color.g) && (m_bb == m_best_solution.m_coords.m_unscaled_color.b);
+		if (!seeded)
+			evaluate_solution(etc1_solution_coordinates(m_br, m_bg, m_bb, 0, m_pParams->m_use_color4), m_trial_solution, &m_best_solution);
+
+		if ((!m_best_solution.m_valid) || (m_best_solution.m_error == 0))
+			return;
+
+		for (uint32_t perm = 0; perm < total_perms_to_try; perm++)
+		{
+			const int* pModifiers = g_etc1_inten_tables[m_best_solution.m_coords.m_inten_table];
+			const color_rgba base(m_best_solution.m_coords.get_scaled_color());
+			const uint8_t* pCounts = g_cluster_fit_order_tab[perm].m_v;
+			// Clamped delta of each selector, weighted by how many pixels this permutation gives that selector.
+			int sum_r = 0, sum_g = 0, sum_b = 0;
+			for (uint32_t s = 0; s < 4; s++)
+			{
+				const int yd = pModifiers[s];
+				const int count = pCounts[s];
+				sum_r += count * (clamp<int>(base.r + yd, 0, 255) - base.r);
+				sum_g += count * (clamp<int>(base.g + yd, 0, 255) - base.g);
+				sum_b += count * (clamp<int>(base.b + yd, 0, 255) - base.b);
+			}
+
+			if ((sum_r | sum_g | sum_b) == 0)
+				continue;
+
+			const float avg_delta_r_f = static_cast<float>(sum_r) / 8;
+			const float avg_delta_g_f = static_cast<float>(sum_g) / 8;
+			const float avg_delta_b_f = static_cast<float>(sum_b) / 8;
+			const int br1 = clamp<int>(static_cast<int32_t>((m_avg_color[0] - avg_delta_r_f) * m_limit / 255.0f + .5f), 0, m_limit);
+			const int bg1 = clamp<int>(static_cast<int32_t>((m_avg_color[1] - avg_delta_g_f) * m_limit / 255.0f + .5f), 0, m_limit);
+			const int bb1 = clamp<int>(static_cast<int32_t>((m_avg_color[2] - avg_delta_b_f) * m_limit / 255.0f + .5f), 0, m_limit);
+
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+			printf("Second refinement trial %u, avg_delta %f %f %f\n", perm, avg_delta_r_f, avg_delta_g_f, avg_delta_b_f);
+#endif
+			evaluate_solution(etc1_solution_coordinates(br1, bg1, bb1, 0, m_pParams->m_use_color4), m_trial_solution, &m_best_solution);
+			if (m_best_solution.m_error == 0)
+				break;
+		}
+	}
+
+	void etc1_optimizer::init(const params& params, results& result)
+	{
+		// Binds the optimizer to 'params' and 'result' (only their addresses are stored), sizes the per-pixel
+		// scratch buffers, and gathers block statistics: the average color and its quantized form
+		// (m_br, m_bg, m_bb), the largest per-channel range, and each pixel's luma (r + g + b).
+		// NOTE(review): the average divides by the pixel count, so a non-zero m_num_src_pixels is presumably required.
+		m_pParams = &params;
+		m_pResult = &result;
+
+		const uint32_t num_pixels = m_pParams->m_num_src_pixels;
+		m_limit = m_pParams->m_use_color4 ? 15 : 31;
+
+		m_selectors.resize(num_pixels);
+		m_best_selectors.resize(num_pixels);
+		m_temp_selectors.resize(num_pixels);
+		m_trial_solution.m_selectors.resize(num_pixels);
+		m_best_solution.m_selectors.resize(num_pixels);
+		m_luma.resize(num_pixels);
+		m_sorted_luma_indices.resize(num_pixels);
+		m_sorted_luma.resize(num_pixels);
+
+		vec3F color_sum(0.0f);
+		int lo_r = 255, lo_g = 255, lo_b = 255;
+		int hi_r = 0, hi_g = 0, hi_b = 0;
+
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			const color_rgba& px = m_pParams->m_pSrc_pixels[i];
+
+			lo_r = basisu::minimum<int>(lo_r, px.r);
+			hi_r = basisu::maximum<int>(hi_r, px.r);
+			lo_g = basisu::minimum<int>(lo_g, px.g);
+			hi_g = basisu::maximum<int>(hi_g, px.g);
+			lo_b = basisu::minimum<int>(lo_b, px.b);
+			hi_b = basisu::maximum<int>(hi_b, px.b);
+
+			color_sum += vec3F(px.r, px.g, px.b);
+
+			m_luma[i] = static_cast<uint16_t>(px.r + px.g + px.b);
+			m_sorted_luma_indices[i] = i;
+		}
+
+		color_sum /= static_cast<float>(num_pixels);
+		m_avg_color = color_sum;
+		m_max_comp_spread = basisu::maximum(basisu::maximum(hi_r - lo_r, hi_g - lo_g), hi_b - lo_b);
+
+		// Quantize the average color to the 0..m_limit lattice (m_limit is 15 or 31), rounding to nearest.
+		m_br = clamp<int>(static_cast<uint32_t>(m_avg_color[0] * m_limit / 255.0f + .5f), 0, m_limit);
+		m_bg = clamp<int>(static_cast<uint32_t>(m_avg_color[1] * m_limit / 255.0f + .5f), 0, m_limit);
+		m_bb = clamp<int>(static_cast<uint32_t>(m_avg_color[2] * m_limit / 255.0f + .5f), 0, m_limit);
+
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+		printf("Avg block color: %u %u %u\n", m_br, m_bg, m_bb);
+#endif
+
+		if (m_pParams->m_quality == cETCQualityFast)
+		{
+			// The sorted-luma arrays are only built at cETCQualityFast; evaluate_solution_fast() reads them.
+			indirect_sort(num_pixels, &m_sorted_luma_indices[0], &m_luma[0]);
+
+			m_pSorted_luma = &m_sorted_luma[0];
+			m_pSorted_luma_indices = &m_sorted_luma_indices[0];
+			for (uint32_t i = 0; i < num_pixels; i++)
+				m_pSorted_luma[i] = m_luma[m_pSorted_luma_indices[i]];
+		}
+
+		// Start with no best solution and an empty tried-solutions filter.
+		m_best_solution.m_coords.clear();
+		m_best_solution.m_valid = false;
+		m_best_solution.m_error = UINT64_MAX;
+		clear_obj(m_solutions_tried);
+	}
+
+	// Returns false if 'coords' has probably been tried already, true if it definitely has not (it is then recorded as tried).
+	bool etc1_optimizer::check_for_redundant_solution(const etc1_solution_coordinates& coords)
+	{
+		// The key is the first 3 bytes (RGB) of the unscaled color.
+		const uint32_t key_hash = hash_hsieh((uint8_t*)&coords.m_unscaled_color.r, 3);
+
+		// Simple Bloom filter with k=2: both bit positions are carved out of the one hash value.
+		const uint32_t pos0 = key_hash & cSolutionsTriedHashMask;
+		const uint32_t pos1 = (key_hash >> cSolutionsTriedHashBits) & cSolutionsTriedHashMask;
+		const int bit0 = 1 << (pos0 & 7);
+		const int bit1 = 1 << (pos1 & 7);
+		const bool seen0 = (m_solutions_tried[pos0 >> 3] & bit0) != 0;
+		const bool seen1 = (m_solutions_tried[pos1 >> 3] & bit1) != 0;
+		if (seen0 && seen1)
+			return false;
+		m_solutions_tried[pos0 >> 3] |= bit0;
+		m_solutions_tried[pos1 >> 3] |= bit1;
+		return true;
+	}
+		
+	static uint8_t g_eval_dist_tables[8][256] =
+	{
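+		// 99% threshold. Indexed as [intensity table][max component spread]; a zero entry makes evaluate_solution_slow() skip that table at cETCQualityMedium and below.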
+		{ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,},
+		{ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,},
+		{ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,},
+		{ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,},
+		{ 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,},
+		{ 1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,1,1,1,1,1,0,1,1,1,0,1,1,0,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,},
+		{ 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,},
+		{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,}
+	};
+
+	bool etc1_optimizer::evaluate_solution_slow(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
+	{
+		// Scores 'coords' as a candidate block color. For every intensity table (subject to the pruning below) each
+		// source pixel is assigned its lowest-error block color - or its forced selector - and the errors are summed.
+		// The best table and selectors end up in 'trial_solution'. Returns true only if that result beats
+		// *pBest_solution, which is then overwritten; pBest_solution may be null, in which case false is returned.
+		if (!check_for_redundant_solution(coords))
+			return false;
+
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+		printf("Eval solution: %u %u %u\n", coords.m_unscaled_color.r, coords.m_unscaled_color.g, coords.m_unscaled_color.b);
+#endif
+
+		trial_solution.m_valid = false;
+
+		if (m_pParams->m_constrain_against_base_color5)
+		{
+			const int dr = (int)coords.m_unscaled_color.r - (int)m_pParams->m_base_color5.r;
+			const int dg = (int)coords.m_unscaled_color.g - (int)m_pParams->m_base_color5.g;
+			const int db = (int)coords.m_unscaled_color.b - (int)m_pParams->m_base_color5.b;
+
+			if ((minimum(dr, dg, db) < cETC1ColorDeltaMin) || (maximum(dr, dg, db) > cETC1ColorDeltaMax))
+			{
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+				printf("Eval failed due to constraint from %u %u %u\n", m_pParams->m_base_color5.r, m_pParams->m_base_color5.g, m_pParams->m_base_color5.b);
+#endif
+				return false;
+			}
+		}
+
+		const color_rgba base_color(coords.get_scaled_color());
+		const uint32_t n = m_pParams->m_num_src_pixels;
+		assert(trial_solution.m_selectors.size() == n);
+
+		// INT64_MAX rather than UINT64_MAX - presumably because the SSE kernels below accumulate into an int64_t (total_error is passed as int64_t*).
+		trial_solution.m_error = INT64_MAX;
+		const uint8_t *pSelectors_to_use = m_pParams->m_pForce_selectors;
+
+		for (uint32_t inten_table = 0; inten_table < cETC1IntenModifierValues; inten_table++)
+		{
+			if (m_pParams->m_quality <= cETCQualityMedium)
+			{
+				if (!g_eval_dist_tables[inten_table][m_max_comp_spread])
+					continue; // pruned: the table marks this intensity table as not worth trying for this component spread
+			}
+#if 0
+			if (m_pParams->m_quality <= cETCQualityMedium)
+			{
+				// For tables 5-7, if the max component spread falls within certain ranges, skip the inten table. Statistically they are extremely unlikely to result in lower error.
+				if (inten_table == 7)
+				{
+					if (m_max_comp_spread < 42)
+						continue;
+				}
+				else if (inten_table == 6)
+				{
+					if ((m_max_comp_spread >= 12) && (m_max_comp_spread <= 31))
+						continue;
+				}
+				else if (inten_table == 5)
+				{
+					if ((m_max_comp_spread >= 13) && (m_max_comp_spread <= 21))
+						continue;
+				}
+			}
+#endif
+
+			const int* pInten_table = g_etc1_inten_tables[inten_table];
+
+			color_rgba block_colors[4];
+			for (uint32_t s = 0; s < 4; s++)
+			{
+				const int yd = pInten_table[s];
+				block_colors[s].set(base_color.r + yd, base_color.g + yd, base_color.b + yd, 255);
+			}
+
+			uint64_t total_error = 0;
+			const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels;
+			// Scalar path; the SSE4.1 kernels in the else branch are handed the same inputs.
+			if (!g_cpu_supports_sse41)
+			{
+				for (uint32_t c = 0; c < n; c++)
+				{
+					const color_rgba& src_pixel = *pSrc_pixels++;
+					uint32_t best_selector_index = 0;
+					uint32_t best_error = 0;
+					if (pSelectors_to_use)
+					{
+						best_selector_index = pSelectors_to_use[c];
+						best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[best_selector_index], false);
+					}
+					else
+					{
+						best_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[0], false);
+
+						uint32_t trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[1], false);
+						if (trial_error < best_error)
+						{
+							best_error = trial_error;
+							best_selector_index = 1;
+						}
+
+						trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[2], false);
+						if (trial_error < best_error)
+						{
+							best_error = trial_error;
+							best_selector_index = 2;
+						}
+
+						trial_error = color_distance(m_pParams->m_perceptual, src_pixel, block_colors[3], false);
+						if (trial_error < best_error)
+						{
+							best_error = trial_error;
+							best_selector_index = 3;
+						}
+					}
+
+					m_temp_selectors[c] = static_cast<uint8_t>(best_selector_index);
+					// Early out: this table can no longer beat the best table found so far for this color.
+					total_error += best_error;
+					if (total_error >= trial_solution.m_error)
+						break;
+				}
+			}
+			else
+			{
+#if BASISU_SUPPORT_SSE
+				if (pSelectors_to_use)
+				{
+					if (m_pParams->m_perceptual)
+						perceptual_distance_rgb_4_N_sse41((int64_t*)&total_error, pSelectors_to_use, block_colors, pSrc_pixels, n, trial_solution.m_error);
+					else
+						linear_distance_rgb_4_N_sse41((int64_t*)&total_error, pSelectors_to_use, block_colors, pSrc_pixels, n, trial_solution.m_error);
+				}
+				else
+				{
+					if (m_pParams->m_perceptual)
+						find_selectors_perceptual_rgb_4_N_sse41((int64_t*)&total_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, trial_solution.m_error);
+					else
+						find_selectors_linear_rgb_4_N_sse41((int64_t*)&total_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, trial_solution.m_error);
+				}
+#endif
+			}
+			// Keep this table if it is the best seen so far for this color; the selector buffers are swapped rather than copied.
+			if (total_error < trial_solution.m_error)
+			{
+				trial_solution.m_error = total_error;
+				trial_solution.m_coords.m_inten_table = inten_table;
+				trial_solution.m_selectors.swap(m_temp_selectors);
+				trial_solution.m_valid = true;
+			}
+		}
+		trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
+		trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
+
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+		if (pBest_solution) printf("Eval done: %u error: %llu best error so far: %llu\n", (uint32_t)(trial_solution.m_error < pBest_solution->m_error), (unsigned long long)trial_solution.m_error, (unsigned long long)pBest_solution->m_error);
+#endif
+
+		bool success = false;
+		if (pBest_solution)
+		{
+			if (trial_solution.m_error < pBest_solution->m_error)
+			{
+				*pBest_solution = trial_solution;
+				success = true;
+			}
+		}
+
+		return success;
+	}
+
+	bool etc1_optimizer::evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
+	{
+		if (!check_for_redundant_solution(coords))
+			return false;
+
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+		printf("Eval solution fast: %u %u %u\n", coords.m_unscaled_color.r, coords.m_unscaled_color.g, coords.m_unscaled_color.b);
+#endif
+
+		if (m_pParams->m_constrain_against_base_color5)
+		{
+			const int dr = (int)coords.m_unscaled_color.r - (int)m_pParams->m_base_color5.r;
+			const int dg = (int)coords.m_unscaled_color.g - (int)m_pParams->m_base_color5.g;
+			const int db = (int)coords.m_unscaled_color.b - (int)m_pParams->m_base_color5.b;
+
+			if ((minimum(dr, dg, db) < cETC1ColorDeltaMin) || (maximum(dr, dg, db) > cETC1ColorDeltaMax))
+			{
+				trial_solution.m_valid = false;
+
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+				printf("Eval failed due to constraint from %u %u %u\n", m_pParams->m_base_color5.r, m_pParams->m_base_color5.g, m_pParams->m_base_color5.b);
+#endif
+				return false;
+			}
+		}
+
+		const color_rgba base_color(coords.get_scaled_color());
+		
+		const uint32_t n = m_pParams->m_num_src_pixels;
+		assert(trial_solution.m_selectors.size() == n);
+
+		trial_solution.m_error = UINT64_MAX;
+								
+		const bool perceptual = (m_pParams->m_quality == cETCQualityFast) ? false : m_pParams->m_perceptual;
+				
+		for (int inten_table = cETC1IntenModifierValues - 1; inten_table >= 0; --inten_table)
+		{
+			const int* pInten_table = g_etc1_inten_tables[inten_table];
+
+			uint32_t block_inten[4];
+			color_rgba block_colors[4];
+			for (uint32_t s = 0; s < 4; s++)
+			{
+				const int yd = pInten_table[s];
+				color_rgba block_color(base_color.r + yd, base_color.g + yd, base_color.b + yd, 255);
+				block_colors[s] = block_color;
+				block_inten[s] = block_color.r + block_color.g + block_color.b;
+			}
+
+			// evaluate_solution_fast() enforces/assumes a total ordering of the input colors along the intensity (1,1,1) axis to more quickly classify the inputs to selectors.
+			// The inputs colors have been presorted along the projection onto this axis, and ETC1 block colors are always ordered along the intensity axis, so this classification is fast.
+			// 0   1   2   3
+			//   01  12  23
+			const uint32_t block_inten_midpoints[3] = { block_inten[0] + block_inten[1], block_inten[1] + block_inten[2], block_inten[2] + block_inten[3] };
+															
+			uint64_t total_error = 0;
+			const color_rgba* pSrc_pixels = m_pParams->m_pSrc_pixels;
+						
+			if (perceptual)
+			{
+				if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
+				{
+					if (block_inten[0] > m_pSorted_luma[n - 1])
+					{
+						const uint32_t min_error = iabs((int)block_inten[0] - (int)m_pSorted_luma[n - 1]);
+						if (min_error >= trial_solution.m_error)
+							continue;
+					}
+
+					memset(&m_temp_selectors[0], 0, n);
+
+					for (uint32_t c = 0; c < n; c++)
+						total_error += color_distance(true, block_colors[0], pSrc_pixels[c], false);
+				}
+				else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
+				{
+					if (m_pSorted_luma[0] > block_inten[3])
+					{
+						const uint32_t min_error = iabs((int)m_pSorted_luma[0] - (int)block_inten[3]);
+						if (min_error >= trial_solution.m_error)
+							continue;
+					}
+
+					memset(&m_temp_selectors[0], 3, n);
+
+					for (uint32_t c = 0; c < n; c++)
+						total_error += color_distance(true, block_colors[3], pSrc_pixels[c], false);
+				}
+				else
+				{
+					if (!g_cpu_supports_sse41)
+					{
+						uint32_t cur_selector = 0, c;
+						for (c = 0; c < n; c++)
+						{
+							const uint32_t y = m_pSorted_luma[c];
+							while ((y * 2) >= block_inten_midpoints[cur_selector])
+								if (++cur_selector > 2)
+									goto done;
+							const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+							m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector);
+							total_error += color_distance(true, block_colors[cur_selector], pSrc_pixels[sorted_pixel_index], false);
+						}
+					done:
+						while (c < n)
+						{
+							const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+							m_temp_selectors[sorted_pixel_index] = 3;
+							total_error += color_distance(true, block_colors[3], pSrc_pixels[sorted_pixel_index], false);
+							++c;
+						}
+					}
+					else
+					{
+#if BASISU_SUPPORT_SSE
+						uint32_t cur_selector = 0, c;
+
+						for (c = 0; c < n; c++)
+						{
+							const uint32_t y = m_pSorted_luma[c];
+							while ((y * 2) >= block_inten_midpoints[cur_selector])
+							{
+								if (++cur_selector > 2)
+									goto done3;
+							}
+							const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+							m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector);
+						}
+					done3:
+
+						while (c < n)
+						{
+							const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+							m_temp_selectors[sorted_pixel_index] = 3;
+							++c;
+						}
+
+						int64_t block_error;
+						perceptual_distance_rgb_4_N_sse41(&block_error, &m_temp_selectors[0], block_colors, pSrc_pixels, n, INT64_MAX);
+						total_error += block_error;
+#endif
+					}
+				}
+			}
+			else
+			{
+				if ((m_pSorted_luma[n - 1] * 2) < block_inten_midpoints[0])
+				{
+					if (block_inten[0] > m_pSorted_luma[n - 1])
+					{
+						const uint32_t min_error = iabs((int)block_inten[0] - (int)m_pSorted_luma[n - 1]);
+						if (min_error >= trial_solution.m_error)
+							continue;
+					}
+
+					memset(&m_temp_selectors[0], 0, n);
+
+					for (uint32_t c = 0; c < n; c++)
+						total_error += color_distance(block_colors[0], pSrc_pixels[c], false);
+				}
+				else if ((m_pSorted_luma[0] * 2) >= block_inten_midpoints[2])
+				{
+					if (m_pSorted_luma[0] > block_inten[3])
+					{
+						const uint32_t min_error = iabs((int)m_pSorted_luma[0] - (int)block_inten[3]);
+						if (min_error >= trial_solution.m_error)
+							continue;
+					}
+
+					memset(&m_temp_selectors[0], 3, n);
+
+					for (uint32_t c = 0; c < n; c++)
+						total_error += color_distance(block_colors[3], pSrc_pixels[c], false);
+				}
+				else
+				{
+					uint32_t cur_selector = 0, c;
+					for (c = 0; c < n; c++)
+					{
+						const uint32_t y = m_pSorted_luma[c];
+						while ((y * 2) >= block_inten_midpoints[cur_selector])
+							if (++cur_selector > 2)
+								goto done2;
+						const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+						m_temp_selectors[sorted_pixel_index] = static_cast<uint8_t>(cur_selector);
+						total_error += color_distance(block_colors[cur_selector], pSrc_pixels[sorted_pixel_index], false);
+					}
+				done2:
+					while (c < n)
+					{
+						const uint32_t sorted_pixel_index = m_pSorted_luma_indices[c];
+						m_temp_selectors[sorted_pixel_index] = 3;
+						total_error += color_distance(block_colors[3], pSrc_pixels[sorted_pixel_index], false);
+						++c;
+					}
+				}
+			}
+
+			if (total_error < trial_solution.m_error)
+			{
+				trial_solution.m_error = total_error;
+				trial_solution.m_coords.m_inten_table = inten_table;
+				trial_solution.m_selectors.swap(m_temp_selectors);
+				trial_solution.m_valid = true;
+				if (!total_error)
+					break;
+			}
+		}
+		trial_solution.m_coords.m_unscaled_color = coords.m_unscaled_color;
+		trial_solution.m_coords.m_color4 = m_pParams->m_use_color4;
+
+#if BASISU_DEBUG_ETC_ENCODER_DEEPER
+		printf("Eval done: %u error: %I64u best error so far: %I64u\n", (trial_solution.m_error < pBest_solution->m_error), trial_solution.m_error, pBest_solution->m_error);
+#endif
+
+		bool success = false;
+		if (pBest_solution)
+		{
+			if (trial_solution.m_error < pBest_solution->m_error)
+			{
+				*pBest_solution = trial_solution;
+				success = true;
+			}
+		}
+
+		return success;
+	}
+
+	// Fits ETC2 EAC A8 parameters (base, multiplier, table, selectors) to num_pixels 8-bit alpha values and returns the sum of squared errors.
+	// Every table whose bit is set in table_mask is tried, with bases within base_search_rad and multipliers within mul_search_rad of an initial estimate.
+	// If table_mask enables none of the 16 tables (and alpha is not constant), UINT64_MAX is returned and base/multiplier/table are left untouched.
+	uint64_t pack_eac_a8(pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask)
+	{
+		results.m_selectors.resize(num_pixels);
+		results.m_selectors_temp.resize(num_pixels);
+
+		uint32_t min_alpha = 255, max_alpha = 0;
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			const uint32_t a = pPixels[i];
+			if (a < min_alpha) min_alpha = a;
+			if (a > max_alpha) max_alpha = a;
+		}
+
+		// Constant alpha: store it as the base with a fixed table/multiplier/selector and report zero error (table_mask is not consulted here).
+		if (min_alpha == max_alpha)
+		{
+			results.m_base = min_alpha;
+			results.m_table = 13;
+			results.m_multiplier = 1;
+			for (uint32_t i = 0; i < num_pixels; i++)
+				results.m_selectors[i] = 4;
+			return 0;
+		}
+
+		const uint32_t alpha_range = max_alpha - min_alpha;
+		// Signed radii capped at 255 (anything larger already spans the whole clamped range), so the +/- below can neither wrap nor overflow.
+		const int base_rad = (int)minimum<uint32_t>(base_search_rad, 255U);
+		const int mul_rad = (int)minimum<uint32_t>(mul_search_rad, 255U);
+		uint64_t best_err = UINT64_MAX;
+
+		for (uint32_t table = 0; table < 16; table++)
+		{
+			if ((table_mask & (1U << table)) == 0)
+				continue;
+
+			const float range = (float)(g_etc2_eac_tables[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_etc2_eac_tables[table][ETC2_EAC_MIN_VALUE_SELECTOR]);
+			const int center = (int)roundf(lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_etc2_eac_tables[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range));
+			const int mul = (int)roundf(alpha_range / range);
+			const int base_min = clamp255(center - base_rad);
+			const int base_max = clamp255(center + base_rad);
+			const int mul_low = clamp<int>(mul - mul_rad, 1, 15);
+			const int mul_high = clamp<int>(mul + mul_rad, 1, 15);
+
+			for (int base = base_min; base <= base_max; base++)
+			{
+				for (int multiplier = mul_low; multiplier <= mul_high; multiplier++)
+				{
+					// The 8 decodable values depend only on (table, base, multiplier), so compute them once rather than per pixel.
+					int decoded[8];
+					for (uint32_t s = 0; s < 8; s++)
+						decoded[s] = clamp255(multiplier * g_etc2_eac_tables[table][s] + base);
+
+					uint64_t total_err = 0;
+					for (uint32_t i = 0; i < num_pixels; i++)
+					{
+						const int a = pPixels[i];
+						uint32_t best_s_err = UINT32_MAX;
+						uint32_t best_s = 0;
+						for (uint32_t s = 0; s < 8; s++)
+						{
+							const uint32_t err = iabs(a - decoded[s]);
+							if (err < best_s_err)
+							{
+								best_s_err = err;
+								best_s = s;
+							}
+						}
+						results.m_selectors_temp[i] = static_cast<uint8_t>(best_s);
+						total_err += best_s_err * best_s_err;
+						if (total_err >= best_err)
+							break; // this candidate can no longer beat the best one found so far
+					}
+
+					if (total_err < best_err)
+					{
+						best_err = total_err;
+						results.m_base = base;
+						results.m_multiplier = multiplier;
+						results.m_table = table;
+						results.m_selectors.swap(results.m_selectors_temp);
+						if (!best_err)
+							return best_err;
+					}
+				} // multiplier
+			} // base
+		} // table
+
+		return best_err;
+	}
+
+	void pack_eac_a8(eac_a8_block* pBlock, const uint8_t* pPixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask)
+	{
+		// Fit the 16 alpha values of one 4x4 block (indexed x + y * 4), then copy the winning parameters into pBlock.
+		pack_eac_a8_results packed;
+		pack_eac_a8(packed, pPixels, 16, base_search_rad, mul_search_rad, table_mask);
+
+		pBlock->m_base = packed.m_base;
+		pBlock->m_multiplier = packed.m_multiplier;
+		pBlock->m_table = packed.m_table;
+		for (uint32_t i = 0; i < 16; i++)
+			pBlock->set_selector(i & 3, i >> 2, packed.m_selectors[i]);
+	}
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_etc.h b/thirdparty/basis_universal/encoder/basisu_etc.h
new file mode 100644
index 0000000000..1e3ece43b8
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_etc.h
@@ -0,0 +1,1152 @@
+// basisu_etc.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu.h"
+#include "basisu_enc.h"
+
+namespace basisu
+{
+	enum etc_constants // Sizes, counts and bit offsets of the ETC1 block layout accessed through etc_block.
+	{
+		cETC1BytesPerBlock = 8U, // encoded block size in bytes
+
+		cETC1SelectorBits = 2U, // bits per pixel selector
+		cETC1SelectorValues = 1U << cETC1SelectorBits, // 4 selector values
+		cETC1SelectorMask = cETC1SelectorValues - 1U,
+
+		cETC1BlockShift = 2U,
+		cETC1BlockSize = 1U << cETC1BlockShift, // 4 (pixels along one block edge)
+
+		cETC1LSBSelectorIndicesBitOffset = 0, // low selector bits occupy block bits 0-15
+		cETC1MSBSelectorIndicesBitOffset = 16, // high selector bits occupy block bits 16-31
+
+		cETC1FlipBitOffset = 32,
+		cETC1DiffBitOffset = 33,
+
+		cETC1IntenModifierNumBits = 3,
+		cETC1IntenModifierValues = 1 << cETC1IntenModifierNumBits, // 8 intensity tables
+		cETC1RightIntenModifierTableBitOffset = 34, // subblock 1 (right/bottom)
+		cETC1LeftIntenModifierTableBitOffset = 37, // subblock 0 (left/top)
+
+		// Base+Delta encoding (5 bit bases, 3 bit delta)
+		cETC1BaseColorCompNumBits = 5,
+		cETC1BaseColorCompMax = 1 << cETC1BaseColorCompNumBits, // 32: number of 5-bit values, i.e. one past the largest
+
+		cETC1DeltaColorCompNumBits = 3,
+		cETC1DeltaColorComp = 1 << cETC1DeltaColorCompNumBits, // same value (8) as cETC1DeltaColorCompMax
+		cETC1DeltaColorCompMax = 1 << cETC1DeltaColorCompNumBits,
+
+		cETC1BaseColor5RBitOffset = 59,
+		cETC1BaseColor5GBitOffset = 51,
+		cETC1BaseColor5BBitOffset = 43,
+
+		cETC1DeltaColor3RBitOffset = 56,
+		cETC1DeltaColor3GBitOffset = 48,
+		cETC1DeltaColor3BBitOffset = 40,
+
+		// Absolute (non-delta) encoding (two 4-bit per component bases)
+		cETC1AbsColorCompNumBits = 4,
+		cETC1AbsColorCompMax = 1 << cETC1AbsColorCompNumBits, // 16: number of 4-bit values
+
+		cETC1AbsColor4R1BitOffset = 60, // first base color (subblock 0)
+		cETC1AbsColor4G1BitOffset = 52,
+		cETC1AbsColor4B1BitOffset = 44,
+
+		cETC1AbsColor4R2BitOffset = 56, // second base color (subblock 1)
+		cETC1AbsColor4G2BitOffset = 48,
+		cETC1AbsColor4B2BitOffset = 40,
+
+		cETC1ColorDeltaMin = -4, // legal per-channel delta range is [-4, 3]
+		cETC1ColorDeltaMax = 3,
+
+		// Delta3 (rows: raw field value, its 3-bit pattern, the signed delta it encodes):
+		// 0   1   2   3   4   5   6   7
+		// 000 001 010 011 100 101 110 111
+		// 0   1   2   3   -4  -3  -2  -1
+	};
+	
+	extern const int g_etc1_inten_tables[cETC1IntenModifierValues][cETC1SelectorValues]; // [intensity table][selector] -> signed offset added to each of R, G and B
+	extern const uint8_t g_etc1_to_selector_index[cETC1SelectorValues]; // raw ETC1 selector code -> index into a g_etc1_inten_tables row (see etc_block::get_selector)
+	extern const uint8_t g_selector_index_to_etc1[cETC1SelectorValues]; // opposite direction: table index -> raw ETC1 selector code (see etc_block::set_selector)
+
+	struct etc_coord2 // An (x, y) pair; element type of g_etc1_pixel_coords below.
+	{
+		uint8_t m_x, m_y;
+	};
+	extern const etc_coord2 g_etc1_pixel_coords[2][2][8]; // [flipped][subblock][subblock_pixel]
+	extern const uint32_t g_etc1_pixel_indices[2][2][8]; // [flipped][subblock][subblock_pixel]
+
+	struct etc_block // One 8-byte ETC1 block; fields are addressed by bit offset into a big-endian 64-bit value (see etc_constants).
+	{
+		// big endian uint64:
+		// bit ofs:  56  48  40  32  24  16   8   0
+		// byte ofs: b0, b1, b2, b3, b4, b5, b6, b7
+		union
+		{
+			uint64_t m_uint64; // the same 8 bytes as one integer; always accessed through read_be64()/write_be64()
+
+			uint8_t m_bytes[8]; // m_bytes[0] holds bits 56-63, m_bytes[7] holds bits 0-7
+		};
+
+		inline void clear() // Resets the block through clear_obj() (helper defined elsewhere; presumably zero-fills).
+		{
+			assert(sizeof(*this) == 8);
+			clear_obj(*this);
+		}
+
+		inline uint64_t get_all_bits() const // Returns the 8 bytes interpreted as one big-endian 64-bit value.
+		{
+			return read_be64(&m_uint64);
+		}
+
+		inline uint32_t get_general_bits(uint32_t ofs, uint32_t num) const // Reads 'num' (1-31) bits starting at bit 'ofs'; the field may cross byte boundaries.
+		{
+			assert((ofs + num) <= 64U);
+			assert(num && (num < 32U));
+			return (uint32_t)(read_be64(&m_uint64) >> ofs) & ((1UL << num) - 1UL);
+		}
+
+		inline void set_general_bits(uint32_t ofs, uint32_t num, uint32_t bits) // Overwrites 'num' (1-31) bits starting at bit 'ofs'; 'bits' is not masked, so it must fit in 'num' bits.
+		{
+			assert((ofs + num) <= 64U);
+			assert(num && (num < 32U));
+
+			uint64_t x = read_be64(&m_uint64);
+			uint64_t msk = ((1ULL << static_cast<uint64_t>(num)) - 1ULL) << static_cast<uint64_t>(ofs);
+			x &= ~msk;
+			x |= (static_cast<uint64_t>(bits) << static_cast<uint64_t>(ofs));
+			write_be64(&m_uint64, x);
+		}
+
+		inline uint32_t get_byte_bits(uint32_t ofs, uint32_t num) const // Reads 'num' (1-8) bits at bit 'ofs'; the field must not straddle a byte boundary.
+		{
+			assert((ofs + num) <= 64U);
+			assert(num && (num <= 8U));
+			assert((ofs >> 3) == ((ofs + num - 1) >> 3));
+			const uint32_t byte_ofs = 7 - (ofs >> 3); // bit offsets count from the last byte: bits 0-7 live in m_bytes[7]
+			const uint32_t byte_bit_ofs = ofs & 7;
+			return (m_bytes[byte_ofs] >> byte_bit_ofs) & ((1 << num) - 1);
+		}
+
+		inline void set_byte_bits(uint32_t ofs, uint32_t num, uint32_t bits) // Writes 'num' bits at bit 'ofs'; the field must sit inside one byte and 'bits' must fit in 'num' bits.
+		{
+			assert((ofs + num) <= 64U);
+			assert(num && (num < 32U)); // looser than get_byte_bits(), but the same-byte assert below still limits num to 8
+			assert((ofs >> 3) == ((ofs + num - 1) >> 3));
+			assert(bits < (1U << num));
+			const uint32_t byte_ofs = 7 - (ofs >> 3);
+			const uint32_t byte_bit_ofs = ofs & 7;
+			const uint32_t mask = (1 << num) - 1;
+			m_bytes[byte_ofs] &= ~(mask << byte_bit_ofs);
+			m_bytes[byte_ofs] |= (bits << byte_bit_ofs);
+		}
+
+		// false = left/right subblocks
+		// true = upper/lower subblocks
+		inline bool get_flip_bit() const
+		{
+			return (m_bytes[3] & 1) != 0; // block bit 32 (cETC1FlipBitOffset)
+		}
+
+		inline void set_flip_bit(bool flip)
+		{
+			m_bytes[3] &= ~1;
+			m_bytes[3] |= static_cast<uint8_t>(flip);
+		}
+
+		inline bool get_diff_bit() const // true = base5 color + delta3 (differential), false = two independent base4 colors
+		{
+			return (m_bytes[3] & 2) != 0; // block bit 33 (cETC1DiffBitOffset)
+		}
+
+		inline void set_diff_bit(bool diff)
+		{
+			m_bytes[3] &= ~2;
+			m_bytes[3] |= (static_cast<uint32_t>(diff) << 1);
+		}
+
+		// Returns intensity modifier table (0-7) used by subblock subblock_id.
+		// subblock_id=0 left/top (CW 1), 1=right/bottom (CW 2)
+		inline uint32_t get_inten_table(uint32_t subblock_id) const
+		{
+			assert(subblock_id < 2);
+			const uint32_t ofs = subblock_id ? 2 : 5; // block bits 34-36 (subblock 1) or 37-39 (subblock 0)
+			return (m_bytes[3] >> ofs) & 7;
+		}
+
+		// Sets intensity modifier table (0-7) used by subblock subblock_id (0 or 1)
+		inline void set_inten_table(uint32_t subblock_id, uint32_t t)
+		{
+			assert(subblock_id < 2);
+			assert(t < 8);
+			const uint32_t ofs = subblock_id ? 2 : 5;
+			m_bytes[3] &= ~(7 << ofs);
+			m_bytes[3] |= (t << ofs);
+		}
+
+		inline void set_inten_tables_etc1s(uint32_t t) // Gives both subblocks the same intensity table.
+		{
+			set_inten_table(0, t);
+			set_inten_table(1, t);
+		}
+
+		inline bool is_etc1s() const // True when both subblocks use the same intensity table and the same base color.
+		{
+			if (get_inten_table(0) != get_inten_table(1))
+				return false;
+
+			if (get_diff_bit())
+			{
+				if (get_delta3_color() != 0) // differential mode: any non-zero delta makes the two base colors differ
+					return false;
+			}
+			else
+			{
+				if (get_base4_color(0) != get_base4_color(1))
+					return false;
+			}
+
+			return true;
+		}
+
+		// Returned encoded selector value ranges from 0-3 (this is NOT a direct index into g_etc1_inten_tables, see get_selector())
+		inline uint32_t get_raw_selector(uint32_t x, uint32_t y) const
+		{
+			assert((x | y) < 4);
+
+			const uint32_t bit_index = x * 4 + y; // selectors are stored in column-major pixel order
+			const uint32_t byte_bit_ofs = bit_index & 7;
+			const uint8_t *p = &m_bytes[7 - (bit_index >> 3)];
+			const uint32_t lsb = (p[0] >> byte_bit_ofs) & 1; // low bit plane: block bits 0-15
+			const uint32_t msb = (p[-2] >> byte_bit_ofs) & 1; // high bit plane: block bits 16-31, i.e. two bytes earlier in m_bytes
+			const uint32_t val = lsb | (msb << 1);
+
+			return val;
+		}
+
+		// Returned selector value ranges from 0-3 and is a direct index into g_etc1_inten_tables.
+		inline uint32_t get_selector(uint32_t x, uint32_t y) const
+		{
+			return g_etc1_to_selector_index[get_raw_selector(x, y)];
+		}
+
+		// Selector "val" ranges from 0-3 and is a direct index into g_etc1_inten_tables.
+		inline void set_selector(uint32_t x, uint32_t y, uint32_t val)
+		{
+			assert((x | y | val) < 4);
+			const uint32_t bit_index = x * 4 + y;
+
+			uint8_t *p = &m_bytes[7 - (bit_index >> 3)];
+
+			const uint32_t byte_bit_ofs = bit_index & 7;
+			const uint32_t mask = 1 << byte_bit_ofs;
+
+			const uint32_t etc1_val = g_selector_index_to_etc1[val]; // translate the table index into the raw ETC1 selector code
+
+			const uint32_t lsb = etc1_val & 1;
+			const uint32_t msb = etc1_val >> 1;
+
+			p[0] &= ~mask;
+			p[0] |= (lsb << byte_bit_ofs);
+
+			p[-2] &= ~mask;
+			p[-2] |= (msb << byte_bit_ofs);
+		}
+
+		// Selector "etc1_val" ranges from 0-3 and is a direct (raw) ETC1 selector.
+		inline void set_raw_selector(uint32_t x, uint32_t y, uint32_t etc1_val)
+		{
+			assert((x | y | etc1_val) < 4);
+			const uint32_t bit_index = x * 4 + y;
+
+			uint8_t* p = &m_bytes[7 - (bit_index >> 3)];
+
+			const uint32_t byte_bit_ofs = bit_index & 7;
+			const uint32_t mask = 1 << byte_bit_ofs;
+						
+			const uint32_t lsb = etc1_val & 1;
+			const uint32_t msb = etc1_val >> 1;
+
+			p[0] &= ~mask;
+			p[0] |= (lsb << byte_bit_ofs);
+
+			p[-2] &= ~mask;
+			p[-2] |= (msb << byte_bit_ofs);
+		}
+
+		inline uint32_t get_raw_selector_bits() const // All 32 selector bits, with m_bytes[4] as the least significant byte of the result.
+		{
+			return m_bytes[4] | (m_bytes[5] << 8) | (m_bytes[6] << 16) | (m_bytes[7] << 24);
+		}
+
+		inline void set_raw_selector_bits(uint32_t bits) // Inverse of get_raw_selector_bits().
+		{
+			m_bytes[4] = static_cast<uint8_t>(bits);
+			m_bytes[5] = static_cast<uint8_t>(bits >> 8);
+			m_bytes[6] = static_cast<uint8_t>(bits >> 16);
+			m_bytes[7] = static_cast<uint8_t>(bits >> 24);
+		}
+
+		inline void set_raw_selector_bits(uint8_t byte0, uint8_t byte1, uint8_t byte2, uint8_t byte3) // Same, with the bytes given in m_bytes[4], [5], [6], [7] order.
+		{
+			m_bytes[4] = byte0;
+			m_bytes[5] = byte1;
+			m_bytes[6] = byte2;
+			m_bytes[7] = byte3;
+		}
+
+		inline void set_base4_color(uint32_t idx, uint16_t c) // 'c' is packed 4:4:4 (R in bits 8-11, G in bits 4-7, B in bits 0-3); idx picks the first (0) or second (non-zero) base color.
+		{
+			if (idx)
+			{
+				set_byte_bits(cETC1AbsColor4R2BitOffset, 4, (c >> 8) & 15);
+				set_byte_bits(cETC1AbsColor4G2BitOffset, 4, (c >> 4) & 15);
+				set_byte_bits(cETC1AbsColor4B2BitOffset, 4, c & 15);
+			}
+			else
+			{
+				set_byte_bits(cETC1AbsColor4R1BitOffset, 4, (c >> 8) & 15);
+				set_byte_bits(cETC1AbsColor4G1BitOffset, 4, (c >> 4) & 15);
+				set_byte_bits(cETC1AbsColor4B1BitOffset, 4, c & 15);
+			}
+		}
+
+		inline uint16_t get_base4_color(uint32_t idx) const // Returns base color idx in the same 4:4:4 packing used by set_base4_color().
+		{
+			uint32_t r, g, b;
+			if (idx)
+			{
+				r = get_byte_bits(cETC1AbsColor4R2BitOffset, 4);
+				g = get_byte_bits(cETC1AbsColor4G2BitOffset, 4);
+				b = get_byte_bits(cETC1AbsColor4B2BitOffset, 4);
+			}
+			else
+			{
+				r = get_byte_bits(cETC1AbsColor4R1BitOffset, 4);
+				g = get_byte_bits(cETC1AbsColor4G1BitOffset, 4);
+				b = get_byte_bits(cETC1AbsColor4B1BitOffset, 4);
+			}
+			return static_cast<uint16_t>(b | (g << 4U) | (r << 8U));
+		}
+
+		inline void set_base5_color(uint16_t c) // 'c' is packed 5:5:5 (R in bits 10-14, G in bits 5-9, B in bits 0-4).
+		{
+			set_byte_bits(cETC1BaseColor5RBitOffset, 5, (c >> 10) & 31);
+			set_byte_bits(cETC1BaseColor5GBitOffset, 5, (c >> 5) & 31);
+			set_byte_bits(cETC1BaseColor5BBitOffset, 5, c & 31);
+		}
+
+		inline uint16_t get_base5_color() const // Returns the base color in the same 5:5:5 packing used by set_base5_color().
+		{
+			const uint32_t r = get_byte_bits(cETC1BaseColor5RBitOffset, 5);
+			const uint32_t g = get_byte_bits(cETC1BaseColor5GBitOffset, 5);
+			const uint32_t b = get_byte_bits(cETC1BaseColor5BBitOffset, 5);
+			return static_cast<uint16_t>(b | (g << 5U) | (r << 10U));
+		}
+
+		void set_delta3_color(uint16_t c) // 'c' is packed 3:3:3 (R in bits 6-8, G in bits 3-5, B in bits 0-2); each field is a raw 3-bit code (see the Delta3 table in etc_constants).
+		{
+			set_byte_bits(cETC1DeltaColor3RBitOffset, 3, (c >> 6) & 7);
+			set_byte_bits(cETC1DeltaColor3GBitOffset, 3, (c >> 3) & 7);
+			set_byte_bits(cETC1DeltaColor3BBitOffset, 3, c & 7);
+		}
+
+		inline uint16_t get_delta3_color() const // Returns the delta in the same 3:3:3 packing used by set_delta3_color().
+		{
+			const uint32_t r = get_byte_bits(cETC1DeltaColor3RBitOffset, 3);
+			const uint32_t g = get_byte_bits(cETC1DeltaColor3GBitOffset, 3);
+			const uint32_t b = get_byte_bits(cETC1DeltaColor3BBitOffset, 3);
+			return static_cast<uint16_t>(b | (g << 3U) | (r << 6U));
+		}
+
+		uint64_t determine_selectors(const color_rgba* pSource_pixels, bool perceptual, uint32_t begin_subblock = 0, uint32_t end_subblock = 2) // For every pixel of subblocks [begin_subblock, end_subblock), stores the selector with the lowest color_distance() and returns the summed error; pSource_pixels is indexed x + y * 4.
+		{
+			uint64_t total_error = 0;
+
+			for (uint32_t subblock = begin_subblock; subblock < end_subblock; subblock++)
+			{
+				color_rgba block_colors[4];
+				get_block_colors(block_colors, subblock);
+
+				if (get_flip_bit()) // flipped: the subblock covers all 4 columns of rows subblock*2 and subblock*2+1
+				{
+					for (uint32_t y = 0; y < 2; y++)
+					{
+						for (uint32_t x = 0; x < 4; x++)
+						{
+							uint32_t best_selector = 0;
+							uint64_t best_error = UINT64_MAX;
+
+							for (uint32_t s = 0; s < 4; s++)
+							{
+								uint64_t err = color_distance(perceptual, block_colors[s], pSource_pixels[x + (subblock * 2 + y) * 4], false);
+								if (err < best_error)
+								{
+									best_error = err;
+									best_selector = s;
+								}
+							}
+
+							set_selector(x, subblock * 2 + y, best_selector);
+
+							total_error += best_error;
+						}
+					}
+				}
+				else // not flipped: the subblock covers all 4 rows of columns subblock*2 and subblock*2+1
+				{
+					for (uint32_t y = 0; y < 4; y++)
+					{
+						for (uint32_t x = 0; x < 2; x++)
+						{
+							uint32_t best_selector = 0;
+							uint64_t best_error = UINT64_MAX;
+
+							for (uint32_t s = 0; s < 4; s++)
+							{
+								uint64_t err = color_distance(perceptual, block_colors[s], pSource_pixels[(subblock * 2) + x + y * 4], false);
+								if (err < best_error)
+								{
+									best_error = err;
+									best_selector = s;
+								}
+							}
+
+							set_selector(subblock * 2 + x, y, best_selector);
+
+							total_error += best_error;
+						}
+					}
+				}
+			}
+
+			return total_error;
+		}
+
+		color_rgba get_block_color(uint32_t subblock_index, bool scaled) const // Base color of the given subblock, without any intensity modifier; 'scaled' is forwarded to the unpack helpers.
+		{
+			color_rgba b;
+
+			if (get_diff_bit())
+			{
+				if (subblock_index)
+					unpack_color5(b, get_base5_color(), get_delta3_color(), scaled);
+				else
+					unpack_color5(b, get_base5_color(), scaled);
+			}
+			else
+			{
+				b = unpack_color4(get_base4_color(subblock_index), scaled);
+			}
+
+			return b;
+		}
+
+		uint32_t get_subblock_index(uint32_t x, uint32_t y) const // 0 or 1: the subblock that pixel (x, y) belongs to under the current flip bit.
+		{
+			if (get_flip_bit())
+				return y >= 2;
+			else
+				return x >= 2;
+		}
+
+		bool get_block_colors(color_rgba* pBlock_colors, uint32_t subblock_index) const // Writes the subblock's 4 selectable colors (alpha 255) to pBlock_colors[0..3]; returns true if any component had to be clamped to 0-255.
+		{
+			color_rgba b;
+
+			if (get_diff_bit())
+			{
+				if (subblock_index)
+					unpack_color5(b, get_base5_color(), get_delta3_color(), true);
+				else
+					unpack_color5(b, get_base5_color(), true);
+			}
+			else
+			{
+				b = unpack_color4(get_base4_color(subblock_index), true);
+			}
+
+			const int* pInten_table = g_etc1_inten_tables[get_inten_table(subblock_index)];
+
+			bool dc = false; // "did clamp": set by clamp255() whenever a component falls outside 0-255
+
+			pBlock_colors[0].set(clamp255(b.r + pInten_table[0], dc), clamp255(b.g + pInten_table[0], dc), clamp255(b.b + pInten_table[0], dc), 255);
+			pBlock_colors[1].set(clamp255(b.r + pInten_table[1], dc), clamp255(b.g + pInten_table[1], dc), clamp255(b.b + pInten_table[1], dc), 255);
+			pBlock_colors[2].set(clamp255(b.r + pInten_table[2], dc), clamp255(b.g + pInten_table[2], dc), clamp255(b.b + pInten_table[2], dc), 255);
+			pBlock_colors[3].set(clamp255(b.r + pInten_table[3], dc), clamp255(b.g + pInten_table[3], dc), clamp255(b.b + pInten_table[3], dc), 255);
+
+			return dc;
+		}
+
+		void get_block_color(color_rgba& color, uint32_t subblock_index, uint32_t selector_index) const // Like get_block_colors(), but computes only the color for selector_index.
+		{
+			color_rgba b;
+
+			if (get_diff_bit())
+			{
+				if (subblock_index)
+					unpack_color5(b, get_base5_color(), get_delta3_color(), true);
+				else
+					unpack_color5(b, get_base5_color(), true);
+			}
+			else
+			{
+				b = unpack_color4(get_base4_color(subblock_index), true);
+			}
+
+			const int* pInten_table = g_etc1_inten_tables[get_inten_table(subblock_index)];
+
+			color.set(clamp255(b.r + pInten_table[selector_index]), clamp255(b.g + pInten_table[selector_index]), clamp255(b.b + pInten_table[selector_index]), 255);
+		}
+
+		bool get_block_low_high_colors(color_rgba* pBlock_colors, uint32_t subblock_index) const // Writes only the colors for intensity entries 0 and 3 to pBlock_colors[0] and [1]; returns true if clamping occurred.
+		{
+			color_rgba b;
+
+			if (get_diff_bit())
+			{
+				if (subblock_index)
+					unpack_color5(b, get_base5_color(), get_delta3_color(), true);
+				else
+					unpack_color5(b, get_base5_color(), true);
+			}
+			else
+			{
+				b = unpack_color4(get_base4_color(subblock_index), true);
+			}
+
+			const int* pInten_table = g_etc1_inten_tables[get_inten_table(subblock_index)];
+
+			bool dc = false;
+
+			pBlock_colors[0].set(clamp255(b.r + pInten_table[0], dc), clamp255(b.g + pInten_table[0], dc), clamp255(b.b + pInten_table[0], dc), 255);
+			pBlock_colors[1].set(clamp255(b.r + pInten_table[3], dc), clamp255(b.g + pInten_table[3], dc), clamp255(b.b + pInten_table[3], dc), 255);
+
+			return dc;
+		}
+
+		static void get_block_colors5(color_rgba *pBlock_colors, const color_rgba &base_color5, uint32_t inten_table, bool scaled = false) // Builds the 4 colors for a 5-bit-per-channel base color; pass scaled=true when base_color5 is already expanded to 8 bits.
+		{
+			color_rgba b(base_color5);
+
+			if (!scaled)
+			{
+				b.r = (b.r << 3) | (b.r >> 2); // expand 5 bits to 8 by replicating the top bits
+				b.g = (b.g << 3) | (b.g >> 2);
+				b.b = (b.b << 3) | (b.b >> 2);
+			}
+
+			const int* pInten_table = g_etc1_inten_tables[inten_table];
+
+			pBlock_colors[0].set(clamp255(b.r + pInten_table[0]), clamp255(b.g + pInten_table[0]), clamp255(b.b + pInten_table[0]), 255);
+			pBlock_colors[1].set(clamp255(b.r + pInten_table[1]), clamp255(b.g + pInten_table[1]), clamp255(b.b + pInten_table[1]), 255);
+			pBlock_colors[2].set(clamp255(b.r + pInten_table[2]), clamp255(b.g + pInten_table[2]), clamp255(b.b + pInten_table[2]), 255);
+			pBlock_colors[3].set(clamp255(b.r + pInten_table[3]), clamp255(b.g + pInten_table[3]), clamp255(b.b + pInten_table[3]), 255);
+		}
+
+		static void get_block_colors4(color_rgba *pBlock_colors, const color_rgba &base_color4, uint32_t inten_table, bool scaled = false) // Same as get_block_colors5() for a 4-bit-per-channel base color.
+		{
+			color_rgba b(base_color4);
+
+			if (!scaled)
+			{
+				b.r = (b.r << 4) | b.r; // expand 4 bits to 8 by duplicating the nibble
+				b.g = (b.g << 4) | b.g;
+				b.b = (b.b << 4) | b.b;
+			}
+
+			const int* pInten_table = g_etc1_inten_tables[inten_table];
+
+			pBlock_colors[0].set(clamp255(b.r + pInten_table[0]), clamp255(b.g + pInten_table[0]), clamp255(b.b + pInten_table[0]), 255);
+			pBlock_colors[1].set(clamp255(b.r + pInten_table[1]), clamp255(b.g + pInten_table[1]), clamp255(b.b + pInten_table[1]), 255);
+			pBlock_colors[2].set(clamp255(b.r + pInten_table[2]), clamp255(b.g + pInten_table[2]), clamp255(b.b + pInten_table[2]), 255);
+			pBlock_colors[3].set(clamp255(b.r + pInten_table[3]), clamp255(b.g + pInten_table[3]), clamp255(b.b + pInten_table[3]), 255);
+		}
+
+		uint64_t evaluate_etc1_error(const color_rgba* pBlock_pixels, bool perceptual, int subblock_index = -1) const;
+		void get_subblock_pixels(color_rgba* pPixels, int subblock_index = -1) const;
+
+		void get_selector_range(uint32_t& low, uint32_t& high) const // Smallest and largest selector index used by any of the 16 pixels.
+		{
+			low = 3;
+			high = 0;
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t s = get_selector(x, y);
+					low = minimum(low, s);
+					high = maximum(high, s);
+				}
+			}
+		}
+
+		void set_block_color4(const color_rgba &c0_unscaled, const color_rgba &c1_unscaled) // Clears the diff bit and stores two independent 4-bit-per-channel base colors.
+		{
+			set_diff_bit(false);
+
+			set_base4_color(0, pack_color4(c0_unscaled, false));
+			set_base4_color(1, pack_color4(c1_unscaled, false));
+		}
+
+		void set_block_color5(const color_rgba &c0_unscaled, const color_rgba &c1_unscaled) // Sets the diff bit and stores c0 plus the c1 - c0 delta; the delta is NOT range checked here (see the _check/_clamp variants).
+		{
+			set_diff_bit(true);
+
+			set_base5_color(pack_color5(c0_unscaled, false));
+
+			int dr = c1_unscaled.r - c0_unscaled.r;
+			int dg = c1_unscaled.g - c0_unscaled.g;
+			int db = c1_unscaled.b - c0_unscaled.b;
+
+			set_delta3_color(pack_delta3(dr, dg, db));
+		}
+
+		void set_block_color5_etc1s(const color_rgba &c_unscaled) // Differential mode with a zero delta, so both subblocks share c_unscaled.
+		{
+			set_diff_bit(true);
+			
+			set_base5_color(pack_color5(c_unscaled, false));
+			set_delta3_color(pack_delta3(0, 0, 0));
+		}
+
+		bool set_block_color5_check(const color_rgba &c0_unscaled, const color_rgba &c1_unscaled) // Like set_block_color5(), but returns false when any delta is outside [cETC1ColorDeltaMin, cETC1ColorDeltaMax].
+		{
+			set_diff_bit(true);
+
+			set_base5_color(pack_color5(c0_unscaled, false));
+
+			int dr = c1_unscaled.r - c0_unscaled.r;
+			int dg = c1_unscaled.g - c0_unscaled.g;
+			int db = c1_unscaled.b - c0_unscaled.b;
+
+			if (((dr < cETC1ColorDeltaMin) || (dr > cETC1ColorDeltaMax)) ||
+				((dg < cETC1ColorDeltaMin) || (dg > cETC1ColorDeltaMax)) ||
+				((db < cETC1ColorDeltaMin) || (db > cETC1ColorDeltaMax)))
+				return false; // note: the diff bit and base5 color were already written above; only the delta is left unchanged
+
+			set_delta3_color(pack_delta3(dr, dg, db));
+
+			return true;
+		}
+
+		bool set_block_color5_clamp(const color_rgba &c0_unscaled, const color_rgba &c1_unscaled) // Like set_block_color5(), but clamps each delta into the legal range; always returns true.
+		{
+			set_diff_bit(true);
+			set_base5_color(pack_color5(c0_unscaled, false));
+
+			int dr = c1_unscaled.r - c0_unscaled.r;
+			int dg = c1_unscaled.g - c0_unscaled.g;
+			int db = c1_unscaled.b - c0_unscaled.b;
+			
+			dr = clamp<int>(dr, cETC1ColorDeltaMin, cETC1ColorDeltaMax);
+			dg = clamp<int>(dg, cETC1ColorDeltaMin, cETC1ColorDeltaMax);
+			db = clamp<int>(db, cETC1ColorDeltaMin, cETC1ColorDeltaMax);
+						
+			set_delta3_color(pack_delta3(dr, dg, db));
+
+			return true;
+		}
+		color_rgba get_selector_color(uint32_t x, uint32_t y, uint32_t s) const // Color that selector index 's' produces in the subblock containing pixel (x, y).
+		{
+			color_rgba block_colors[4];
+
+			get_block_colors(block_colors, get_subblock_index(x, y));
+
+			return block_colors[s];
+		}
+
+		// Base color 5
+		static uint16_t pack_color5(const color_rgba& color, bool scaled, uint32_t bias = 127U);
+		static uint16_t pack_color5(uint32_t r, uint32_t g, uint32_t b, bool scaled, uint32_t bias = 127U);
+
+		static color_rgba unpack_color5(uint16_t packed_color5, bool scaled, uint32_t alpha = 255U);
+		static void unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color, bool scaled);
+		static void unpack_color5(color_rgba& result, uint16_t packed_color5, bool scaled);
+
+		static bool unpack_color5(color_rgba& result, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha = 255U);
+		static bool unpack_color5(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color5, uint16_t packed_delta3, bool scaled, uint32_t alpha = 255U);
+
+		// Delta color 3
+		// Inputs range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
+		static uint16_t pack_delta3(const color_rgba_i16& color);
+		static uint16_t pack_delta3(int r, int g, int b);
+
+		// Results range from -4 to 3 (cETC1ColorDeltaMin to cETC1ColorDeltaMax)
+		static color_rgba_i16 unpack_delta3(uint16_t packed_delta3);
+		static void unpack_delta3(int& r, int& g, int& b, uint16_t packed_delta3);
+
+		static bool try_pack_color5_delta3(const color_rgba *pColor5_unscaled) // pColor5_unscaled points at two colors; returns true if every channel of [1] - [0] lies within the delta3 range.
+		{
+			int dr = pColor5_unscaled[1].r - pColor5_unscaled[0].r;
+			int dg = pColor5_unscaled[1].g - pColor5_unscaled[0].g;
+			int db = pColor5_unscaled[1].b - pColor5_unscaled[0].b;
+
+			if ((minimum(dr, dg, db) < cETC1ColorDeltaMin) || (maximum(dr, dg, db) > cETC1ColorDeltaMax))
+				return false;
+
+			return true;
+		}
+
+		// Abs color 4
+		static uint16_t pack_color4(const color_rgba& color, bool scaled, uint32_t bias = 127U);
+		static uint16_t pack_color4(uint32_t r, uint32_t g, uint32_t b, bool scaled, uint32_t bias = 127U);
+
+		static color_rgba unpack_color4(uint16_t packed_color4, bool scaled, uint32_t alpha = 255U);
+		static void unpack_color4(uint32_t& r, uint32_t& g, uint32_t& b, uint16_t packed_color4, bool scaled);
+
+		// subblock colors
+		static void get_diff_subblock_colors(color_rgba* pDst, uint16_t packed_color5, uint32_t table_idx);
+		static bool get_diff_subblock_colors(color_rgba* pDst, uint16_t packed_color5, uint16_t packed_delta3, uint32_t table_idx);
+		static void get_abs_subblock_colors(color_rgba* pDst, uint16_t packed_color4, uint32_t table_idx);
+
+		static inline void unscaled_to_scaled_color(color_rgba& dst, const color_rgba& src, bool color4) // Expands a 4-bit (color4) or 5-bit color to 8 bits per channel by bit replication; alpha is copied unchanged.
+		{
+			if (color4)
+			{
+				dst.r = src.r | (src.r << 4);
+				dst.g = src.g | (src.g << 4);
+				dst.b = src.b | (src.b << 4);
+			}
+			else
+			{
+				dst.r = (src.r >> 2) | (src.r << 3);
+				dst.g = (src.g >> 2) | (src.g << 3);
+				dst.b = (src.b >> 2) | (src.b << 3);
+			}
+			dst.a = src.a;
+		}
+
+	private:
+		static uint8_t clamp255(int x, bool &did_clamp) // Clamps x to 0-255; sets did_clamp when clamping happened and never clears it.
+		{
+			if (x < 0)
+			{
+				did_clamp = true;
+				return 0;
+			}
+			else if (x > 255)
+			{
+				did_clamp = true;
+				return 255;
+			}
+
+			return static_cast<uint8_t>(x);
+		}
+
+		static uint8_t clamp255(int x) // Clamps x to 0-255.
+		{
+			if (x < 0)
+				return 0;
+			else if (x > 255)
+				return 255;
+
+			return static_cast<uint8_t>(x);
+		}
+	};
+		
+	typedef basisu::vector<etc_block> etc_block_vec;
+
+	// Returns false if the unpack fails (could be bogus data or ETC2)
+	bool unpack_etc1(const etc_block& block, color_rgba *pDst, bool preserve_alpha = false);
+		
+	enum basis_etc_quality // ETC quality levels, declared in the order Fast, Medium, Slow, Uber.
+	{
+		cETCQualityFast,
+		cETCQualityMedium,
+		cETCQualitySlow, // the default assigned by basis_etc1_pack_params::clear()
+		cETCQualityUber,
+		cETCQualityTotal, // one past the last level (presumably a count rather than a usable level; confirm at call sites)
+	};
+
+	// Options controlling ETC1 block packing.
+	// A freshly constructed object holds the defaults assigned by clear().
+	struct basis_etc1_pack_params
+	{
+		basis_etc_quality m_quality;
+		bool m_perceptual;
+		bool m_cluster_fit;
+		bool m_force_etc1s;
+		bool m_use_color4;
+		float m_flip_bias;
+
+		inline basis_etc1_pack_params() { clear(); }
+
+		// Resets every option to its default value.
+		void clear()
+		{
+			m_flip_bias = 0.0f;
+			m_use_color4 = true;
+			m_force_etc1s = false;
+			m_cluster_fit = true;
+			m_perceptual = true;
+			m_quality = cETCQualitySlow;
+		}
+	};
+
+	// One candidate ETC1 solution: an unscaled base color, an intensity table index and a flag
+	// selecting which bit-replication rule (color4 or not) is used to scale the base color.
+	struct etc1_solution_coordinates
+	{
+		// Default: black, zero alpha, intensity table 0, color4 off.
+		inline etc1_solution_coordinates() :
+			m_unscaled_color(0, 0, 0, 0),
+			m_inten_table(0),
+			m_color4(false)
+		{
+		}
+
+		// Components and table index are truncated to 8 bits; alpha is set to 255.
+		inline etc1_solution_coordinates(uint32_t r, uint32_t g, uint32_t b, uint32_t inten_table, bool color4) :
+			m_unscaled_color((uint8_t)r, (uint8_t)g, (uint8_t)b, 255),
+			m_inten_table((uint8_t)inten_table),
+			m_color4(color4)
+		{
+		}
+
+		inline etc1_solution_coordinates(const color_rgba& c, uint32_t inten_table, bool color4) :
+			m_unscaled_color(c),
+			m_inten_table(inten_table),
+			m_color4(color4)
+		{
+		}
+
+		inline etc1_solution_coordinates(const etc1_solution_coordinates& other)
+		{
+			*this = other;
+		}
+
+		inline etc1_solution_coordinates& operator= (const etc1_solution_coordinates& rhs)
+		{
+			m_unscaled_color = rhs.m_unscaled_color;
+			m_inten_table = rhs.m_inten_table;
+			m_color4 = rhs.m_color4;
+			return *this;
+		}
+
+		// Resets to a cleared color, intensity table 0 and color4 off.
+		inline void clear()
+		{
+			m_unscaled_color.clear();
+			m_inten_table = 0;
+			m_color4 = false;
+		}
+
+		// Replaces all three fields at once.
+		inline void init(const color_rgba& c, uint32_t inten_table, bool color4)
+		{
+			m_unscaled_color = c;
+			m_inten_table = inten_table;
+			m_color4 = color4;
+		}
+
+		// Returns the base color after bit replication, each channel truncated to 8 bits, with alpha 255.
+		inline color_rgba get_scaled_color() const
+		{
+			int br, bg, bb;
+			if (m_color4)
+			{
+				br = m_unscaled_color.r | (m_unscaled_color.r << 4);
+				bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
+				bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
+			}
+			else
+			{
+				br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
+				bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
+				bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
+			}
+			return color_rgba((uint8_t)br, (uint8_t)bg, (uint8_t)bb, 255);
+		}
+
+		// Writes the four block colors (scaled base color plus each of the four modifiers of the
+		// selected intensity table, alpha 255) to pBlock_colors[0..3].
+		// This function returns nothing: any range limiting is left to color_rgba::set().
+		// NOTE(review): presumably the int overload of set() clamps to [0, 255] - confirm.
+		// It does not modify the object, hence the const qualifier.
+		inline void get_block_colors(color_rgba* pBlock_colors) const
+		{
+			int br, bg, bb;
+			if (m_color4)
+			{
+				br = m_unscaled_color.r | (m_unscaled_color.r << 4);
+				bg = m_unscaled_color.g | (m_unscaled_color.g << 4);
+				bb = m_unscaled_color.b | (m_unscaled_color.b << 4);
+			}
+			else
+			{
+				br = (m_unscaled_color.r >> 2) | (m_unscaled_color.r << 3);
+				bg = (m_unscaled_color.g >> 2) | (m_unscaled_color.g << 3);
+				bb = (m_unscaled_color.b >> 2) | (m_unscaled_color.b << 3);
+			}
+			const int* pInten_table = g_etc1_inten_tables[m_inten_table];
+			pBlock_colors[0].set(br + pInten_table[0], bg + pInten_table[0], bb + pInten_table[0], 255);
+			pBlock_colors[1].set(br + pInten_table[1], bg + pInten_table[1], bb + pInten_table[1], 255);
+			pBlock_colors[2].set(br + pInten_table[2], bg + pInten_table[2], bb + pInten_table[2], 255);
+			pBlock_colors[3].set(br + pInten_table[3], bg + pInten_table[3], bb + pInten_table[3], 255);
+		}
+
+		color_rgba m_unscaled_color;
+		uint32_t m_inten_table;
+		bool m_color4;
+	};
+
+	// Searches for an ETC1 base color / intensity table / selector assignment for a set of source pixels.
+	// Usage as visible here: fill in a params and a results object, call init(), then compute().
+	// The optimizer only stores pointers to the params and results it is given; it does not own them.
+	class etc1_optimizer
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(etc1_optimizer);
+
+	public:
+		etc1_optimizer()
+		{
+			clear();
+		}
+
+		// Nulls the borrowed pointers. Other members are left as they are.
+		void clear()
+		{
+			m_pParams = nullptr;
+			m_pResult = nullptr;
+			m_pSorted_luma = nullptr;
+			m_pSorted_luma_indices = nullptr;
+		}
+
+		struct params;
+
+		typedef bool(*evaluate_solution_override_func)(uint64_t &error, const params &p, const color_rgba* pBlock_colors, const uint8_t* pSelectors, const etc1_solution_coordinates& coords);
+
+		// Optimizer inputs: the general pack parameters plus the pixels to fit and search controls.
+		struct params : basis_etc1_pack_params
+		{
+			params()
+			{
+				clear();
+			}
+
+			// Resets the optimizer-specific fields to their defaults, then copies the base pack parameters.
+			params(const basis_etc1_pack_params& base_params)
+			{
+				clear_optimizer_params();
+
+				*static_cast<basis_etc1_pack_params *>(this) = base_params;
+			}
+
+			void clear()
+			{
+				clear_optimizer_params();
+			}
+
+			// Resets the base class options and every optimizer-specific field.
+			void clear_optimizer_params()
+			{
+				basis_etc1_pack_params::clear();
+
+				m_num_src_pixels = 0;
+				m_pSrc_pixels = 0;
+
+				m_use_color4 = false;
+				// Default scan: a single delta of 0.
+				static const int s_default_scan_delta[] = { 0 };
+				m_pScan_deltas = s_default_scan_delta;
+				m_scan_delta_size = 1;
+
+				m_base_color5.clear();
+				m_constrain_against_base_color5 = false;
+
+				m_refinement = true;
+
+				m_pForce_selectors = nullptr;
+			}
+
+			uint32_t m_num_src_pixels;
+			const color_rgba* m_pSrc_pixels;
+
+			// NOTE(review): this member hides basis_etc1_pack_params::m_use_color4; unqualified uses
+			// inside params refer to this one, and the base copy in the converting constructor only
+			// writes the hidden base member. Confirm this is intended.
+			bool m_use_color4;
+			const int* m_pScan_deltas;
+			uint32_t m_scan_delta_size;
+
+			color_rgba m_base_color5;
+			bool m_constrain_against_base_color5;
+
+			bool m_refinement;
+
+			const uint8_t* m_pForce_selectors;
+		};
+
+		// Optimizer output. m_pSelectors points at a caller-provided buffer of m_n bytes.
+		struct results
+		{
+			uint64_t m_error;
+			color_rgba m_block_color_unscaled;
+			uint32_t m_block_inten_table;
+			uint32_t m_n;
+			uint8_t* m_pSelectors;
+			bool m_block_color4;
+
+			// Copies the solution fields and the selector bytes (the smaller of the two m_n counts).
+			// m_n and the m_pSelectors pointer of the destination are deliberately left unchanged.
+			inline results& operator= (const results& rhs)
+			{
+				m_block_color_unscaled = rhs.m_block_color_unscaled;
+				m_block_color4 = rhs.m_block_color4;
+				m_block_inten_table = rhs.m_block_inten_table;
+				m_error = rhs.m_error;
+				memcpy(m_pSelectors, rhs.m_pSelectors, minimum(rhs.m_n, m_n));
+				return *this;
+			}
+		};
+
+		void init(const params& params, results& result);
+		bool compute();
+
+		const params* get_params() const { return m_pParams; }
+
+	private:
+		// A candidate solution together with its selectors and error.
+		struct potential_solution
+		{
+			potential_solution() : m_coords(), m_error(UINT64_MAX), m_valid(false)
+			{
+			}
+
+			etc1_solution_coordinates  m_coords;
+			basisu::vector<uint8_t>    m_selectors;
+			uint64_t                     m_error;
+			bool                       m_valid;
+
+			void clear()
+			{
+				m_coords.clear();
+				m_selectors.resize(0);
+				m_error = UINT64_MAX;
+				m_valid = false;
+			}
+
+			// Returns true when there is at least one selector and all of them hold the same value.
+			bool are_selectors_all_equal() const
+			{
+				if (!m_selectors.size())
+					return false;
+				const uint32_t s = m_selectors[0];
+				for (uint32_t i = 1; i < m_selectors.size(); i++)
+					if (m_selectors[i] != s)
+						return false;
+				return true;
+			}
+		};
+
+		const params* m_pParams;
+		results* m_pResult;
+
+		int m_limit;
+
+		vec3F m_avg_color;
+		int m_br, m_bg, m_bb;
+		int m_max_comp_spread;
+		basisu::vector<uint16_t> m_luma;
+		basisu::vector<uint32_t> m_sorted_luma;
+		basisu::vector<uint32_t> m_sorted_luma_indices;
+		const uint32_t* m_pSorted_luma_indices;
+		uint32_t* m_pSorted_luma;
+
+		basisu::vector<uint8_t> m_selectors;
+		basisu::vector<uint8_t> m_best_selectors;
+
+		potential_solution m_best_solution;
+		potential_solution m_trial_solution;
+		basisu::vector<uint8_t> m_temp_selectors;
+
+		// Bit set with one bit per hash bucket (cTotalSolutionsTriedHashSize bits in total).
+		enum { cSolutionsTriedHashBits = 10, cTotalSolutionsTriedHashSize = 1 << cSolutionsTriedHashBits, cSolutionsTriedHashMask = cTotalSolutionsTriedHashSize - 1 };
+		uint8_t m_solutions_tried[cTotalSolutionsTriedHashSize / 8];
+
+		// Computes the range of intensity tables adjacent to idx.
+		// The lower bound is computed in signed arithmetic so that idx == 0 yields 0 without relying
+		// on an unsigned wrap-around followed by an out-of-range conversion to int.
+		// NOTE(review): the upper bound is capped at cETC1IntenModifierValues, not at that value
+		// minus one - confirm whether callers treat last_inten_table as exclusive.
+		void get_nearby_inten_tables(uint32_t idx, int &first_inten_table, int &last_inten_table)
+		{
+			first_inten_table = maximum<int>(static_cast<int>(idx) - 1, 0);
+			last_inten_table = minimum<int>(cETC1IntenModifierValues, idx + 1);
+		}
+
+		bool check_for_redundant_solution(const etc1_solution_coordinates& coords);
+		bool evaluate_solution_slow(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
+		bool evaluate_solution_fast(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution);
+
+		// Dispatches to the slow evaluator at medium quality and above, otherwise to the fast one.
+		inline bool evaluate_solution(const etc1_solution_coordinates& coords, potential_solution& trial_solution, potential_solution* pBest_solution)
+		{
+			if (m_pParams->m_quality >= cETCQualityMedium)
+				return evaluate_solution_slow(coords, trial_solution, pBest_solution);
+			else
+				return evaluate_solution_fast(coords, trial_solution, pBest_solution);
+		}
+
+		void refine_solution(uint32_t max_refinement_trials);
+		void compute_internal_neighborhood(int scan_r, int scan_g, int scan_b);
+		void compute_internal_cluster_fit(uint32_t total_perms_to_try);
+	};
+
+	// Bundles the state used while packing ETC1 blocks; currently this is just one etc1_optimizer.
+	// NOTE(review): presumably kept per caller/thread so the optimizer's buffers can be reused - confirm.
+	struct pack_etc1_block_context
+	{
+		etc1_optimizer m_optimizer;
+	};
+	
+	void pack_etc1_solid_color_init();
+	uint64_t pack_etc1_block_solid_color(etc_block& block, const uint8_t* pColor);
+
+	// ETC EAC
+	extern const int8_t g_etc2_eac_tables[16][8];
+	extern const int8_t g_etc2_eac_tables8[16][8];
+
+	const uint32_t ETC2_EAC_MIN_VALUE_SELECTOR = 3, ETC2_EAC_MAX_VALUE_SELECTOR = 7;
+
+	// An EAC alpha block: base value, table index and multiplier in bit-fields, followed by
+	// sixteen 3-bit selectors packed big-endian into six bytes (48 bits).
+	struct eac_a8_block
+	{
+		uint16_t m_base : 8;
+		uint16_t m_table : 4;
+		uint16_t m_multiplier : 4;
+
+		uint8_t m_selectors[6];
+
+		// Extracts the selector of pixel (x, y) from a 48-bit word as returned by get_selector_bits().
+		// Pixel (0, 0) occupies the top three bits (45..47); the index advances by y first, then by x.
+		inline uint32_t get_selector(uint32_t x, uint32_t y, uint64_t selector_bits) const
+		{
+			assert((x < 4) && (y < 4));
+			const uint32_t shift = 45 - (y + x * 4) * 3;
+			return static_cast<uint32_t>((selector_bits >> shift) & 7);
+		}
+
+		// Assembles the six selector bytes into one 48-bit value, m_selectors[0] being the most significant byte.
+		inline uint64_t get_selector_bits() const
+		{
+			uint64_t bits = 0;
+			for (uint32_t i = 0; i < 6; i++)
+				bits = (bits << 8) | m_selectors[i];
+			return bits;
+		}
+
+		// Stores the low 48 bits of pixels into the six selector bytes, most significant byte first.
+		inline void set_selector_bits(uint64_t pixels)
+		{
+			for (uint32_t i = 0; i < 6; i++)
+				m_selectors[i] = (uint8_t)(pixels >> (40 - 8 * i));
+		}
+
+		// Replaces the 3-bit selector of pixel (x, y) with s, leaving all other selectors untouched.
+		void set_selector(uint32_t x, uint32_t y, uint32_t s)
+		{
+			assert((x < 4) && (y < 4) && (s < 8));
+
+			const uint32_t shift = 45 - (y + x * 4) * 3;
+			const uint64_t field_mask = 7ULL << shift;
+
+			uint64_t bits = get_selector_bits();
+			bits = (bits & ~field_mask) | (static_cast<uint64_t>(s) << shift);
+
+			set_selector_bits(bits);
+		}
+	};
+
+	// An ETC2 RGBA block: the EAC alpha block comes first, followed by the RGB color block.
+	struct etc2_rgba_block
+	{
+		eac_a8_block m_alpha;
+		etc_block m_rgb;
+	};
+
+	// Output of the pack_eac_a8() overload that takes a results object: the chosen base, table and
+	// multiplier plus the per-pixel selectors.
+	// NOTE(review): m_selectors_temp looks like scratch storage used during the search - confirm.
+	struct pack_eac_a8_results
+	{
+		uint32_t m_base;
+		uint32_t m_table;
+		uint32_t m_multiplier;
+		uint8_vec m_selectors;
+		uint8_vec m_selectors_temp;
+	};
+
+	uint64_t pack_eac_a8(pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask = UINT32_MAX);
+	void pack_eac_a8(eac_a8_block* pBlock, const uint8_t* pPixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask = UINT32_MAX);
+		
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_frontend.cpp b/thirdparty/basis_universal/encoder/basisu_frontend.cpp
new file mode 100644
index 0000000000..324fc8e447
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_frontend.cpp
@@ -0,0 +1,2967 @@
+// basisu_frontend.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// TODO: 
+// This code originally supported full ETC1 and ETC1S, so there's some legacy stuff to be cleaned up in here.
+// Add endpoint tiling support (where we force adjacent blocks to use the same endpoints during quantization), for a ~10% or more increase in bitrate at same SSIM. The backend already supports this.
+//
+#include "../transcoder/basisu.h"
+#include "basisu_frontend.h"
+#include <unordered_set>
+#include <unordered_map>
+
+#if BASISU_SUPPORT_SSE
+#define CPPSPMD_NAME(a) a##_sse41
+#include "basisu_kernels_declares.h"
+#endif
+
+#define BASISU_FRONTEND_VERIFY(c) do { if (!(c)) handle_verify_failure(__LINE__); } while(0)
+
+namespace basisu
+{
+	const uint32_t cMaxCodebookCreationThreads = 8;
+
+	const uint32_t BASISU_MAX_ENDPOINT_REFINEMENT_STEPS = 3;
+	//const uint32_t BASISU_MAX_SELECTOR_REFINEMENT_STEPS = 3;
+
+	const uint32_t BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE = 16;
+	const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 = 32;
+	const uint32_t BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT = 16;
+	
+	// TODO - How to handle internal verifies in the basisu lib
+	// Reports a failed BASISU_FRONTEND_VERIFY check on stderr and aborts the process.
+	// line is the source line of the failed check.
+	static inline void handle_verify_failure(int line)
+	{
+		fprintf(stderr, "ERROR: basisu_frontend: verify check failed at line %i!\n", line);
+		abort();
+	}
+			
+	// Validates the parameters, copies the source blocks, stores the parameters and derives the
+	// refinement / hierarchical-codebook settings from the compression level.
+	// Returns false (without touching any state) when hybrid selector codebooks are requested without
+	// a global selector codebook, or when a cluster count is outside [1, cMax*Clusters].
+	bool basisu_frontend::init(const params &p)
+	{
+#if 0
+		// HACK HACK
+		FILE* pFile;
+		fopen_s(&pFile, "tv.bin", "rb");
+		if (pFile)
+		{
+			debug_printf("Using tv.bin\n");
+
+			fseek(pFile, 0, SEEK_END);
+			uint32_t size = ftell(pFile);
+			fseek(pFile, 0, SEEK_SET);
+
+			uint32_t tv = size / sizeof(vec6F_quantizer::training_vec_with_weight);
+
+			basisu::vector<vec6F_quantizer::training_vec_with_weight> v(tv);
+			fread(&v[0], 1, sizeof(v[0]) * tv, pFile);
+
+			for (uint32_t i = 0; i < tv; i++)
+				m_endpoint_clusterizer.add_training_vec(v[i].first, v[i].second);
+
+			m_endpoint_clusterizer.generate(16128);
+			basisu::vector<uint_vec> codebook;
+			m_endpoint_clusterizer.retrieve(codebook);
+
+			printf("Generated %u entries\n", (uint32_t)codebook.size());
+
+			fclose(pFile);
+			exit(0);
+		}
+#endif
+
+		if (p.m_use_hybrid_selector_codebooks)
+		{
+			if (!p.m_pGlobal_sel_codebook)
+			{
+				debug_printf("basisu_frontend::init: No global sel codebook!\n");
+				assert(0);
+				return false;
+			}
+		}
+
+		debug_printf("basisu_frontend::init: Multithreaded: %u, NumEndpointClusters: %u, NumSelectorClusters: %u, Perceptual: %u, CompressionLevel: %u\n",
+			p.m_multithreaded, p.m_max_endpoint_clusters, p.m_max_selector_clusters, p.m_perceptual, p.m_compression_level);
+
+		debug_printf("Global sel codebook pal bits: %u, Global sel codebook mod bits: %u, Use hybrid selector codebook: %u, Hybrid codebook quality thresh: %f\n",
+			p.m_num_global_sel_codebook_pal_bits,
+			p.m_num_global_sel_codebook_mod_bits,
+			p.m_use_hybrid_selector_codebooks,
+			p.m_hybrid_codebook_quality_thresh);
+
+		if ((p.m_max_endpoint_clusters < 1) || (p.m_max_endpoint_clusters > cMaxEndpointClusters))
+			return false;
+		if ((p.m_max_selector_clusters < 1) || (p.m_max_selector_clusters > cMaxSelectorClusters))
+			return false;
+
+		m_source_blocks.resize(0);
+		append_vector(m_source_blocks, p.m_pSource_blocks, p.m_num_source_blocks);
+
+		m_params = p;
+
+		m_encoded_blocks.resize(m_params.m_num_source_blocks);
+		// Taking the address of element 0 is only valid when there is at least one block.
+		if (m_encoded_blocks.size())
+			memset(&m_encoded_blocks[0], 0, m_encoded_blocks.size() * sizeof(m_encoded_blocks[0]));
+
+		m_num_endpoint_codebook_iterations = 1;
+		m_num_selector_codebook_iterations = 1;
+
+		// Higher compression levels enable endpoint refinement, drop the hierarchical codebooks
+		// and/or run more codebook iterations. Levels above 6 are treated like level 6.
+		switch (p.m_compression_level)
+		{
+		case 0:
+		{
+			m_endpoint_refinement = false;
+			m_use_hierarchical_endpoint_codebooks = true;
+			m_use_hierarchical_selector_codebooks = true;
+			break;
+		}
+		case 1:
+		case 2:
+		{
+			m_endpoint_refinement = true;
+			m_use_hierarchical_endpoint_codebooks = true;
+			m_use_hierarchical_selector_codebooks = true;
+			break;
+		}
+		case 3:
+		{
+			m_endpoint_refinement = true;
+			m_use_hierarchical_endpoint_codebooks = false;
+			m_use_hierarchical_selector_codebooks = false;
+			break;
+		}
+		case 4:
+		{
+			m_endpoint_refinement = true;
+			m_use_hierarchical_endpoint_codebooks = true;
+			m_use_hierarchical_selector_codebooks = true;
+			m_num_endpoint_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;
+			m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;
+			break;
+		}
+		case 5:
+		{
+			m_endpoint_refinement = true;
+			m_use_hierarchical_endpoint_codebooks = false;
+			m_use_hierarchical_selector_codebooks = false;
+			m_num_endpoint_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;
+			m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS;
+			break;
+		}
+		case 6:
+		default:
+		{
+			m_endpoint_refinement = true;
+			m_use_hierarchical_endpoint_codebooks = false;
+			m_use_hierarchical_selector_codebooks = false;
+			m_num_endpoint_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS*2;
+			m_num_selector_codebook_iterations = BASISU_MAX_ENDPOINT_REFINEMENT_STEPS*2;
+			break;
+		}
+
+		}
+
+		// An explicit request to disable hierarchical endpoint codebooks overrides the level's choice.
+		if (m_params.m_disable_hierarchical_endpoint_codebooks)
+			m_use_hierarchical_endpoint_codebooks = false;
+
+		debug_printf("Endpoint refinement: %u, Hierarchical endpoint codebooks: %u, Hierarchical selector codebooks: %u, Endpoint codebook iters: %u, Selector codebook iters: %u\n", 
+			m_endpoint_refinement, m_use_hierarchical_endpoint_codebooks, m_use_hierarchical_selector_codebooks, m_num_endpoint_codebook_iterations, m_num_selector_codebook_iterations);
+
+		return true;
+	}
+
+	// Runs the frontend on the blocks supplied to init().
+	// With global codebooks the blocks are mapped onto them via init_global_codebooks(); otherwise
+	// endpoint and selector codebooks are generated and refined here. finalize() runs in both cases.
+	// Returns false when global codebook initialization fails or when validation is enabled and
+	// validate_output() fails; returns true otherwise.
+	bool basisu_frontend::compress()
+	{
+		debug_printf("basisu_frontend::compress\n");
+
+		m_total_blocks = m_params.m_num_source_blocks;
+		m_total_pixels = m_total_blocks * cPixelBlockTotalPixels;
+
+		init_etc1_images();
+
+		if (m_params.m_pGlobal_codebooks)
+		{
+			// Propagate a failure instead of silently discarding the result.
+			if (!init_global_codebooks())
+				return false;
+		}
+		else
+		{
+			init_endpoint_training_vectors();
+
+			generate_endpoint_clusters();
+
+			for (uint32_t refine_endpoint_step = 0; refine_endpoint_step < m_num_endpoint_codebook_iterations; refine_endpoint_step++)
+			{
+				BASISU_FRONTEND_VERIFY(check_etc1s_constraints());
+
+				// Every iteration after the first starts by introducing new endpoint clusters.
+				if (refine_endpoint_step)
+				{
+					introduce_new_endpoint_clusters();
+				}
+
+				generate_endpoint_codebook(refine_endpoint_step);
+
+				if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization))
+				{
+					char buf[256];
+					snprintf(buf, sizeof(buf), "endpoint_cluster_vis_pre_%u.png", refine_endpoint_step);
+					dump_endpoint_clusterization_visualization(buf, false);
+				}
+
+				bool early_out = false;
+
+				if (m_endpoint_refinement)
+				{
+					//dump_endpoint_clusterization_visualization("endpoint_clusters_before_refinement.png");
+
+					// A false result ends the loop, but only after this iteration's cleanup below has run.
+					if (!refine_endpoint_clusterization())
+						early_out = true;
+
+					if ((m_params.m_tex_type == basist::cBASISTexTypeVideoFrames) && (!refine_endpoint_step) && (m_num_endpoint_codebook_iterations == 1))
+					{
+						eliminate_redundant_or_empty_endpoint_clusters();
+						generate_endpoint_codebook(refine_endpoint_step);
+					}
+
+					if ((m_params.m_debug_images) && (m_params.m_dump_endpoint_clusterization))
+					{
+						char buf[256];
+						snprintf(buf, sizeof(buf), "endpoint_cluster_vis_post_%u.png", refine_endpoint_step);
+
+						dump_endpoint_clusterization_visualization(buf, false);
+						snprintf(buf, sizeof(buf), "endpoint_cluster_colors_vis_post_%u.png", refine_endpoint_step);
+
+						dump_endpoint_clusterization_visualization(buf, true);
+					}
+				}
+
+				eliminate_redundant_or_empty_endpoint_clusters();
+
+				if (m_params.m_debug_stats)
+					debug_printf("Total endpoint clusters: %u\n", (uint32_t)m_endpoint_clusters.size());
+
+				if (early_out)
+					break;
+			}
+
+			BASISU_FRONTEND_VERIFY(check_etc1s_constraints());
+
+			generate_block_endpoint_clusters();
+
+			create_initial_packed_texture();
+
+			generate_selector_clusters();
+
+			if (m_use_hierarchical_selector_codebooks)
+				compute_selector_clusters_within_each_parent_cluster();
+
+			if (m_params.m_compression_level == 0)
+			{
+				// Level 0: a single selector codebook pass.
+				create_optimized_selector_codebook(0);
+
+				find_optimal_selector_clusters_for_each_block();
+
+				introduce_special_selector_clusters();
+			}
+			else
+			{
+				// With a global selector codebook only one pass is made.
+				const uint32_t num_refine_selector_steps = m_params.m_pGlobal_sel_codebook ? 1 : m_num_selector_codebook_iterations;
+				for (uint32_t refine_selector_steps = 0; refine_selector_steps < num_refine_selector_steps; refine_selector_steps++)
+				{
+					create_optimized_selector_codebook(refine_selector_steps);
+
+					find_optimal_selector_clusters_for_each_block();
+
+					introduce_special_selector_clusters();
+
+					if ((m_params.m_compression_level >= 4) || (m_params.m_tex_type == basist::cBASISTexTypeVideoFrames))
+					{
+						if (!refine_block_endpoints_given_selectors())
+							break;
+					}
+				}
+			}
+
+			optimize_selector_codebook();
+
+			if (m_params.m_debug_stats)
+				debug_printf("Total selector clusters: %u\n", (uint32_t)m_selector_cluster_block_indices.size());
+		}
+
+		finalize();
+
+		if (m_params.m_validate)
+		{
+			if (!validate_output())
+				return false;
+		}
+
+		debug_printf("basisu_frontend::compress: Done\n");
+
+		return true;
+	}
+
+	// Maps every source block onto pre-existing (global) endpoint and selector codebooks taken from
+	// m_params.m_pGlobal_codebooks instead of generating new codebooks.
+	// Steps: copy the transcoder's endpoints and selectors into the frontend's codebook arrays, then
+	// run NUM_PASSES passes that (1) pick an endpoint entry per block, (2) rebuild the endpoint cluster
+	// membership lists, (3) pick a selector entry per block and (4) rebuild m_encoded_blocks.
+	// Finally the per-selector-cluster block lists are built. Always returns true.
+	// NOTE(review): if the endpoint codebook is empty, best_index stays 0 and m_endpoint_clusters[0]
+	// is indexed below - confirm callers never pass empty global codebooks.
+	bool basisu_frontend::init_global_codebooks()
+	{
+		const basist::basisu_lowlevel_etc1s_transcoder* pTranscoder = m_params.m_pGlobal_codebooks;
+
+		const basist::basisu_lowlevel_etc1s_transcoder::endpoint_vec& endpoints = pTranscoder->get_endpoints();
+		const basist::basisu_lowlevel_etc1s_transcoder::selector_vec& selectors = pTranscoder->get_selectors();
+				
+		// Copy the global endpoints; both inten table slots get the same value and only color slot 0 is filled in.
+		m_endpoint_cluster_etc_params.resize(endpoints.size());
+		for (uint32_t i = 0; i < endpoints.size(); i++)
+		{
+			m_endpoint_cluster_etc_params[i].m_inten_table[0] = endpoints[i].m_inten5;
+			m_endpoint_cluster_etc_params[i].m_inten_table[1] = endpoints[i].m_inten5;
+
+			m_endpoint_cluster_etc_params[i].m_color_unscaled[0].set(endpoints[i].m_color5.r, endpoints[i].m_color5.g, endpoints[i].m_color5.b, 255);
+			m_endpoint_cluster_etc_params[i].m_color_used[0] = true;
+			m_endpoint_cluster_etc_params[i].m_valid = true;
+		}
+
+		// Copy the global selectors pixel by pixel.
+		m_optimized_cluster_selectors.resize(selectors.size());
+		for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++)
+		{
+			for (uint32_t y = 0; y < 4; y++)
+				for (uint32_t x = 0; x < 4; x++)
+					m_optimized_cluster_selectors[i].set_selector(x, y, selectors[i].get_selector(x, y));
+		}
+
+		m_block_endpoint_clusters_indices.resize(m_total_blocks);
+
+		m_orig_encoded_blocks.resize(m_total_blocks);
+
+		m_block_selector_cluster_index.resize(m_total_blocks);
+
+		// Disabled earlier variant of the selector search; kept for reference.
+#if 0
+		for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+		{
+			const uint32_t first_index = block_index_iter;
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job([this, first_index, last_index] {
+#endif
+
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+				{
+					const etc_block& blk = m_etc1_blocks_etc1s[block_index];
+
+					const uint32_t block_endpoint_index = m_block_endpoint_clusters_indices[block_index][0];
+
+					etc_block trial_blk;
+					trial_blk.set_block_color5_etc1s(blk.m_color_unscaled[0]);
+					trial_blk.set_flip_bit(true);
+
+					uint64_t best_err = UINT64_MAX;
+					uint32_t best_index = 0;
+
+					for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++)
+					{
+						trial_blk.set_raw_selector_bits(m_optimized_cluster_selectors[i].get_raw_selector_bits());
+
+						const uint64_t cur_err = trial_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);
+						if (cur_err < best_err)
+						{
+							best_err = cur_err;
+							best_index = i;
+							if (!cur_err)
+								break;
+						}
+
+					} // i
+
+					m_block_selector_cluster_index[block_index] = best_index;
+				}
+
+#ifndef __EMSCRIPTEN__
+				});
+#endif
+
+		}
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+
+		m_encoded_blocks.resize(m_total_blocks);
+		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+		{
+			const uint32_t endpoint_index = m_block_endpoint_clusters_indices[block_index][0];
+			const uint32_t selector_index = m_block_selector_cluster_index[block_index];
+
+			etc_block& blk = m_encoded_blocks[block_index];
+
+			blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_color_unscaled[0]);
+			blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_inten_table[0]);
+			blk.set_flip_bit(true);
+			blk.set_raw_selector_bits(m_optimized_cluster_selectors[selector_index].get_raw_selector_bits());
+		}
+#endif
+
+		// HACK HACK
+		const uint32_t NUM_PASSES = 3;
+		for (uint32_t pass = 0; pass < NUM_PASSES; pass++)
+		{
+			debug_printf("init_global_codebooks: pass %u\n", pass);
+
+			// Blocks are processed in batches of N; outside Emscripten each batch is a job on the job pool.
+			const uint32_t N = 128;
+			for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+			{
+				const uint32_t first_index = block_index_iter;
+				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
+
+#ifndef __EMSCRIPTEN__
+				m_params.m_pJob_pool->add_job([this, first_index, last_index, pass] {
+#endif
+										
+					for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+					{
+						// Pass 0 starts from the initial ETC1S encoding; later passes start from the previous pass's result.
+						const etc_block& blk = pass ? m_encoded_blocks[block_index] : m_etc1_blocks_etc1s[block_index];
+						const uint32_t blk_raw_selector_bits = blk.get_raw_selector_bits();
+
+						etc_block trial_blk(blk);
+						trial_blk.set_raw_selector_bits(blk_raw_selector_bits);
+						trial_blk.set_flip_bit(true);
+
+						uint64_t best_err = UINT64_MAX;
+						uint32_t best_index = 0;
+						etc_block best_block(trial_blk);
+												
+						// Exhaustive search over the endpoint codebook for the lowest-error entry.
+						for (uint32_t i = 0; i < m_endpoint_cluster_etc_params.size(); i++)
+						{
+							// Entries whose inten table index exceeds the block's current one are skipped.
+							// NOTE(review): if every entry is skipped, best_index remains 0 - confirm this is acceptable.
+							if (m_endpoint_cluster_etc_params[i].m_inten_table[0] > blk.get_inten_table(0))
+								continue;
+
+							trial_blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[i].m_color_unscaled[0]);
+							trial_blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[i].m_inten_table[0]);
+
+							const color_rgba* pSource_pixels = get_source_pixel_block(block_index).get_ptr();
+							uint64_t cur_err;
+							// Pass 0 lets the selectors be re-determined per candidate; later passes keep the selectors fixed.
+							if (!pass)
+								cur_err = trial_blk.determine_selectors(pSource_pixels, m_params.m_perceptual);
+							else
+								cur_err = trial_blk.evaluate_etc1_error(pSource_pixels, m_params.m_perceptual);
+
+							if (cur_err < best_err)
+							{
+								best_err = cur_err;
+								best_index = i;
+								best_block = trial_blk;
+
+								// A zero error cannot be improved on.
+								if (!cur_err)
+									break;
+							}
+						}
+
+						m_block_endpoint_clusters_indices[block_index][0] = best_index;
+						m_block_endpoint_clusters_indices[block_index][1] = best_index;
+
+						m_orig_encoded_blocks[block_index] = best_block;
+
+					} // block_index
+
+#ifndef __EMSCRIPTEN__
+					});
+#endif
+
+			}
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->wait_for_all();
+#endif
+
+			// Rebuild the endpoint cluster membership lists; each block contributes two entries (block_index*2 and block_index*2+1).
+			m_endpoint_clusters.resize(0);
+			m_endpoint_clusters.resize(endpoints.size());
+			for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+			{
+				const uint32_t endpoint_cluster_index = m_block_endpoint_clusters_indices[block_index][0];
+				m_endpoint_clusters[endpoint_cluster_index].push_back(block_index * 2);
+				m_endpoint_clusters[endpoint_cluster_index].push_back(block_index * 2 + 1);
+			}
+
+			m_block_selector_cluster_index.resize(m_total_blocks);
+
+			for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+			{
+				const uint32_t first_index = block_index_iter;
+				const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
+
+#ifndef __EMSCRIPTEN__
+				m_params.m_pJob_pool->add_job([this, first_index, last_index] {
+#endif
+
+					for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+					{
+						const uint32_t block_endpoint_index = m_block_endpoint_clusters_indices[block_index][0];
+
+						etc_block trial_blk;
+						trial_blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[block_endpoint_index].m_color_unscaled[0]);
+						trial_blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[block_endpoint_index].m_inten_table[0]);
+						trial_blk.set_flip_bit(true);
+
+						uint64_t best_err = UINT64_MAX;
+						uint32_t best_index = 0;
+
+						// Exhaustive search over the selector codebook with the chosen endpoint held fixed.
+						for (uint32_t i = 0; i < m_optimized_cluster_selectors.size(); i++)
+						{
+							trial_blk.set_raw_selector_bits(m_optimized_cluster_selectors[i].get_raw_selector_bits());
+
+							const uint64_t cur_err = trial_blk.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);
+							if (cur_err < best_err)
+							{
+								best_err = cur_err;
+								best_index = i;
+								if (!cur_err)
+									break;
+							}
+
+						} // i
+
+						m_block_selector_cluster_index[block_index] = best_index;
+					}
+
+#ifndef __EMSCRIPTEN__
+					});
+#endif
+
+			}
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->wait_for_all();
+#endif
+
+			// Re-encode every block from its chosen endpoint and selector codebook entries.
+			m_encoded_blocks.resize(m_total_blocks);
+			for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+			{
+				const uint32_t endpoint_index = m_block_endpoint_clusters_indices[block_index][0];
+				const uint32_t selector_index = m_block_selector_cluster_index[block_index];
+
+				etc_block& blk = m_encoded_blocks[block_index];
+
+				blk.set_block_color5_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_color_unscaled[0]);
+				blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[endpoint_index].m_inten_table[0]);
+				blk.set_flip_bit(true);
+				blk.set_raw_selector_bits(m_optimized_cluster_selectors[selector_index].get_raw_selector_bits());
+			}
+
+		} // pass
+
+		// Build the list of blocks belonging to each selector cluster.
+		// NOTE(review): the lists are resized but not cleared first, and the loop bound is
+		// m_etc1_blocks_etc1s.size() rather than m_total_blocks - presumably equal; confirm.
+		m_selector_cluster_block_indices.resize(selectors.size());
+		for (uint32_t block_index = 0; block_index < m_etc1_blocks_etc1s.size(); block_index++)
+			m_selector_cluster_block_indices[m_block_selector_cluster_index[block_index]].push_back(block_index);
+				
+		return true;
+	}
+
+	// Ensures the selector codebook holds a pure flat entry (all 16 selectors equal) for each of the
+	// four selector values, to avoid obvious artifacts. Does nothing when a global selector codebook is used.
+	// For every flat entry that had to be added, blocks whose original encoding used exactly those
+	// selectors are moved to the new cluster when that strictly lowers their error. Relocated blocks are
+	// then removed from the clusters that existed on entry.
+	// optimize_selector_codebook() will clean up any redundant clusters created here.
+	void basisu_frontend::introduce_special_selector_clusters()
+	{
+		debug_printf("introduce_special_selector_clusters\n");
+
+		if (m_params.m_pGlobal_sel_codebook)
+			return;
+
+		const uint32_t num_clusters_at_entry = (uint32_t)m_selector_cluster_block_indices.size();
+		uint32_t num_relocated = 0;
+
+		bool_vec was_relocated(m_total_blocks);
+
+		for (uint32_t sel = 0; sel < 4; sel++)
+		{
+			// Build a block whose 16 selectors all equal sel.
+			etc_block flat_blk;
+			clear_obj(flat_blk);
+			for (uint32_t y = 0; y < 4; y++)
+				for (uint32_t x = 0; x < 4; x++)
+					flat_blk.set_selector(x, y, sel);
+
+			// Nothing to do when the codebook already holds this flat entry.
+			bool already_present = false;
+			for (uint32_t k = 0; k < m_optimized_cluster_selectors.size(); k++)
+			{
+				if (m_optimized_cluster_selectors[k].get_raw_selector_bits() == flat_blk.get_raw_selector_bits())
+				{
+					already_present = true;
+					break;
+				}
+			}
+			if (already_present)
+				continue;
+
+			debug_printf("Introducing sel %u\n", sel);
+
+			const uint32_t new_cluster = (uint32_t)m_optimized_cluster_selectors.size();
+			m_optimized_cluster_selectors.push_back(flat_blk);
+			vector_ensure_element_is_valid(m_selector_cluster_block_indices, new_cluster);
+
+			for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+			{
+				// Only blocks whose original encoding was exactly this flat pattern are candidates.
+				if (m_orig_encoded_blocks[block_index].get_raw_selector_bits() != flat_blk.get_raw_selector_bits())
+					continue;
+
+				const uint32_t prev_cluster = m_block_selector_cluster_index[block_index];
+
+				// Reconstruct the block as currently encoded: its endpoint cluster plus its current selector cluster.
+				etc_block trial;
+				const uint32_t endpoint_cluster_index = get_subblock_endpoint_cluster_index(block_index, 0);
+				trial.set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(endpoint_cluster_index, false));
+				trial.set_inten_tables_etc1s(get_endpoint_cluster_inten_table(endpoint_cluster_index, false));
+				trial.set_raw_selector_bits(get_selector_cluster_selector_bits(prev_cluster).get_raw_selector_bits());
+				trial.set_flip_bit(true);
+
+				const uint64_t err_before = trial.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);
+
+				trial.set_raw_selector_bits(flat_blk.get_raw_selector_bits());
+
+				const uint64_t err_after = trial.evaluate_etc1_error(get_source_pixel_block(block_index).get_ptr(), m_params.m_perceptual);
+
+				// Relocate only on a strict improvement.
+				if (err_after < err_before)
+				{
+					m_block_selector_cluster_index[block_index] = new_cluster;
+					m_selector_cluster_block_indices[new_cluster].push_back(block_index);
+					was_relocated[block_index] = true;
+					num_relocated++;
+					m_encoded_blocks[block_index].set_raw_selector_bits(flat_blk.get_raw_selector_bits());
+				}
+			}
+		}
+
+		if (num_relocated)
+		{
+			debug_printf("Fixing selector codebook\n");
+
+			// Compact each pre-existing cluster's block list in place, dropping the relocated blocks.
+			for (uint32_t cluster = 0; cluster < num_clusters_at_entry; cluster++)
+			{
+				uint_vec& members = m_selector_cluster_block_indices[cluster];
+
+				uint32_t kept = 0;
+				for (uint32_t i = 0; i < members.size(); i++)
+				{
+					const uint32_t block_index = members[i];
+					if (was_relocated[block_index])
+						continue;
+					members[kept++] = block_index;
+				}
+
+				members.resize(kept);
+			}
+		}
+
+		debug_printf("Total blocks relocated to new flat selector clusters: %u\n", num_relocated);
+	}
+
+	// Rebuilds the selector codebook so that it only holds clusters some block actually references, folding
+	// together clusters whose raw selector bits are identical. Both the number of selector clusters and
+	// their ordering may change as a result.
+	void basisu_frontend::optimize_selector_codebook()
+	{
+		debug_printf("optimize_selector_codebook\n");
+
+		const uint32_t num_clusters_before = (uint32_t)m_optimized_cluster_selectors.size();
+
+		// Flag each cluster that at least one block currently points at.
+		bool_vec cluster_in_use(m_optimized_cluster_selectors.size());
+		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+			cluster_in_use[m_block_selector_cluster_index[block_index]] = true;
+
+		// remap: old cluster index -> new cluster index (-1 when the old cluster is dropped).
+		// kept_old_indices: new cluster index -> the first old cluster that produced it.
+		int_vec remap(m_optimized_cluster_selectors.size());
+		int_vec kept_old_indices;
+		uint32_t num_clusters_after = 0;
+
+		// Raw selector bits -> new cluster index, used to detect duplicate selector patterns.
+		std::unordered_map<uint32_t, uint32_t> bits_to_new_index;
+
+		for (int old_index = 0; old_index < static_cast<int>(m_optimized_cluster_selectors.size()); old_index++)
+		{
+			if (!cluster_in_use[old_index])
+			{
+				remap[old_index] = -1;
+			}
+			else
+			{
+				const uint32_t bits = m_optimized_cluster_selectors[old_index].get_raw_selector_bits();
+
+				auto inserted = bits_to_new_index.insert(std::make_pair(bits, num_clusters_after));
+				if (inserted.second)
+				{
+					// First time this selector pattern is seen: it becomes a new codebook entry.
+					remap[old_index] = num_clusters_after++;
+					kept_old_indices.push_back(old_index);
+				}
+				else
+				{
+					// Duplicate pattern: reuse the entry created earlier.
+					remap[old_index] = (inserted.first)->second;
+				}
+			}
+		}
+
+		debug_printf("Original selector clusters: %u, new cluster selectors: %u\n", num_clusters_before, num_clusters_after);
+
+		// Point every block at its cluster's new index.
+		for (uint32_t block_index = 0; block_index < m_block_selector_cluster_index.size(); block_index++)
+		{
+			const int new_index = remap[m_block_selector_cluster_index[block_index]];
+			BASISU_FRONTEND_VERIFY((new_index >= 0) && (new_index < (int)num_clusters_after));
+			m_block_selector_cluster_index[block_index] = new_index;
+		}
+
+		// Per-cluster arrays are only rebuilt when the corresponding original array is non-empty.
+		const bool have_selectors = m_optimized_cluster_selectors.size() != 0;
+		const bool have_global_cb_ids = m_optimized_cluster_selector_global_cb_ids.size() != 0;
+		const bool have_block_indices = m_selector_cluster_block_indices.size() != 0;
+		const bool have_uses_global_cb = m_selector_cluster_uses_global_cb.size() != 0;
+
+		basisu::vector<etc_block> compacted_selectors(have_selectors ? num_clusters_after : 0);
+		basist::etc1_global_selector_codebook_entry_id_vec compacted_global_cb_ids(have_global_cb_ids ? num_clusters_after : 0);
+		basisu::vector<uint_vec> compacted_block_indices(have_block_indices ? num_clusters_after : 0);
+		bool_vec compacted_uses_global_cb(have_uses_global_cb ? num_clusters_after : 0);
+
+		for (uint32_t new_index = 0; new_index < num_clusters_after; new_index++)
+		{
+			const int old_index = kept_old_indices[new_index];
+
+			if (have_selectors)
+				compacted_selectors[new_index] = m_optimized_cluster_selectors[old_index];
+
+			if (have_global_cb_ids)
+				compacted_global_cb_ids[new_index] = m_optimized_cluster_selector_global_cb_ids[old_index];
+
+			if (have_uses_global_cb)
+				compacted_uses_global_cb[new_index] = m_selector_cluster_uses_global_cb[old_index];
+		}
+
+		// The per-cluster block lists are regenerated from the remapped block->cluster table rather than copied.
+		for (uint32_t block_index = 0; block_index < m_block_selector_cluster_index.size(); block_index++)
+			compacted_block_indices[m_block_selector_cluster_index[block_index]].push_back(block_index);
+
+		m_optimized_cluster_selectors.swap(compacted_selectors);
+		m_optimized_cluster_selector_global_cb_ids.swap(compacted_global_cb_ids);
+		m_selector_cluster_block_indices.swap(compacted_block_indices);
+		m_selector_cluster_uses_global_cb.swap(compacted_uses_global_cb);
+
+		// Not strictly required - the parent cluster membership lists are remapped too for completeness.
+		for (uint32_t parent = 0; parent < m_selector_clusters_within_each_parent_cluster.size(); parent++)
+		{
+			auto &children = m_selector_clusters_within_each_parent_cluster[parent];
+			for (uint32_t child = 0; child < children.size(); child++)
+				children[child] = remap[children[child]];
+		}
+
+		debug_printf("optimize_selector_codebook: Before: %u After: %u\n", num_clusters_before, num_clusters_after);
+	}
+
+	// Encodes every source block into m_etc1_blocks_etc1s using etc1_optimizer, one color/intensity table
+	// per block with the flip bit set. Work is split into fixed-size ranges of blocks; outside of
+	// Emscripten builds each range is submitted to the job pool and the function waits for all of them.
+	void basisu_frontend::init_etc1_images()
+	{
+		debug_printf("basisu_frontend::init_etc1_images\n");
+
+		interval_timer timer;
+		timer.start();
+
+		m_etc1_blocks_etc1s.resize(m_total_blocks);
+
+		const uint32_t blocks_per_job = 4096;
+		for (uint32_t range_start = 0; range_start < m_total_blocks; range_start += blocks_per_job)
+		{
+			const uint32_t first_index = range_start;
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + blocks_per_job);
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index] {
+#endif
+
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+				{
+					const pixel_block &src = get_source_pixel_block(block_index);
+
+					etc1_optimizer opt;
+					etc1_optimizer::params opt_params;
+					etc1_optimizer::results opt_results;
+
+					// Only the lowest two levels and the maximum level override the optimizer's default quality.
+					const auto level = m_params.m_compression_level;
+					if (level == 0)
+					{
+						opt_params.m_quality = cETCQualityFast;
+					}
+					else if (level == 1)
+					{
+						opt_params.m_quality = cETCQualityMedium;
+					}
+					else if (level == BASISU_MAX_COMPRESSION_LEVEL)
+					{
+						opt_params.m_quality = cETCQualityUber;
+					}
+
+					opt_params.m_num_src_pixels = 16;
+					opt_params.m_pSrc_pixels = src.get_ptr();
+					opt_params.m_perceptual = m_params.m_perceptual;
+
+					// The optimizer writes one selector per pixel into this buffer.
+					uint8_t pixel_selectors[16];
+					opt_results.m_pSelectors = pixel_selectors;
+					opt_results.m_n = 16;
+
+					opt.init(opt_params, opt_results);
+
+					const bool succeeded = opt.compute();
+					if (!succeeded)
+						BASISU_FRONTEND_VERIFY(false);
+
+					etc_block &dst = m_etc1_blocks_etc1s[block_index];
+
+					memset(&dst, 0, sizeof(dst));
+					dst.set_block_color5_etc1s(opt_results.m_block_color_unscaled);
+					dst.set_inten_tables_etc1s(opt_results.m_block_inten_table);
+					dst.set_flip_bit(true);
+
+					// Selectors are stored row by row: pixel i sits at (x = i & 3, y = i >> 2).
+					for (uint32_t i = 0; i < 16; i++)
+						dst.set_selector(i & 3, i >> 2, pixel_selectors[i]);
+				}
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+
+		}
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+
+		debug_printf("Elapsed time: %3.3f secs\n", timer.get_elapsed_secs());
+	}
+
+	// Fills the endpoint clusterizer's training set with two entries per block. Each entry is a 6D vector made
+	// of the block's low and high colors (RGB each, scaled by 1/255) taken from m_etc1_blocks_etc1s, with
+	// weight 1. Both entries of a block receive the same vector.
+	void basisu_frontend::init_endpoint_training_vectors()
+	{
+		debug_printf("init_endpoint_training_vectors\n");
+
+		vec6F_quantizer::array_of_weighted_training_vecs &training_vecs = m_endpoint_clusterizer.get_training_vecs();
+
+		training_vecs.resize(m_total_blocks * 2);
+
+		const uint32_t blocks_per_job = 16384;
+		for (uint32_t range_start = 0; range_start < m_total_blocks; range_start += blocks_per_job)
+		{
+			const uint32_t first_index = range_start;
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + blocks_per_job);
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] {
+#endif
+
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+				{
+					const etc_block &encoded = m_etc1_blocks_etc1s[block_index];
+
+					// low_high[0] is the low color, low_high[1] the high color of subblock 0.
+					color_rgba low_high[2];
+					encoded.get_block_low_high_colors(low_high, 0);
+
+					vec6F endpoints;
+					for (uint32_t e = 0; e < 2; e++)
+					{
+						endpoints[e * 3 + 0] = low_high[e].r * (1.0f / 255.0f);
+						endpoints[e * 3 + 1] = low_high[e].g * (1.0f / 255.0f);
+						endpoints[e * 3 + 2] = low_high[e].b * (1.0f / 255.0f);
+					}
+
+					// One training vector per subblock; both subblocks share the same endpoints.
+					for (uint32_t subblock = 0; subblock < 2; subblock++)
+						training_vecs[block_index * 2 + subblock] = std::make_pair(endpoints, 1);
+
+				} // block_index
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+
+		} // range_start
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+	}
+
+	// Runs the hierarchical codebook generator over the endpoint training vectors, producing
+	// m_endpoint_clusters and (optionally) m_endpoint_parent_clusters. When hierarchical endpoint codebooks
+	// are enabled it also derives m_block_parent_endpoint_cluster and checks that every endpoint cluster
+	// lies entirely within a single parent cluster.
+	void basisu_frontend::generate_endpoint_clusters()
+	{
+		debug_printf("Begin endpoint quantization\n");
+
+		// A parent codebook is only requested when at least 256 endpoint clusters are allowed.
+		const uint32_t parent_codebook_size = (m_params.m_max_endpoint_clusters >= 256) ? BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE : 0;
+		const uint32_t max_threads = m_params.m_multithreaded ? minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;
+
+		debug_printf("Using %u threads to create codebook\n", max_threads);
+		const bool codebook_ok = generate_hierarchical_codebook_threaded(m_endpoint_clusterizer,
+			m_params.m_max_endpoint_clusters, m_use_hierarchical_endpoint_codebooks ? parent_codebook_size : 0,
+			m_endpoint_clusters,
+			m_endpoint_parent_clusters,
+			max_threads, m_params.m_pJob_pool);
+		BASISU_FRONTEND_VERIFY(codebook_ok);
+
+		if (m_use_hierarchical_endpoint_codebooks)
+		{
+			if (m_endpoint_parent_clusters.size() == 0)
+			{
+				// No parent clusters came back: build a single parent holding every training vector.
+				m_endpoint_parent_clusters.resize(0);
+				m_endpoint_parent_clusters.resize(1);
+
+				uint_vec &only_parent = m_endpoint_parent_clusters[0];
+				for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+				{
+					only_parent.push_back(block_index*2);
+					only_parent.push_back(block_index*2+1);
+				}
+			}
+
+			// Parent indices are stored as uint8_t below.
+			BASISU_ASSUME(BASISU_ENDPOINT_PARENT_CODEBOOK_SIZE <= UINT8_MAX);
+
+			// Start with every block marked 0xFF ("no parent assigned yet").
+			m_block_parent_endpoint_cluster.resize(0);
+			m_block_parent_endpoint_cluster.resize(m_total_blocks);
+			vector_set_all(m_block_parent_endpoint_cluster, 0xFF);
+
+			for (uint32_t parent_index = 0; parent_index < m_endpoint_parent_clusters.size(); parent_index++)
+			{
+				const uint_vec &members = m_endpoint_parent_clusters[parent_index];
+				for (uint32_t m = 0; m < members.size(); m++)
+				{
+					// Training vector index -> block index.
+					m_block_parent_endpoint_cluster[members[m] >> 1] = static_cast<uint8_t>(parent_index);
+				}
+			}
+
+			// Every block must have received a parent.
+			for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+			{
+				BASISU_FRONTEND_VERIFY(m_block_parent_endpoint_cluster[block_index] != 0xFF);
+			}
+
+			// All blocks of a given endpoint cluster have to share one parent cluster, otherwise something is badly wrong.
+			for (uint32_t cluster_index = 0; cluster_index < m_endpoint_clusters.size(); cluster_index++)
+			{
+				const uint_vec &members = m_endpoint_clusters[cluster_index];
+
+				uint32_t expected_parent = 0;
+				for (uint32_t m = 0; m < members.size(); m++)
+				{
+					const uint32_t parent_of_block = m_block_parent_endpoint_cluster[members[m] >> 1];
+					if (m == 0)
+					{
+						expected_parent = parent_of_block;
+					}
+					else
+					{
+						BASISU_FRONTEND_VERIFY(parent_of_block == expected_parent);
+					}
+				}
+			}
+		}
+
+		if (m_params.m_debug_stats)
+			debug_printf("Total endpoint clusters: %u, parent clusters: %u\n", (uint32_t)m_endpoint_clusters.size(), (uint32_t)m_endpoint_parent_clusters.size());
+	}
+
+	// Rebuilds m_block_endpoint_clusters_indices from m_endpoint_clusters: for each block, the endpoint
+	// cluster index owning each of its two training vectors. Verifies that both entries of every block
+	// name the same cluster.
+	void basisu_frontend::generate_block_endpoint_clusters()
+	{
+		m_block_endpoint_clusters_indices.resize(m_total_blocks);
+
+		const int num_clusters = static_cast<int>(m_endpoint_clusters.size());
+		for (int cluster = 0; cluster < num_clusters; cluster++)
+		{
+			const basisu::vector<uint32_t>& members = m_endpoint_clusters[cluster];
+
+			for (uint32_t m = 0; m < members.size(); m++)
+			{
+				// A member encodes (block_index << 1) | subblock_index.
+				const uint32_t member = members[m];
+				const uint32_t block_index = member >> 1;
+				const uint32_t subblock_index = member & 1;
+
+				m_block_endpoint_clusters_indices[block_index][subblock_index] = cluster;
+			}
+		}
+
+		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+		{
+			const uint32_t first_half = m_block_endpoint_clusters_indices[block_index][0];
+			const uint32_t second_half = m_block_endpoint_clusters_indices[block_index][1];
+			BASISU_FRONTEND_VERIFY(first_half == second_half);
+		}
+	}
+
+	// Builds m_endpoint_clusters_within_each_parent_cluster: for every parent cluster, the sorted and
+	// duplicate-free list of endpoint cluster indices used by the blocks assigned to that parent.
+	// Verifies that no parent cluster ends up with an empty list.
+	void basisu_frontend::compute_endpoint_clusters_within_each_parent_cluster()
+	{
+		generate_block_endpoint_clusters();
+
+		m_endpoint_clusters_within_each_parent_cluster.resize(0);
+		m_endpoint_clusters_within_each_parent_cluster.resize(m_endpoint_parent_clusters.size());
+
+		// Record each block's endpoint cluster under the block's parent (duplicates are removed below).
+		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+		{
+			const uint32_t parent = m_block_parent_endpoint_cluster[block_index];
+			const uint32_t child = m_block_endpoint_clusters_indices[block_index][0];
+
+			m_endpoint_clusters_within_each_parent_cluster[parent].push_back(child);
+		}
+
+		for (uint32_t parent = 0; parent < m_endpoint_clusters_within_each_parent_cluster.size(); parent++)
+		{
+			uint_vec &children = m_endpoint_clusters_within_each_parent_cluster[parent];
+
+			BASISU_FRONTEND_VERIFY(children.size());
+
+			// Sort, then drop the now-adjacent duplicates.
+			vector_sort(children);
+			children.erase(std::unique(children.begin(), children.end()), children.end());
+		}
+	}
+
+	// Rebuilds m_subblock_endpoint_quant_err_vec with one entry per training vector (subblock) of every
+	// endpoint cluster. An entry records the subblock's total error against its cluster's 4 block colors
+	// (each of the subblock's 8 pixels contributes its best match), plus the indices needed to locate the
+	// subblock again. The vector is sorted with vector_sort() before returning.
+	//
+	// Clusters are processed in fixed-size ranges; outside of Emscripten builds each range is a job-pool job.
+	// Every job collects its entries locally and appends them to the shared vector under m_lock once, so the
+	// pre-sort order of entries from different jobs is unspecified.
+	void basisu_frontend::compute_endpoint_subblock_error_vec()
+	{
+		m_subblock_endpoint_quant_err_vec.resize(0);
+
+		const uint32_t N = 512;
+		for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)
+		{
+			const uint32_t first_index = cluster_index_iter;
+			const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index] {
+#endif
+
+				// Entries produced by this range; merged into the shared vector in one locked pass below.
+				basisu::vector<subblock_endpoint_quant_err> job_results;
+
+				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+				{
+					const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+
+					assert(cluster_indices.size());
+
+					// An empty cluster contributes nothing; skip it before touching its ETC params.
+					if (!cluster_indices.size())
+						continue;
+
+					// The cluster's block colors are the same for all of its subblocks, so compute them once.
+					const endpoint_cluster_etc_params &etc_params = m_endpoint_cluster_etc_params[cluster_index];
+
+					assert(etc_params.m_valid);
+
+					// NOTE(review): the last argument is true here, whereas the otherwise similar call in
+					// generate_endpoint_codebook() passes false - confirm this difference is intended.
+					color_rgba block_colors[4];
+					etc_block::get_block_colors5(block_colors, etc_params.m_color_unscaled[0], etc_params.m_inten_table[0], true);
+
+					for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
+					{
+						// A training vector index encodes (block_index << 1) | subblock_index.
+						const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+						const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
+
+						const bool flipped = true;
+
+						const color_rgba *pSource_block_pixels = get_source_pixel_block(block_index).get_ptr();
+
+						uint64_t total_err = 0;
+
+						// Read the subblock's 8 pixels straight from the source block (no temporary copy).
+						for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)
+						{
+							const color_rgba &c = pSource_block_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];
+
+							uint64_t best_err = UINT64_MAX;
+
+							for (uint32_t s = 0; s < 4; s++)
+							{
+								uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);
+								if (err < best_err)
+									best_err = err;
+							}
+
+							total_err += best_err;
+						}
+
+						subblock_endpoint_quant_err quant_err;
+						quant_err.m_total_err = total_err;
+						quant_err.m_cluster_index = cluster_index;
+						quant_err.m_cluster_subblock_index = cluster_indices_iter;
+						quant_err.m_block_index = block_index;
+						quant_err.m_subblock_index = subblock_index;
+
+						job_results.push_back(quant_err);
+					}
+				} // cluster_index
+
+				if (job_results.size())
+				{
+					// Take the lock once per job instead of once per entry.
+					std::lock_guard<std::mutex> lock(m_lock);
+
+					for (uint32_t i = 0; i < job_results.size(); i++)
+						m_subblock_endpoint_quant_err_vec.push_back(job_results[i]);
+				}
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+
+		} // cluster_index_iter
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+
+		vector_sort(m_subblock_endpoint_quant_err_vec);
+	}
+		
+	// Creates additional endpoint clusters while fewer than m_params.m_max_endpoint_clusters exist.
+	// Candidates are popped from the back of m_subblock_endpoint_quant_err_vec, which
+	// compute_endpoint_subblock_error_vec() leaves sorted - presumably so that the highest-error subblocks
+	// are tried first (assumes an ascending sort by error; confirm against subblock_endpoint_quant_err).
+	// Each accepted candidate's block (both of its training vectors) is moved into a brand new cluster,
+	// and at most one block is taken out of any original cluster.
+	void basisu_frontend::introduce_new_endpoint_clusters()
+	{
+		debug_printf("introduce_new_endpoint_clusters\n");
+
+		generate_block_endpoint_clusters();
+
+		// Number of clusters still available under the configured maximum; nothing to do if none.
+		int num_new_endpoint_clusters = m_params.m_max_endpoint_clusters - (uint32_t)m_endpoint_clusters.size();
+		if (num_new_endpoint_clusters <= 0)
+			return;
+
+		compute_endpoint_subblock_error_vec();
+
+		const uint32_t num_orig_endpoint_clusters = (uint32_t)m_endpoint_clusters.size();
+
+		// Training vector indices that have been moved into a newly created cluster.
+		std::unordered_set<uint32_t> training_vector_was_relocated;
+
+		// Running size of each original cluster, updated as training vectors are moved out.
+		uint_vec cluster_sizes(num_orig_endpoint_clusters);
+		for (uint32_t i = 0; i < num_orig_endpoint_clusters; i++)
+			cluster_sizes[i] = (uint32_t)m_endpoint_clusters[i].size();
+
+		// Original clusters that have already given up a block; they are not split again.
+		std::unordered_set<uint32_t> ignore_cluster;
+
+		while (num_new_endpoint_clusters)
+		{
+			// Out of candidates.
+			if (m_subblock_endpoint_quant_err_vec.size() == 0)
+				break;
+
+			subblock_endpoint_quant_err subblock_to_move(m_subblock_endpoint_quant_err_vec.back());
+
+			m_subblock_endpoint_quant_err_vec.pop_back();
+
+			if (unordered_set_contains(ignore_cluster, subblock_to_move.m_cluster_index))
+				continue;
+
+			uint32_t training_vector_index = subblock_to_move.m_block_index * 2 + subblock_to_move.m_subblock_index;
+
+			// Moving a block removes 2 training vectors; never empty the source cluster.
+			if (cluster_sizes[subblock_to_move.m_cluster_index] <= 2)
+				continue;
+
+			// Skip if this training vector, or the block's other one (index ^ 1), has already been moved.
+			if (unordered_set_contains(training_vector_was_relocated, training_vector_index))
+				continue;
+
+			if (unordered_set_contains(training_vector_was_relocated, training_vector_index ^ 1))
+				continue;
+
+#if 0
+			const uint32_t block_index = subblock_to_move.m_block_index;
+			const etc_block& blk = m_etc1_blocks_etc1s[block_index];
+			uint32_t ls, hs;
+			blk.get_selector_range(ls, hs);
+			if (ls != hs)
+				continue;
+#endif
+
+			//const uint32_t new_endpoint_cluster_index = (uint32_t)m_endpoint_clusters.size();
+
+			// Append a new cluster (and a matching ETC params slot) seeded with this training vector.
+			enlarge_vector(m_endpoint_clusters, 1)->push_back(training_vector_index);
+			enlarge_vector(m_endpoint_cluster_etc_params, 1);
+
+			assert(m_endpoint_clusters.size() == m_endpoint_cluster_etc_params.size());
+
+			training_vector_was_relocated.insert(training_vector_index);
+
+			// The block's other training vector goes into the same new cluster.
+			m_endpoint_clusters.back().push_back(training_vector_index ^ 1);
+			training_vector_was_relocated.insert(training_vector_index ^ 1);
+
+			BASISU_FRONTEND_VERIFY(cluster_sizes[subblock_to_move.m_cluster_index] >= 2);
+			cluster_sizes[subblock_to_move.m_cluster_index] -= 2;
+						
+			ignore_cluster.insert(subblock_to_move.m_cluster_index);
+
+			num_new_endpoint_clusters--;
+		}
+
+		// Remove the relocated training vectors from the original clusters they came from.
+		for (uint32_t i = 0; i < num_orig_endpoint_clusters; i++)
+		{
+			uint_vec &cluster_indices = m_endpoint_clusters[i];
+
+			uint_vec new_cluster_indices;
+			for (uint32_t j = 0; j < cluster_indices.size(); j++)
+			{
+				uint32_t training_vector_index = cluster_indices[j];
+
+				if (!unordered_set_contains(training_vector_was_relocated, training_vector_index))
+					new_cluster_indices.push_back(training_vector_index);
+			}
+
+			// Only replace the list if something was actually removed; the cluster must not become empty.
+			if (cluster_indices.size() != new_cluster_indices.size())
+			{
+				BASISU_FRONTEND_VERIFY(new_cluster_indices.size() > 0);
+				cluster_indices.swap(new_cluster_indices);
+			}
+		}
+
+		// Refresh the block -> endpoint cluster table to reflect the new clusters.
+		generate_block_endpoint_clusters();
+	}
+
+	// Given each endpoint cluster, gather all the block pixels which are in that cluster and compute optimized ETC1S endpoints for them.
+	// The result for a cluster is stored in m_endpoint_cluster_etc_params[cluster_index]. When step is zero, or the
+	// cluster has no valid previous params, the new params are always stored; otherwise they are stored only if
+	// the previous params give a strictly larger total error on the cluster's pixels than the new ones.
+	// Clusters are processed in ranges of N; outside of Emscripten builds each range is a job-pool job.
+	// TODO: Don't optimize endpoint clusters which haven't changed.
+	void basisu_frontend::generate_endpoint_codebook(uint32_t step)
+	{
+		debug_printf("generate_endpoint_codebook\n");
+
+		m_endpoint_cluster_etc_params.resize(m_endpoint_clusters.size());
+
+		const uint32_t N = 128;
+		for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N)
+		{
+			const uint32_t first_index = cluster_index_iter;                                    
+			const uint32_t last_index = minimum<uint32_t>((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N);   
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, step ] {
+#endif
+
+				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+				{
+					const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+
+					BASISU_FRONTEND_VERIFY(cluster_indices.size());
+
+					// Each training vector (subblock) contributes 8 pixels.
+					const uint32_t total_pixels = (uint32_t)cluster_indices.size() * 8;
+
+					basisu::vector<color_rgba> cluster_pixels(total_pixels);
+
+					// Gather the source pixels of every subblock in this cluster into one flat array.
+					for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
+					{
+						// A training vector index encodes (block_index << 1) | subblock_index.
+						const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+						const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
+
+						const bool flipped = true;
+
+						const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+
+						for (uint32_t pixel_index = 0; pixel_index < 8; pixel_index++)
+						{
+							const color_rgba &c = pBlock_pixels[g_etc1_pixel_indices[flipped][subblock_index][pixel_index]];
+							cluster_pixels[cluster_indices_iter * 8 + pixel_index] = c;
+						}
+					}
+
+					endpoint_cluster_etc_params new_subblock_params;
+						
+					// Run the ETC1 optimizer over all of the cluster's pixels to get candidate endpoints.
+					{
+						etc1_optimizer optimizer;
+						// NOTE(review): `solutions` is not referenced anywhere else in this function.
+						etc1_solution_coordinates solutions[2];
+
+						etc1_optimizer::params cluster_optimizer_params;
+						cluster_optimizer_params.m_num_src_pixels = total_pixels;
+						cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];
+
+						cluster_optimizer_params.m_use_color4 = false;
+						cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
+
+						// Levels between 2 and the maximum (exclusive) keep the optimizer's default quality.
+						if (m_params.m_compression_level <= 1)
+							cluster_optimizer_params.m_quality = cETCQualityMedium;
+						else if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+							cluster_optimizer_params.m_quality = cETCQualityUber;
+
+						etc1_optimizer::results cluster_optimizer_results;
+
+						// Per-pixel selector output buffer required by the optimizer; only the color,
+						// intensity table and error are kept afterwards.
+						basisu::vector<uint8_t> cluster_selectors(total_pixels);
+						cluster_optimizer_results.m_n = total_pixels;
+						cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
+
+						optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
+
+						if (!optimizer.compute())
+							BASISU_FRONTEND_VERIFY(false);
+
+						new_subblock_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
+						new_subblock_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
+						new_subblock_params.m_color_error[0] = cluster_optimizer_results.m_error;
+					} 
+
+					endpoint_cluster_etc_params &prev_etc_params = m_endpoint_cluster_etc_params[cluster_index];
+
+					// Decide whether the freshly computed params should replace the stored ones.
+					bool use_new_subblock_params = false;
+					if ((!step) || (!prev_etc_params.m_valid))
+						use_new_subblock_params = true;
+					else
+					{
+						assert(prev_etc_params.m_valid);
+
+						// Error of the previously stored endpoints on the cluster's current pixels.
+						uint64_t total_prev_err = 0;
+								
+						{
+							color_rgba block_colors[4];
+
+							etc_block::get_block_colors5(block_colors, prev_etc_params.m_color_unscaled[0], prev_etc_params.m_inten_table[0], false);
+
+							uint64_t total_err = 0;
+
+							// Each pixel contributes the distance to its closest of the 4 block colors.
+							for (uint32_t i = 0; i < total_pixels; i++)
+							{
+								const color_rgba &c = cluster_pixels[i];
+
+								uint64_t best_err = UINT64_MAX;
+								//uint32_t best_index = 0;
+
+								for (uint32_t s = 0; s < 4; s++)
+								{
+									uint64_t err = color_distance(m_params.m_perceptual, c, block_colors[s], false);
+									if (err < best_err)
+									{
+										best_err = err;
+										//best_index = s;
+									}
+								}
+
+								total_err += best_err;
+							}
+
+							total_prev_err += total_err;
+						}
+
+						// See if we should update this cluster's endpoints (if the error has actually fallen)
+						if (total_prev_err > new_subblock_params.m_color_error[0])
+						{
+							use_new_subblock_params = true;
+						}
+					}
+
+					if (use_new_subblock_params)
+					{
+						new_subblock_params.m_valid = true;
+
+						prev_etc_params = new_subblock_params;
+					}
+				
+				} // cluster_index
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+
+		} // cluster_index_iter
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+	}
+
+	// Returns true when, for every block, both of its training vectors are recorded in the same endpoint
+	// cluster according to m_endpoint_clusters; returns false as soon as a block violates this.
+	bool basisu_frontend::check_etc1s_constraints() const
+	{
+		// cluster_of[block][subblock] = endpoint cluster holding that training vector.
+		basisu::vector<vec2U> cluster_of(m_total_blocks);
+
+		const int num_clusters = static_cast<int>(m_endpoint_clusters.size());
+		for (int cluster = 0; cluster < num_clusters; cluster++)
+		{
+			const basisu::vector<uint32_t>& members = m_endpoint_clusters[cluster];
+
+			for (uint32_t m = 0; m < members.size(); m++)
+			{
+				// A member encodes (block_index << 1) | subblock_index.
+				const uint32_t member = members[m];
+				const uint32_t block_index = member >> 1;
+				const uint32_t subblock_index = member & 1;
+
+				cluster_of[block_index][subblock_index] = cluster;
+			}
+		}
+
+		bool all_consistent = true;
+		for (uint32_t block_index = 0; all_consistent && (block_index < m_total_blocks); block_index++)
+		{
+			if (cluster_of[block_index][0] != cluster_of[block_index][1])
+				all_consistent = false;
+		}
+
+		return all_consistent;
+	}
+
+	// Reassigns every block to the endpoint cluster whose current ETC1S params give the lowest error on the
+	// block's 16 source pixels, then rebuilds m_endpoint_clusters from those assignments.
+	// With hierarchical endpoint codebooks enabled, only the clusters inside the block's parent cluster are
+	// considered. Candidates whose intensity table index is greater than that of the block's current
+	// cluster are never chosen. On equal error the block's current cluster is preferred.
+	// Returns the number of blocks whose cluster changed.
+	uint32_t basisu_frontend::refine_endpoint_clusterization()
+	{
+		debug_printf("refine_endpoint_clusterization\n");
+		
+		if (m_use_hierarchical_endpoint_codebooks)
+			compute_endpoint_clusters_within_each_parent_cluster();
+
+		// block_clusters[block][subblock] = endpoint cluster currently holding that training vector.
+		basisu::vector<vec2U> block_clusters(m_total_blocks);
+
+		for (int cluster_index = 0; cluster_index < static_cast<int>(m_endpoint_clusters.size()); cluster_index++)
+		{
+			const basisu::vector<uint32_t>& cluster_indices = m_endpoint_clusters[cluster_index];
+
+			for (uint32_t cluster_indices_iter = 0; cluster_indices_iter < cluster_indices.size(); cluster_indices_iter++)
+			{
+				// A training vector index encodes (block_index << 1) | subblock_index.
+				const uint32_t block_index = cluster_indices[cluster_indices_iter] >> 1;
+				const uint32_t subblock_index = cluster_indices[cluster_indices_iter] & 1;
+
+				block_clusters[block_index][subblock_index] = cluster_index;
+
+			} // cluster_indices_iter
+		}
+				
+		//----------------------------------------------------------
+				
+		// Create a new endpoint clusterization
+
+		// best_cluster_indices[block] = cluster chosen for that block by the search below.
+		uint_vec best_cluster_indices(m_total_blocks);
+
+		// Blocks are processed in ranges of N; outside of Emscripten builds each range is a job-pool job.
+		const uint32_t N = 1024;
+		for (uint32_t block_index_iter = 0; block_index_iter < m_total_blocks; block_index_iter += N)
+		{
+			const uint32_t first_index = block_index_iter;
+			const uint32_t last_index = minimum<uint32_t>(m_total_blocks, first_index + N);
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &best_cluster_indices, &block_clusters] {
+#endif
+
+				for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+				{
+					// The block's current cluster; both halves of the block must agree on it.
+					const uint32_t cluster_index = block_clusters[block_index][0];
+					BASISU_FRONTEND_VERIFY(cluster_index == block_clusters[block_index][1]);
+
+					// Despite the "subblock" naming, all 16 pixels of the block are evaluated.
+					const color_rgba *pSubblock_pixels = get_source_pixel_block(block_index).get_ptr();
+					const uint32_t num_subblock_pixels = 16;
+
+					uint64_t best_cluster_err = INT64_MAX;
+					uint32_t best_cluster_index = 0;
+
+					// Candidate list: the clusters within this block's parent cluster (when that table exists).
+					const uint32_t block_parent_endpoint_cluster_index = m_block_parent_endpoint_cluster.size() ? m_block_parent_endpoint_cluster[block_index] : 0;
+					const uint_vec *pCluster_indices = m_endpoint_clusters_within_each_parent_cluster.size() ? &m_endpoint_clusters_within_each_parent_cluster[block_parent_endpoint_cluster_index] : nullptr;
+
+					// NOTE(review): pCluster_indices is dereferenced whenever m_use_hierarchical_endpoint_codebooks
+					// is set; this relies on the per-parent table being non-empty in that mode.
+					const uint32_t total_clusters = m_use_hierarchical_endpoint_codebooks ? (uint32_t)pCluster_indices->size() : (uint32_t)m_endpoint_clusters.size();
+			
+					for (uint32_t i = 0; i < total_clusters; i++)
+					{
+						const uint32_t cluster_iter = m_use_hierarchical_endpoint_codebooks ? (*pCluster_indices)[i] : i;
+
+						color_rgba cluster_etc_base_color(m_endpoint_cluster_etc_params[cluster_iter].m_color_unscaled[0]);
+						uint32_t cluster_etc_inten = m_endpoint_cluster_etc_params[cluster_iter].m_inten_table[0];
+
+						uint64_t total_err = 0;
+
+						// The full selector range [0, 3] is always searched.
+						const uint32_t low_selector = 0;//subblock_etc_params_vec[j].m_low_selectors[0];
+						const uint32_t high_selector = 3;//subblock_etc_params_vec[j].m_high_selectors[0];
+						color_rgba subblock_colors[4];
+						// Can't assign it here - may result in too much error when selector quant occurs
+						// (candidates with a larger intensity table index than the current cluster are given
+						// the sentinel error INT64_MAX, which never beats the running best).
+						if (cluster_etc_inten > m_endpoint_cluster_etc_params[cluster_index].m_inten_table[0])
+						{
+							total_err = INT64_MAX;
+							goto skip_cluster;
+						}
+
+						etc_block::get_block_colors5(subblock_colors, cluster_etc_base_color, cluster_etc_inten);
+												
+#if 0
+						for (uint32_t p = 0; p < num_subblock_pixels; p++)
+						{
+							uint64_t best_err = UINT64_MAX;
+
+							for (uint32_t r = low_selector; r <= high_selector; r++)
+							{
+								uint64_t err = color_distance(m_params.m_perceptual, pSubblock_pixels[p], subblock_colors[r], false);
+								best_err = minimum(best_err, err);
+								if (!best_err)
+									break;
+							}
+
+							total_err += best_err;
+							if (total_err > best_cluster_err)
+								break;
+						} // p
+#else
+						// The perceptual and linear paths are identical apart from the color_distance flag /
+						// SSE kernel used. The scalar loops stop early once the running total exceeds the
+						// best error found so far.
+						// NOTE(review): if g_cpu_supports_sse41 is true while BASISU_SUPPORT_SSE is not defined,
+						// neither branch computes anything and total_err stays 0 - presumably the flag can
+						// only be set in SSE-enabled builds; confirm.
+						if (m_params.m_perceptual)
+						{
+							if (!g_cpu_supports_sse41)
+							{
+								for (uint32_t p = 0; p < num_subblock_pixels; p++)
+								{
+									uint64_t best_err = UINT64_MAX;
+
+									for (uint32_t r = low_selector; r <= high_selector; r++)
+									{
+										uint64_t err = color_distance(true, pSubblock_pixels[p], subblock_colors[r], false);
+										best_err = minimum(best_err, err);
+										if (!best_err)
+											break;
+									}
+
+									total_err += best_err;
+									if (total_err > best_cluster_err)
+										break;
+								} // p
+							}
+							else
+							{
+#if BASISU_SUPPORT_SSE
+								find_lowest_error_perceptual_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);
+#endif
+							}
+						}
+						else
+						{
+							if (!g_cpu_supports_sse41)
+							{
+								for (uint32_t p = 0; p < num_subblock_pixels; p++)
+								{
+									uint64_t best_err = UINT64_MAX;
+
+									for (uint32_t r = low_selector; r <= high_selector; r++)
+									{
+										uint64_t err = color_distance(false, pSubblock_pixels[p], subblock_colors[r], false);
+										best_err = minimum(best_err, err);
+										if (!best_err)
+											break;
+									}
+
+									total_err += best_err;
+									if (total_err > best_cluster_err)
+										break;
+								} // p
+							}
+							else
+							{
+#if BASISU_SUPPORT_SSE
+								find_lowest_error_linear_rgb_4_N_sse41((int64_t*)&total_err, subblock_colors, pSubblock_pixels, num_subblock_pixels, best_cluster_err);
+#endif
+							}
+						}
+#endif
+
+					skip_cluster:
+						// Accept a strictly better candidate, or the block's current cluster on a tie.
+						if ((total_err < best_cluster_err) ||
+							((cluster_iter == cluster_index) && (total_err == best_cluster_err)))
+						{
+							best_cluster_err = total_err;
+							best_cluster_index = cluster_iter;
+					
+							// A zero-error match cannot be improved on.
+							if (!best_cluster_err)
+								break;
+						}
+					} // i
+						
+					best_cluster_indices[block_index] = best_cluster_index;
+
+				} // block_index
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+						
+		} // block_index_iter
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+
+		// Rebuild the cluster membership lists from the per-block choices. The cluster count is unchanged,
+		// so clusters that no block selected end up empty.
+		basisu::vector<typename basisu::vector<uint32_t> > optimized_endpoint_clusters(m_endpoint_clusters.size());
+		uint32_t total_subblocks_reassigned = 0;
+
+		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+		{
+			const uint32_t training_vector_index = block_index * 2 + 0;
+
+			const uint32_t orig_cluster_index = block_clusters[block_index][0];
+			const uint32_t best_cluster_index = best_cluster_indices[block_index];
+
+			// Both training vectors of the block always go to the same cluster.
+			optimized_endpoint_clusters[best_cluster_index].push_back(training_vector_index);
+			optimized_endpoint_clusters[best_cluster_index].push_back(training_vector_index + 1);
+
+			// Counted once per block, despite the variable's name.
+			if (best_cluster_index != orig_cluster_index)
+			{
+				total_subblocks_reassigned++;
+			}
+		}
+
+		debug_printf("total_subblocks_reassigned: %u\n", total_subblocks_reassigned);
+
+		m_endpoint_clusters = optimized_endpoint_clusters;
+
+		return total_subblocks_reassigned;
+	}
+
+	// Reorders the endpoint clusters with indirect_sort() keyed on their ETC params, then rebuilds
+	// the cluster list: clusters with no members are dropped, and every run of adjacent clusters
+	// whose params compare equal is folded into the first cluster of that run.
+	void basisu_frontend::eliminate_redundant_or_empty_endpoint_clusters()
+	{
+		debug_printf("eliminate_redundant_or_empty_endpoint_clusters\n");
+
+		// Pass 1: permute the clusters (and their params) into the order produced by indirect_sort().
+		const uint32_t total_clusters = (uint32_t)m_endpoint_clusters.size();
+
+		uint_vec sort_order(total_clusters);
+		for (uint32_t slot = 0; slot < total_clusters; slot++)
+			sort_order[slot] = slot;
+
+		indirect_sort(total_clusters, &sort_order[0], &m_endpoint_cluster_etc_params[0]);
+
+		basisu::vector<basisu::vector<uint32_t> > rebuilt_clusters(total_clusters);
+		basisu::vector<endpoint_cluster_etc_params> rebuilt_params(total_clusters);
+
+		for (uint32_t dst = 0; dst < total_clusters; dst++)
+		{
+			const uint32_t src = sort_order[dst];
+
+			rebuilt_clusters[dst] = m_endpoint_clusters[src];
+			rebuilt_params[dst] = m_endpoint_cluster_etc_params[src];
+		}
+
+		rebuilt_clusters.swap(m_endpoint_clusters);
+		rebuilt_params.swap(m_endpoint_cluster_etc_params);
+
+		// Pass 2: walk the sorted clusters, skipping empty ones and merging runs with equal params.
+		// The scratch vectors are reused as the output lists.
+		rebuilt_clusters.resize(0);
+		rebuilt_params.resize(0);
+
+		const int num_sorted = (int)m_endpoint_clusters.size();
+
+		int run_start = 0;
+		while (run_start < num_sorted)
+		{
+			if (m_endpoint_clusters[run_start].size() == 0)
+			{
+				run_start++;
+				continue;
+			}
+
+			// Find the end (exclusive) of the run of clusters sharing run_start's params.
+			int run_end = run_start + 1;
+			while ((run_end < num_sorted) && (m_endpoint_cluster_etc_params[run_start] == m_endpoint_cluster_etc_params[run_end]))
+				run_end++;
+
+			rebuilt_clusters.push_back(m_endpoint_clusters[run_start]);
+			rebuilt_params.push_back(m_endpoint_cluster_etc_params[run_start]);
+
+			// Fold the members of the remaining clusters in the run into the one just emitted.
+			for (int member = run_start + 1; member < run_end; member++)
+				append_vector(rebuilt_clusters.back(), m_endpoint_clusters[member]);
+
+			run_start = run_end;
+		}
+
+		// Only adopt the rebuilt lists when something was actually removed or merged.
+		if (m_endpoint_clusters.size() == rebuilt_clusters.size())
+			return;
+
+		if (m_params.m_debug_stats)
+			debug_printf("Eliminated %u redundant or empty clusters\n", (uint32_t)(m_endpoint_clusters.size() - rebuilt_clusters.size()));
+
+		m_endpoint_clusters.swap(rebuilt_clusters);
+
+		m_endpoint_cluster_etc_params.swap(rebuilt_params);
+	}
+
+	// Fills m_encoded_blocks: each block gets the unscaled color and intensity table of its endpoint
+	// cluster, the flip bit set, and selectors chosen by determine_selectors() against the source
+	// pixels. The finished blocks are then copied into m_orig_encoded_blocks.
+	// Outside of Emscripten builds the work is handed to the job pool in batches of cBlocksPerJob.
+	void basisu_frontend::create_initial_packed_texture()
+	{
+		debug_printf("create_initial_packed_texture\n");
+
+		const uint32_t cBlocksPerJob = 4096;
+		for (uint32_t job_start = 0; job_start < m_total_blocks; job_start += cBlocksPerJob)
+		{
+			const uint32_t job_end = minimum<uint32_t>(m_total_blocks, job_start + cBlocksPerJob);
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job( [this, job_start, job_end] {
+#endif
+
+				for (uint32_t block_index = job_start; block_index < job_end; block_index++)
+				{
+					// Both subblocks of a block are expected to share a single endpoint cluster here.
+					const uint32_t cluster0 = m_block_endpoint_clusters_indices[block_index][0];
+					const uint32_t cluster1 = m_block_endpoint_clusters_indices[block_index][1];
+					BASISU_FRONTEND_VERIFY(cluster0 == cluster1);
+
+					const endpoint_cluster_etc_params &params0 = m_endpoint_cluster_etc_params[cluster0];
+					const endpoint_cluster_etc_params &params1 = m_endpoint_cluster_etc_params[cluster1];
+
+					etc_block &blk = m_encoded_blocks[block_index];
+
+					blk.set_block_color5(params0.m_color_unscaled[0], params1.m_color_unscaled[0]);
+					blk.set_flip_bit(true);
+
+					blk.set_inten_table(0, params0.m_inten_table[0]);
+					blk.set_inten_table(1, params1.m_inten_table[0]);
+
+					const color_rgba *pSource_pixels = get_source_pixel_block(block_index).get_ptr();
+					blk.determine_selectors(pSource_pixels, m_params.m_perceptual);
+
+				} // block_index
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+
+		} // job_start
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+
+		// Keep a copy of the freshly packed blocks.
+		m_orig_encoded_blocks = m_encoded_blocks;
+	}
+
+	// Rebuilds m_selector_clusters_within_each_parent_cluster: for every parent selector cluster,
+	// the sorted, de-duplicated list of selector cluster indices used by the blocks assigned to it.
+	void basisu_frontend::compute_selector_clusters_within_each_parent_cluster()
+	{
+		// Invert m_selector_cluster_block_indices into a block -> selector cluster lookup.
+		uint_vec selector_cluster_of_block(m_total_blocks);
+
+		for (int cluster_index = 0; cluster_index < static_cast<int>(m_selector_cluster_block_indices.size()); cluster_index++)
+		{
+			const basisu::vector<uint32_t>& members = m_selector_cluster_block_indices[cluster_index];
+
+			for (uint32_t m = 0; m < members.size(); m++)
+				selector_cluster_of_block[members[m]] = cluster_index;
+
+		} // cluster_index
+
+		// Start from one empty list per parent cluster.
+		m_selector_clusters_within_each_parent_cluster.resize(0);
+		m_selector_clusters_within_each_parent_cluster.resize(m_selector_parent_cluster_block_indices.size());
+
+		// Record, per parent cluster, the selector cluster of each block that belongs to it.
+		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+		{
+			const uint32_t parent_index = m_block_parent_selector_cluster[block_index];
+
+			m_selector_clusters_within_each_parent_cluster[parent_index].push_back(selector_cluster_of_block[block_index]);
+		}
+
+		// Sort each list and strip duplicates; every parent cluster must end up with at least one entry.
+		for (uint32_t parent_index = 0; parent_index < m_selector_clusters_within_each_parent_cluster.size(); parent_index++)
+		{
+			uint_vec &children = m_selector_clusters_within_each_parent_cluster[parent_index];
+
+			BASISU_FRONTEND_VERIFY(children.size());
+
+			vector_sort(children);
+
+			children.erase(std::unique(children.begin(), children.end()), children.end());
+		}
+	}
+
+	// Clusters the blocks by their 4x4 selector patterns. Each block contributes a 16-float training
+	// vector (its selectors) weighted by the color distance between the low/high colors of one of
+	// its subblocks. generate_hierarchical_codebook_threaded() then fills
+	// m_selector_cluster_block_indices and m_selector_parent_cluster_block_indices. When hierarchical
+	// selector codebooks are enabled, the block -> parent cluster table is built and cross-checked.
+	void basisu_frontend::generate_selector_clusters()
+	{
+		debug_printf("generate_selector_clusters\n");
+
+		using selector_vec = vec<16, float>;
+		using selector_clusterizer = tree_vector_quant<selector_vec>;
+
+		selector_clusterizer::array_of_weighted_training_vecs weighted_vecs(m_total_blocks);
+
+		const uint32_t cBlocksPerJob = 4096;
+		for (uint32_t job_start = 0; job_start < m_total_blocks; job_start += cBlocksPerJob)
+		{
+			const uint32_t job_end = minimum<uint32_t>(m_total_blocks, job_start + cBlocksPerJob);
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job( [this, job_start, job_end, &weighted_vecs] {
+#endif
+
+				for (uint32_t block_index = job_start; block_index < job_end; block_index++)
+				{
+					const etc_block &blk = m_encoded_blocks[block_index];
+
+					// Flatten the 4x4 selectors in row-major order.
+					selector_vec sels;
+					for (uint32_t i = 0; i < 16; i++)
+						sels[i] = static_cast<float>(blk.get_selector(i & 3, i >> 2));
+
+					// Subblock 0 is used when its intensity table index is the larger one, otherwise subblock 1.
+					const uint32_t subblock_index = (blk.get_inten_table(0) > blk.get_inten_table(1)) ? 0 : 1;
+
+					color_rgba low_high[2];
+					blk.get_block_low_high_colors(low_high, subblock_index);
+
+					const uint32_t dist = color_distance(m_params.m_perceptual, low_high[0], low_high[1], false);
+
+					// Map the color distance to a training weight in [1, cMaxWeight].
+					const uint32_t cColorDistToWeight = 300;
+					const uint32_t cMaxWeight = 4096;
+					const uint32_t weight = clamp<uint32_t>(dist / cColorDistToWeight, 1, cMaxWeight);
+
+					weighted_vecs[block_index].first = sels;
+					weighted_vecs[block_index].second = weight;
+
+				} // block_index
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+
+		} // job_start
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+
+		selector_clusterizer clusterizer;
+		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+			clusterizer.add_training_vec(weighted_vecs[block_index].first, weighted_vecs[block_index].second);
+
+		// A parent codebook is only requested when at least 256 selector clusters are allowed.
+		const int selector_parent_codebook_size = (m_params.m_compression_level <= 1) ? BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 : BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT;
+		const uint32_t parent_codebook_size = (m_params.m_max_selector_clusters >= 256) ? selector_parent_codebook_size : 0;
+		debug_printf("Using selector parent codebook size %u\n", parent_codebook_size);
+
+		const uint32_t max_threads = m_params.m_multithreaded ? minimum<int>(std::thread::hardware_concurrency(), cMaxCodebookCreationThreads) : 0;
+
+		const bool codebook_ok = generate_hierarchical_codebook_threaded(clusterizer,
+			m_params.m_max_selector_clusters, m_use_hierarchical_selector_codebooks ? parent_codebook_size : 0,
+			m_selector_cluster_block_indices,
+			m_selector_parent_cluster_block_indices,
+			max_threads, m_params.m_pJob_pool);
+		BASISU_FRONTEND_VERIFY(codebook_ok);
+
+		if (m_use_hierarchical_selector_codebooks)
+		{
+			// No parent clusters came back: fall back to a single parent holding every block.
+			if (m_selector_parent_cluster_block_indices.size() == 0)
+			{
+				m_selector_parent_cluster_block_indices.resize(0);
+				m_selector_parent_cluster_block_indices.resize(1);
+				for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+					m_selector_parent_cluster_block_indices[0].push_back(block_index);
+			}
+
+			// Parent indices are stored as uint8_t with 0xFF reserved as the "unassigned" marker.
+			BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_01 <= UINT8_MAX);
+			BASISU_ASSUME(BASISU_SELECTOR_PARENT_CODEBOOK_SIZE_COMP_LEVEL_DEFAULT <= UINT8_MAX);
+
+			m_block_parent_selector_cluster.resize(0);
+			m_block_parent_selector_cluster.resize(m_total_blocks);
+			vector_set_all(m_block_parent_selector_cluster, 0xFF);
+
+			for (uint32_t parent_index = 0; parent_index < m_selector_parent_cluster_block_indices.size(); parent_index++)
+			{
+				const uint_vec &parent_blocks = m_selector_parent_cluster_block_indices[parent_index];
+				for (uint32_t k = 0; k < parent_blocks.size(); k++)
+					m_block_parent_selector_cluster[parent_blocks[k]] = static_cast<uint8_t>(parent_index);
+			}
+
+			// Every block must have been claimed by some parent cluster.
+			for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+			{
+				BASISU_FRONTEND_VERIFY(m_block_parent_selector_cluster[block_index] != 0xFF);
+			}
+
+			// All blocks of a selector cluster must live in the same parent cluster, or something is very wrong.
+			for (uint32_t cluster_index = 0; cluster_index < m_selector_cluster_block_indices.size(); cluster_index++)
+			{
+				const uint_vec &cluster_blocks = m_selector_cluster_block_indices[cluster_index];
+				if (cluster_blocks.size() == 0)
+					continue;
+
+				const uint32_t expected_parent = m_block_parent_selector_cluster[cluster_blocks[0]];
+				for (uint32_t k = 1; k < cluster_blocks.size(); k++)
+				{
+					BASISU_FRONTEND_VERIFY(m_block_parent_selector_cluster[cluster_blocks[k]] == expected_parent);
+				}
+			}
+		}
+
+		debug_printf("Total selector clusters: %u, total parent selector clusters: %u\n", (uint32_t)m_selector_cluster_block_indices.size(), (uint32_t)m_selector_parent_cluster_block_indices.size());
+	}
+
+	// Computes m_optimized_cluster_selectors: one 4x4 selector pattern per selector cluster.
+	//
+	// Two paths exist:
+	//  - A global selector codebook is supplied and hybrid mode is off: each non-empty cluster takes
+	//    the global codebook entry returned by etc1_global_selector_codebook_find_best_entry().
+	//  - Otherwise: for each of the 16 texels, the selector (0..3) with the lowest summed
+	//    color_distance() over all blocks in the cluster is chosen, using each block's current
+	//    endpoint colors. In hybrid mode the global codebook entry replaces that result when its error
+	//    is within m_hybrid_codebook_quality_thresh times the locally optimized error.
+	//
+	// Clusters are processed in batches of N on the job pool (inline on Emscripten builds).
+	// Empty clusters are skipped and keep whatever selectors they already had.
+	//
+	// iter is only used to name the optional debug image written when m_params.m_debug_images is set.
+	void basisu_frontend::create_optimized_selector_codebook(uint32_t iter)
+	{
+		debug_printf("create_optimized_selector_codebook\n");
+
+		const uint32_t total_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size();
+
+		debug_printf("Total selector clusters (from m_selector_cluster_block_indices.size()): %u\n", (uint32_t)m_selector_cluster_block_indices.size());
+
+		m_optimized_cluster_selectors.resize(total_selector_clusters);
+
+		if ((m_params.m_pGlobal_sel_codebook) && (!m_params.m_use_hybrid_selector_codebooks))
+		{
+			// Shared progress counter; only touched while holding m_lock.
+			uint32_t total_clusters_processed = 0;
+
+			m_optimized_cluster_selector_global_cb_ids.resize(total_selector_clusters);
+
+			const uint32_t N = 256;
+			for (uint32_t cluster_index_iter = 0; cluster_index_iter < total_selector_clusters; cluster_index_iter += N)
+			{
+				const uint32_t first_index = cluster_index_iter;                                    
+				const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);   
+
+#ifndef __EMSCRIPTEN__
+				m_params.m_pJob_pool->add_job( [this, first_index, last_index, &total_clusters_processed, &total_selector_clusters] {
+#endif
+					
+					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+					{
+						const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[cluster_index];
+
+						if (!cluster_block_indices.size())
+							continue;
+
+						// Gather the encoded blocks and source pixels of every block in this cluster.
+						etc_block_vec etc_blocks;
+						pixel_block_vec pixel_blocks;
+
+						for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
+						{
+							const uint32_t block_index = cluster_block_indices[cluster_block_index];
+
+							etc_blocks.push_back(m_encoded_blocks[block_index]);
+
+							pixel_blocks.push_back(get_source_pixel_block(block_index));
+						}
+
+						uint32_t palette_index;
+						basist::etc1_global_palette_entry_modifier palette_modifier;
+
+		#if 0
+						m_params.m_pGlobal_sel_codebook->find_best_entry(etc_blocks.size(), pixel_blocks.get_ptr(), etc_blocks.get_ptr(),
+							palette_index, palette_modifier,
+							m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
+		#else
+						etc1_global_selector_codebook_find_best_entry(*m_params.m_pGlobal_sel_codebook,
+							(uint32_t)etc_blocks.size(), &pixel_blocks[0], &etc_blocks[0],
+							palette_index, palette_modifier,
+							m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
+		#endif
+
+						m_optimized_cluster_selector_global_cb_ids[cluster_index].set(palette_index, palette_modifier);
+
+						basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(palette_index, palette_modifier));
+
+						// Copy the chosen global entry's selectors into the cluster's pattern.
+						for (uint32_t y = 0; y < 4; y++)
+							for (uint32_t x = 0; x < 4; x++)
+								m_optimized_cluster_selectors[cluster_index].set_selector(x, y, pal_entry(x, y));
+
+						{
+							std::lock_guard<std::mutex> lock(m_lock);
+
+							total_clusters_processed++;
+							if ((total_clusters_processed % 63) == 0)
+								debug_printf("Global selector palette optimization: %3.1f%% complete\n", total_clusters_processed * 100.0f / total_selector_clusters);
+						}
+
+					} // cluster_index
+
+#ifndef __EMSCRIPTEN__
+				} );
+#endif
+
+			} // cluster_index_iter
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->wait_for_all();
+#endif
+		}
+		else
+		{
+			const bool uses_hybrid_sel_codebook = ((m_params.m_pGlobal_sel_codebook) && (m_params.m_use_hybrid_selector_codebooks));
+			if (uses_hybrid_sel_codebook)
+			{
+				m_selector_cluster_uses_global_cb.resize(total_selector_clusters);
+				m_optimized_cluster_selector_global_cb_ids.resize(total_selector_clusters);
+			}
+
+			// Shared progress counter; only touched while holding m_lock.
+			uint32_t total_clusters_processed = 0;
+
+			// For each selector codebook entry, and for each of the 4x4 selectors, determine which selector minimizes the error across all the blocks that use that quantized selector.
+
+			const uint32_t N = 256;
+			for (uint32_t cluster_index_iter = 0; cluster_index_iter < total_selector_clusters; cluster_index_iter += N)
+			{
+				const uint32_t first_index = cluster_index_iter;                                    
+				const uint32_t last_index = minimum<uint32_t>((uint32_t)total_selector_clusters, cluster_index_iter + N);   
+
+#ifndef __EMSCRIPTEN__			
+				m_params.m_pJob_pool->add_job( [this, first_index, last_index, &uses_hybrid_sel_codebook, &total_clusters_processed, &total_selector_clusters] {
+#endif
+					
+					for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+					{
+						const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[cluster_index];
+
+						if (!cluster_block_indices.size())
+							continue;
+
+						// Sum of the per-texel best errors; compared against the global codebook error in hybrid mode.
+						uint64_t overall_best_err = 0;
+
+						for (uint32_t y = 0; y < 4; y++)
+						{
+							for (uint32_t x = 0; x < 4; x++)
+							{
+								uint64_t best_err = UINT64_MAX;
+								uint32_t best_s = 0;
+
+								for (uint32_t s = 0; s < 4; s++)
+								{
+									// Accumulate in 64 bits: the sum runs over every block in the cluster, and for the
+									// first candidate the early-out below cannot trigger (best_err is still UINT64_MAX),
+									// so a 32-bit accumulator could wrap on large clusters and select the wrong selector.
+									uint64_t total_err = 0;
+
+									for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
+									{
+										const uint32_t block_index = cluster_block_indices[cluster_block_index];
+
+										const etc_block &blk = m_encoded_blocks[block_index];
+
+										const color_rgba &orig_color = get_source_pixel_block(block_index)(x, y);
+
+										color_rgba block_color;
+										blk.get_block_color(block_color, blk.get_subblock_index(x, y), s);
+										total_err += color_distance(m_params.m_perceptual, block_color, orig_color, false);
+
+										// This candidate is already worse than the best one found so far.
+										if (total_err > best_err)
+											break;
+
+									} // block_index
+
+									if (total_err < best_err)
+									{
+										best_err = total_err;
+										best_s = s;
+										if (!best_err)
+											break;
+									}
+
+								} // s
+
+								m_optimized_cluster_selectors[cluster_index].set_selector(x, y, best_s);
+
+								overall_best_err += best_err;
+
+							} // x
+						} // y
+
+						if (uses_hybrid_sel_codebook)
+						{
+							etc_block_vec etc_blocks;
+							pixel_block_vec pixel_blocks;
+
+							for (uint32_t cluster_block_index = 0; cluster_block_index < cluster_block_indices.size(); cluster_block_index++)
+							{
+								const uint32_t block_index = cluster_block_indices[cluster_block_index];
+
+								etc_blocks.push_back(m_encoded_blocks[block_index]);
+
+								pixel_blocks.push_back(get_source_pixel_block(block_index));
+							}
+
+							uint32_t palette_index;
+							basist::etc1_global_palette_entry_modifier palette_modifier;
+
+		#if 0
+							uint64_t best_global_cb_err = m_params.m_pGlobal_sel_codebook->find_best_entry(etc_blocks.size(), pixel_blocks.get_ptr(), etc_blocks.get_ptr(),
+								palette_index, palette_modifier,
+								m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
+		#else
+							uint64_t best_global_cb_err = etc1_global_selector_codebook_find_best_entry(*m_params.m_pGlobal_sel_codebook, (uint32_t)etc_blocks.size(), &pixel_blocks[0], &etc_blocks[0],
+								palette_index, palette_modifier,
+								m_params.m_perceptual, 1 << m_params.m_num_global_sel_codebook_pal_bits, 1 << m_params.m_num_global_sel_codebook_mod_bits);
+		#endif
+
+							// Prefer the global codebook entry when it is close enough to the locally optimized pattern.
+							if (best_global_cb_err <= overall_best_err * m_params.m_hybrid_codebook_quality_thresh)
+							{
+								m_selector_cluster_uses_global_cb[cluster_index] = true;
+
+								m_optimized_cluster_selector_global_cb_ids[cluster_index].set(palette_index, palette_modifier);
+
+								basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(palette_index, palette_modifier));
+
+								for (uint32_t y = 0; y < 4; y++)
+									for (uint32_t x = 0; x < 4; x++)
+										m_optimized_cluster_selectors[cluster_index].set_selector(x, y, pal_entry(x, y));
+							}
+							else
+							{
+								m_optimized_cluster_selector_global_cb_ids[cluster_index].set(0, basist::etc1_global_palette_entry_modifier(0));
+
+								m_selector_cluster_uses_global_cb[cluster_index] = false;
+							}
+						}
+
+						if (uses_hybrid_sel_codebook)
+						{
+							std::lock_guard<std::mutex> lock(m_lock);
+		
+							total_clusters_processed++;
+							if ((total_clusters_processed % 63) == 0)
+								debug_printf("Global selector palette optimization: %3.1f%% complete\n", total_clusters_processed * 100.0f / total_selector_clusters);
+						}
+
+					} // cluster_index
+
+#ifndef __EMSCRIPTEN__
+				} );
+#endif
+
+			} // cluster_index_iter
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->wait_for_all();
+#endif
+
+		} // if (m_params.m_pGlobal_sel_codebook)
+				
+		if (m_params.m_debug_images)
+		{
+			// One row (5 pixels tall) per selector cluster: the optimized pattern on the left, followed by
+			// the original selectors of each member block at 5 pixel intervals.
+			uint32_t max_selector_cluster_size = 0;
+
+			for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++)
+				max_selector_cluster_size = maximum<uint32_t>(max_selector_cluster_size, (uint32_t)m_selector_cluster_block_indices[i].size());
+
+			// Skip the visualization entirely when the image would be too wide.
+			if ((max_selector_cluster_size * 5) < 32768)
+			{
+				const uint32_t x_spacer_len = 16;
+				image selector_cluster_vis(x_spacer_len + max_selector_cluster_size * 5, (uint32_t)m_selector_cluster_block_indices.size() * 5);
+
+				for (uint32_t selector_cluster_index = 0; selector_cluster_index < m_selector_cluster_block_indices.size(); selector_cluster_index++)
+				{
+					const basisu::vector<uint32_t> &cluster_block_indices = m_selector_cluster_block_indices[selector_cluster_index];
+
+					// Selector values 0..3 are scaled to 0..255 before being drawn.
+					for (uint32_t y = 0; y < 4; y++)
+						for (uint32_t x = 0; x < 4; x++)
+							selector_cluster_vis.set_clipped(x_spacer_len + x - 12, selector_cluster_index * 5 + y, color_rgba((m_optimized_cluster_selectors[selector_cluster_index].get_selector(x, y) * 255) / 3));
+
+					for (uint32_t i = 0; i < cluster_block_indices.size(); i++)
+					{
+						uint32_t block_index = cluster_block_indices[i];
+
+						const etc_block &blk = m_orig_encoded_blocks[block_index];
+						
+						for (uint32_t y = 0; y < 4; y++)
+							for (uint32_t x = 0; x < 4; x++)
+								selector_cluster_vis.set_clipped(x_spacer_len + x + 5 * i, selector_cluster_index * 5 + y, color_rgba((blk.get_selector(x, y) * 255) / 3));
+					}
+				}
+
+				char buf[256];
+				snprintf(buf, sizeof(buf), "selector_cluster_vis_%u.png", iter);
+				save_png(buf, selector_cluster_vis);
+			}
+		}
+	}
+
+	// Assigns every block to a selector cluster and rebuilds m_selector_cluster_block_indices.
+	//
+	// At compression level 0 the blocks simply stay in the clusters they are already listed in.
+	// Otherwise, each block is matched against the candidate clusters' optimized selector patterns
+	// (only those under the block's parent cluster when hierarchical codebooks are in use) and takes
+	// the pattern with the lowest summed per-pixel error; the block's selector bits are overwritten
+	// with that pattern. Some clusters may end up with no blocks.
+	void basisu_frontend::find_optimal_selector_clusters_for_each_block()
+	{
+		debug_printf("find_optimal_selector_clusters_for_each_block\n");
+
+		// Sanity checks: one optimized pattern per cluster, and every per-parent entry refers to a valid pattern.
+		BASISU_FRONTEND_VERIFY(m_selector_cluster_block_indices.size() == m_optimized_cluster_selectors.size());
+		for (uint32_t parent = 0; parent < m_selector_clusters_within_each_parent_cluster.size(); parent++)
+		{
+			const uint_vec &children = m_selector_clusters_within_each_parent_cluster[parent];
+			for (uint32_t c = 0; c < children.size(); c++)
+			{
+				BASISU_FRONTEND_VERIFY(children[c] < m_optimized_cluster_selectors.size());
+			}
+		}
+
+		m_block_selector_cluster_index.resize(m_total_blocks);
+
+		if (m_params.m_compression_level == 0)
+		{
+			// Don't do anything, just leave the blocks in their original selector clusters.
+			for (uint32_t cluster = 0; cluster < m_selector_cluster_block_indices.size(); cluster++)
+			{
+				const uint_vec &members = m_selector_cluster_block_indices[cluster];
+				for (uint32_t m = 0; m < members.size(); m++)
+					m_block_selector_cluster_index[members[m]] = cluster;
+			}
+		}
+		else
+		{
+			// Note that this method may leave some empty clusters (i.e. arrays with no block indices), including at the end.
+			basisu::vector< basisu::vector<uint32_t> > reassigned_blocks(m_optimized_cluster_selectors.size());
+
+			// Flatten every cluster's 4x4 selectors into 16 consecutive bytes (row-major) for fast lookup.
+			basisu::vector<uint8_t> unpacked_selectors(16 * m_optimized_cluster_selectors.size());
+			for (uint32_t cluster = 0; cluster < m_optimized_cluster_selectors.size(); cluster++)
+			{
+				for (uint32_t i = 0; i < 16; i++)
+					unpacked_selectors[cluster * 16 + i] = (uint8_t)m_optimized_cluster_selectors[cluster].get_selector(i & 3, i >> 2);
+			}
+
+			// For each block: Determine which quantized selectors best encode that block, given its quantized endpoints.
+			const uint32_t cBlocksPerJob = 1024;
+			for (uint32_t job_start = 0; job_start < m_total_blocks; job_start += cBlocksPerJob)
+			{
+				const uint32_t job_end = minimum<uint32_t>(m_total_blocks, job_start + cBlocksPerJob);
+
+#ifndef __EMSCRIPTEN__
+				m_params.m_pJob_pool->add_job( [this, job_start, job_end, &reassigned_blocks, &unpacked_selectors] {
+#endif
+
+					for (uint32_t block_index = job_start; block_index < job_end; block_index++)
+					{
+						const color_rgba* pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+
+						etc_block& blk = m_encoded_blocks[block_index];
+
+						color_rgba candidate_colors[4];
+						blk.get_block_colors(candidate_colors, 0);
+
+						// Error of pixel i when encoded with selector sel, indexed as [sel][i].
+						uint32_t pixel_errors[4][16];
+						for (int sel = 0; sel < 4; ++sel)
+						{
+							for (int i = 0; i < 16; ++i)
+								pixel_errors[sel][i] = color_distance(m_params.m_perceptual, pBlock_pixels[i], candidate_colors[sel], false);
+						}
+
+						uint64_t lowest_err = INT64_MAX;
+						uint32_t winner = 0;
+
+						uint32_t parent_index = 0;
+						if (m_block_parent_selector_cluster.size())
+							parent_index = m_block_parent_selector_cluster[block_index];
+
+						const uint_vec *pCandidates = nullptr;
+						if (m_selector_clusters_within_each_parent_cluster.size())
+							pCandidates = &m_selector_clusters_within_each_parent_cluster[parent_index];
+
+						const uint32_t num_candidates = m_use_hierarchical_selector_codebooks ? (uint32_t)pCandidates->size() : (uint32_t)m_selector_cluster_block_indices.size();
+
+						// The table above was built with the configured metric, so one search loop covers
+						// both the perceptual and the linear case.
+						for (uint32_t iter = 0; iter < num_candidates; iter++)
+						{
+							const uint32_t candidate = m_use_hierarchical_selector_codebooks ? (*pCandidates)[iter] : iter;
+
+							uint64_t candidate_err = 0;
+							bool too_costly = false;
+
+							for (int i = 0; i < 16; i++)
+							{
+								const uint32_t sel = unpacked_selectors[candidate * 16 + i];
+
+								candidate_err += pixel_errors[sel][i];
+								if (candidate_err > lowest_err)
+								{
+									// Already worse than the best candidate so far; abandon this one.
+									too_costly = true;
+									break;
+								}
+							}
+
+							if (too_costly || (candidate_err >= lowest_err))
+								continue;
+
+							lowest_err = candidate_err;
+							winner = candidate;
+
+							// A perfect match cannot be improved upon.
+							if (!lowest_err)
+								break;
+
+						} // iter
+
+						blk.set_raw_selector_bits(m_optimized_cluster_selectors[winner].get_raw_selector_bits());
+
+						m_block_selector_cluster_index[block_index] = winner;
+
+						{
+							// reassigned_blocks is shared between jobs.
+							std::lock_guard<std::mutex> lock(m_lock);
+
+							vector_ensure_element_is_valid(reassigned_blocks, winner);
+							reassigned_blocks[winner].push_back(block_index);
+						}
+
+					} // block_index
+
+#ifndef __EMSCRIPTEN__
+				} );
+#endif
+
+			} // job_start
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->wait_for_all();
+#endif
+
+			m_selector_cluster_block_indices.swap(reassigned_blocks);
+		}
+
+		// Keep each cluster's block list in sorted order.
+		for (uint32_t cluster = 0; cluster < m_selector_cluster_block_indices.size(); cluster++)
+			vector_sort(m_selector_cluster_block_indices[cluster]);
+	}
+
+	// Re-fits the endpoint color and intensity table of every endpoint cluster while leaving the selectors
+	// already stored in m_encoded_blocks untouched.
+	//
+	// Step 1: every subblock (training vector index = block_index * 2 + subblock) is appended to the
+	//         m_subblocks list of the endpoint cluster it currently references.
+	// Step 2: per cluster, the source pixels and current selectors are gathered into two pools, one for
+	//         differential-mode blocks ([0]) and one for individual-mode blocks ([1]), and etc1_optimizer is
+	//         run on each non-empty pool with those selectors passed as m_pForce_selectors.
+	// Step 3: a pool's result is written back only if its error is strictly lower than the summed error of the
+	//         current encoding. Differential-mode results are first dry-run over the whole cluster and are
+	//         applied only if every candidate block still passes try_pack_color5_delta3().
+	//
+	// Returns the number of subblocks whose endpoints were replaced.
+	//
+	// TODO: Remove old ETC1 specific stuff, and thread this.
+	uint32_t basisu_frontend::refine_block_endpoints_given_selectors()
+	{
+		debug_printf("refine_block_endpoints_given_selectors\n");
+
+		// Register both subblocks of every block with the endpoint cluster each one currently uses.
+		for (int block_index = 0; block_index < static_cast<int>(m_total_blocks); block_index++)
+		{
+			vec2U &endpoint_clusters = m_block_endpoint_clusters_indices[block_index];
+
+			m_endpoint_cluster_etc_params[endpoint_clusters[0]].m_subblocks.push_back(block_index * 2);
+
+			m_endpoint_cluster_etc_params[endpoint_clusters[1]].m_subblocks.push_back(block_index * 2 + 1);
+		}
+
+		uint32_t total_subblocks_refined = 0;
+		uint32_t total_subblocks_examined = 0;
+
+		for (uint32_t endpoint_cluster_index = 0; endpoint_cluster_index < m_endpoint_cluster_etc_params.size(); endpoint_cluster_index++)
+		{
+			endpoint_cluster_etc_params &subblock_params = m_endpoint_cluster_etc_params[endpoint_cluster_index];
+
+			const uint_vec &subblocks = subblock_params.m_subblocks;
+
+			// Everything indexed by [2] below is indexed by [use_individual_mode].
+			basisu::vector<color_rgba> subblock_colors[2];
+			uint8_vec subblock_selectors[2];
+
+			// Error of the current encoding, summed over all pixels gathered into each pool.
+			uint64_t cur_subblock_err[2] = { 0, 0 };
+
+			for (uint32_t subblock_iter = 0; subblock_iter < subblocks.size(); subblock_iter++)
+			{
+				uint32_t training_vector_index = subblocks[subblock_iter];
+
+				uint32_t block_index = training_vector_index >> 1;
+				uint32_t subblock_index = training_vector_index & 1;
+				// The pixel tables are always indexed with the flipped layout here.
+				const bool is_flipped = true;
+
+				const etc_block &blk = m_encoded_blocks[block_index];
+
+				const bool use_individual_mode = !blk.get_diff_bit();
+
+				const color_rgba *pSource_block_pixels = get_source_pixel_block(block_index).get_ptr();
+
+				color_rgba unpacked_block_pixels[16];
+				unpack_etc1(blk, unpacked_block_pixels);
+
+				// Collect the 8 pixels of this subblock: source color, current error and current selector.
+				for (uint32_t i = 0; i < 8; i++)
+				{
+					const uint32_t pixel_index = g_etc1_pixel_indices[is_flipped][subblock_index][i];
+					const etc_coord2 &coords = g_etc1_pixel_coords[is_flipped][subblock_index][i];
+
+					subblock_colors[use_individual_mode].push_back(pSource_block_pixels[pixel_index]);
+
+					cur_subblock_err[use_individual_mode] += color_distance(m_params.m_perceptual, pSource_block_pixels[pixel_index], unpacked_block_pixels[pixel_index], false);
+
+					subblock_selectors[use_individual_mode].push_back(static_cast<uint8_t>(blk.get_selector(coords.m_x, coords.m_y)));
+				}
+			} // subblock_iter
+
+			etc1_optimizer::results cluster_optimizer_results[2];
+			bool results_valid[2] = { false, false };
+
+			clear_obj(cluster_optimizer_results);
+
+			basisu::vector<uint8_t> cluster_selectors[2];
+
+			for (uint32_t use_individual_mode = 0; use_individual_mode < 2; use_individual_mode++)
+			{
+				const uint32_t total_pixels = (uint32_t)subblock_colors[use_individual_mode].size();
+
+				if (!total_pixels)
+					continue;
+
+				total_subblocks_examined += total_pixels / 8;
+
+				etc1_optimizer optimizer;
+
+				etc1_optimizer::params cluster_optimizer_params;
+				cluster_optimizer_params.m_num_src_pixels = total_pixels;
+				cluster_optimizer_params.m_pSrc_pixels = &subblock_colors[use_individual_mode][0];
+
+				cluster_optimizer_params.m_use_color4 = use_individual_mode != 0;
+				cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
+
+				cluster_optimizer_params.m_pForce_selectors = &subblock_selectors[use_individual_mode][0];
+				cluster_optimizer_params.m_quality = cETCQualityUber;
+
+				cluster_selectors[use_individual_mode].resize(total_pixels);
+
+				cluster_optimizer_results[use_individual_mode].m_n = total_pixels;
+				cluster_optimizer_results[use_individual_mode].m_pSelectors = &cluster_selectors[use_individual_mode][0];
+
+				optimizer.init(cluster_optimizer_params, cluster_optimizer_results[use_individual_mode]);
+
+				if (!optimizer.compute())
+					continue;
+
+				// Only a strict improvement over the current encoding is worth writing back.
+				if (cluster_optimizer_results[use_individual_mode].m_error < cur_subblock_err[use_individual_mode])
+					results_valid[use_individual_mode] = true;
+
+			} // use_individual_mode
+
+			for (uint32_t use_individual_mode = 0; use_individual_mode < 2; use_individual_mode++)
+			{
+				if (!results_valid[use_individual_mode])
+					continue;
+
+				// Differential mode needs two passes: pass 0 only checks that every block can still be packed,
+				// pass 1 applies the new endpoints if pass 0 found no failure.
+				uint32_t num_passes = use_individual_mode ? 1 : 2;
+
+				bool all_passed5 = true;
+
+				for (uint32_t pass = 0; pass < num_passes; pass++)
+				{
+					for (uint32_t subblock_iter = 0; subblock_iter < subblocks.size(); subblock_iter++)
+					{
+						const uint32_t training_vector_index = subblocks[subblock_iter];
+
+						const uint32_t block_index = training_vector_index >> 1;
+						const uint32_t subblock_index = training_vector_index & 1;
+
+						etc_block &blk = m_encoded_blocks[block_index];
+
+						// Skip blocks that belong to the other mode's pool.
+						if (!blk.get_diff_bit() != static_cast<bool>(use_individual_mode != 0))
+							continue;
+
+						if (use_individual_mode)
+						{
+							blk.set_base4_color(subblock_index, etc_block::pack_color4(cluster_optimizer_results[1].m_block_color_unscaled, false));
+							blk.set_inten_table(subblock_index, cluster_optimizer_results[1].m_block_inten_table);
+
+							subblock_params.m_color_error[1] = cluster_optimizer_results[1].m_error;
+							subblock_params.m_inten_table[1] = cluster_optimizer_results[1].m_block_inten_table;
+							subblock_params.m_color_unscaled[1] = cluster_optimizer_results[1].m_block_color_unscaled;
+
+							total_subblocks_refined++;
+						}
+						else
+						{
+							const uint16_t base_color5 = blk.get_base5_color();
+							const uint16_t delta_color3 = blk.get_delta3_color();
+
+							// Unpack both subblock colors, then substitute the refined color for this subblock.
+							uint32_t r[2], g[2], b[2];
+							etc_block::unpack_color5(r[0], g[0], b[0], base_color5, false);
+							bool success = etc_block::unpack_color5(r[1], g[1], b[1], base_color5, delta_color3, false);
+							assert(success);
+							BASISU_NOTE_UNUSED(success);
+
+							r[subblock_index] = cluster_optimizer_results[0].m_block_color_unscaled.r;
+							g[subblock_index] = cluster_optimizer_results[0].m_block_color_unscaled.g;
+							b[subblock_index] = cluster_optimizer_results[0].m_block_color_unscaled.b;
+
+							color_rgba colors[2] = { color_rgba(r[0], g[0], b[0], 255), color_rgba(r[1], g[1], b[1], 255) };
+
+							if (!etc_block::try_pack_color5_delta3(colors))
+							{
+								all_passed5 = false;
+								break;
+							}
+
+							if ((pass == 1) && (all_passed5))
+							{
+								blk.set_block_color5(colors[0], colors[1]);
+								blk.set_inten_table(subblock_index, cluster_optimizer_results[0].m_block_inten_table);
+
+								subblock_params.m_color_error[0] = cluster_optimizer_results[0].m_error;
+								subblock_params.m_inten_table[0] = cluster_optimizer_results[0].m_block_inten_table;
+								subblock_params.m_color_unscaled[0] = cluster_optimizer_results[0].m_block_color_unscaled;
+
+								total_subblocks_refined++;
+							}
+						}
+
+					} // subblock_iter
+
+				} // pass
+
+			} // use_individual_mode
+
+		} // endpoint_cluster_index
+
+		if (m_params.m_debug_stats)
+		{
+			// Guard the percentage: nothing is examined when every cluster is empty, and 0 / 0 would print NaN.
+			const float percent_refined = total_subblocks_examined ? (total_subblocks_refined * 100.0f / total_subblocks_examined) : 0.0f;
+			debug_printf("Total subblock endpoints refined: %u (%3.1f%%)\n", total_subblocks_refined, percent_refined);
+		}
+
+		return total_subblocks_refined;
+	}
+
+	// Saves a PNG (pFilename) showing the contents of every endpoint cluster, one 3 pixel tall row per cluster.
+	// The left edge of a row holds four 2x2 swatches with the block colors obtained from the cluster's [0] color
+	// and intensity table. Starting at x = 12, each member subblock is drawn as a 4x2 tile every 5 pixels.
+	// A tile shows the subblock's source pixels, or, when vis_endpoint_colors is set, a flat fill with one of
+	// the two colors returned by get_block_low_high_colors(). Nothing is written if every cluster is empty.
+	void basisu_frontend::dump_endpoint_clusterization_visualization(const char *pFilename, bool vis_endpoint_colors)
+	{
+		debug_printf("dump_endpoint_clusterization_visualization\n");
+
+		const uint32_t num_clusters = (uint32_t)m_endpoint_clusters.size();
+
+		basisu::vector<uint32_t> cluster_sizes(m_endpoint_clusters.size());
+		basisu::vector<uint32_t> row_to_cluster(m_endpoint_clusters.size());
+
+		uint32_t largest_cluster = 0;
+		for (uint32_t c = 0; c < num_clusters; c++)
+		{
+			const uint32_t members = (uint32_t)m_endpoint_clusters[c].size();
+
+			cluster_sizes[c] = members;
+			if (members > largest_cluster)
+				largest_cluster = members;
+
+			// Rows follow cluster order; the size-based sort below is disabled.
+			row_to_cluster[c] = c;
+		}
+
+		if (largest_cluster == 0)
+			return;
+
+		//indexed_heap_sort(endpoint_clusters.size(), cluster_sizes.get_ptr(), sorted_cluster_indices.get_ptr());
+
+		// At most 2048 tiles are given room horizontally; anything beyond is left to set_block_clipped().
+		const uint32_t vis_width = 12 + minimum<uint32_t>(largest_cluster, 2048) * 5;
+		const uint32_t vis_height = num_clusters * 3;
+		image endpoint_cluster_vis(vis_width, vis_height);
+
+		for (uint32_t row = 0; row < num_clusters; row++)
+		{
+			const uint32_t cluster = row_to_cluster[row];
+			const uint32_t row_y = 3 * row;
+
+			// Build a throwaway block from the cluster parameters just to obtain its four block colors.
+			etc_block swatch_blk;
+			swatch_blk.clear();
+			swatch_blk.set_flip_bit(false);
+			swatch_blk.set_diff_bit(true);
+			swatch_blk.set_inten_tables_etc1s(m_endpoint_cluster_etc_params[cluster].m_inten_table[0]);
+			swatch_blk.set_base5_color(etc_block::pack_color5(m_endpoint_cluster_etc_params[cluster].m_color_unscaled[0], false));
+
+			color_rgba swatch_colors[4];
+			swatch_blk.get_block_colors(swatch_colors, 0);
+
+			for (uint32_t s = 0; s < 4; s++)
+				endpoint_cluster_vis.fill_box(s * 2, row_y, 2, 2, swatch_colors[s]);
+
+			const uint_vec &cluster_members = m_endpoint_clusters[cluster];
+
+			for (uint32_t member = 0; member < cluster_members.size(); member++)
+			{
+				// Members are stored as block_index * 2 + subblock_index.
+				const uint32_t packed_index = cluster_members[member];
+				const uint32_t block_index = packed_index >> 1;
+				const uint32_t subblock_index = packed_index & 1;
+
+				const etc_block &src_blk = m_etc1_blocks_etc1s[block_index];
+				const color_rgba *pSrc_pixels = get_source_pixel_block(block_index).get_ptr();
+
+				color_rgba tile[8];
+
+				if (!vis_endpoint_colors)
+				{
+					for (uint32_t p = 0; p < 8; p++)
+						tile[p] = pSrc_pixels[g_etc1_pixel_indices[src_blk.get_flip_bit()][subblock_index][p]];
+				}
+				else
+				{
+					color_rgba low_high[2];
+					src_blk.get_block_low_high_colors(low_high, subblock_index);
+
+					for (uint32_t p = 0; p < 8; p++)
+						tile[p] = low_high[subblock_index];
+				}
+
+				endpoint_cluster_vis.set_block_clipped(tile, 12 + 5 * member, row_y, 4, 2);
+			}
+		}
+
+		save_png(pFilename, endpoint_cluster_vis);
+		debug_printf("Wrote debug visualization file %s\n", pFilename);
+	}
+
+	// Flags the [0] color slot of every endpoint cluster that is referenced by at least one subblock as used.
+	void basisu_frontend::finalize()
+	{
+		for (uint32_t blk = 0; blk < m_total_blocks; blk++)
+		{
+			// Each block references one endpoint cluster per subblock.
+			const uint32_t cluster_of_subblock0 = get_subblock_endpoint_cluster_index(blk, 0);
+			const uint32_t cluster_of_subblock1 = get_subblock_endpoint_cluster_index(blk, 1);
+
+			m_endpoint_cluster_etc_params[cluster_of_subblock0].m_color_used[0] = true;
+			m_endpoint_cluster_etc_params[cluster_of_subblock1].m_color_used[0] = true;
+		}
+	}
+
+	// The backend has remapped the block endpoints while optimizing the output symbols for better rate distortion performance, so let's go and reoptimize the endpoint codebook. This is basically a bottom up clusterization stage, where some leaves can be combined.
+	// This is currently the only place where the backend actually goes and changes the quantization and calls the frontend to fix things up. 
+	// new_block_endpoints[block] is the (old) endpoint cluster index now assigned to each block; old_to_new_endpoint_cluster_indices is resized to one entry per old cluster and filled with its new index (-1 for clusters no block uses); optimize_final_codebook additionally rebuilds the codebook and rewrites m_encoded_blocks; pBlock_selector_indices, when non-null, supplies each block's selector cluster index instead of get_block_selector_cluster_index().
+	void basisu_frontend::reoptimize_remapped_endpoints(const uint_vec &new_block_endpoints, int_vec &old_to_new_endpoint_cluster_indices, bool optimize_final_codebook, uint_vec *pBlock_selector_indices)
+	{
+		debug_printf("reoptimize_remapped_endpoints\n");
+		// Invert the mapping: for each (old) endpoint cluster, list the blocks now assigned to it.
+		basisu::vector<uint_vec> new_endpoint_cluster_block_indices(m_endpoint_clusters.size());
+		for (uint32_t i = 0; i < new_block_endpoints.size(); i++)
+			new_endpoint_cluster_block_indices[new_block_endpoints[i]].push_back(i);
+		// Per-cluster flags written by the jobs below: valid = the cluster has at least one block, improved = the refit lowered its error.
+		basisu::vector<uint8_t> cluster_valid(new_endpoint_cluster_block_indices.size());
+		basisu::vector<uint8_t> cluster_improved(new_endpoint_cluster_block_indices.size());
+		// Clusters are processed in batches of N; each batch is submitted to the job pool (under __EMSCRIPTEN__ the batch body runs inline instead).
+		const uint32_t N = 256;
+		for (uint32_t cluster_index_iter = 0; cluster_index_iter < new_endpoint_cluster_block_indices.size(); cluster_index_iter += N)
+		{
+			const uint32_t first_index = cluster_index_iter;                                    
+			const uint32_t last_index = minimum<uint32_t>((uint32_t)new_endpoint_cluster_block_indices.size(), cluster_index_iter + N);   
+
+#ifndef __EMSCRIPTEN__
+			m_params.m_pJob_pool->add_job( [this, first_index, last_index, &cluster_improved, &cluster_valid, &new_endpoint_cluster_block_indices, &pBlock_selector_indices ] {
+#endif
+
+				for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++)
+				{
+					const basisu::vector<uint32_t>& cluster_block_indices = new_endpoint_cluster_block_indices[cluster_index];
+					// Clusters that no block references are skipped, so their valid flag is never set.
+					if (!cluster_block_indices.size())
+						continue;
+
+					const uint32_t total_pixels = (uint32_t)cluster_block_indices.size() * 16;
+					// Buffers for all 16 pixels of every block in the cluster and for the selector each pixel currently uses.
+					basisu::vector<color_rgba> cluster_pixels(total_pixels);
+					uint8_vec force_selectors(total_pixels);
+					// Scratch block carrying the cluster's current [0] color and intensity table; used to measure the current error.
+					etc_block blk;
+					blk.set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(cluster_index, false));
+					blk.set_inten_tables_etc1s(get_endpoint_cluster_inten_table(cluster_index, false));
+					blk.set_flip_bit(true);
+						
+					uint64_t cur_err = 0;
+
+					for (uint32_t cluster_block_indices_iter = 0; cluster_block_indices_iter < cluster_block_indices.size(); cluster_block_indices_iter++)
+					{
+						const uint32_t block_index = cluster_block_indices[cluster_block_indices_iter];
+				
+						const color_rgba *pBlock_pixels = get_source_pixel_block(block_index).get_ptr();
+
+						memcpy(&cluster_pixels[cluster_block_indices_iter * 16], pBlock_pixels, 16 * sizeof(color_rgba));
+						// The selector cluster comes from the caller's override when one was passed, otherwise from the frontend's own assignment.
+						const uint32_t selector_cluster_index = pBlock_selector_indices ? (*pBlock_selector_indices)[block_index] : get_block_selector_cluster_index(block_index);
+
+						const etc_block &blk_selectors = get_selector_cluster_selector_bits(selector_cluster_index);
+
+						blk.set_raw_selector_bits(blk_selectors.get_raw_selector_bits());
+
+						cur_err += blk.evaluate_etc1_error(pBlock_pixels, m_params.m_perceptual);
+						// Record the block's 16 selectors in x + y * 4 order; they are handed to the optimizer as m_pForce_selectors.
+						for (uint32_t y = 0; y < 4; y++)
+							for (uint32_t x = 0; x < 4; x++)
+								force_selectors[cluster_block_indices_iter * 16 + x + y * 4] = static_cast<uint8_t>(blk_selectors.get_selector(x, y));
+					}
+
+					endpoint_cluster_etc_params new_endpoint_cluster_etc_params;
+						
+					{
+						etc1_optimizer optimizer;
+						etc1_solution_coordinates solutions[2];
+
+						etc1_optimizer::params cluster_optimizer_params;
+						cluster_optimizer_params.m_num_src_pixels = total_pixels;
+						cluster_optimizer_params.m_pSrc_pixels = &cluster_pixels[0];
+
+						cluster_optimizer_params.m_use_color4 = false;
+						cluster_optimizer_params.m_perceptual = m_params.m_perceptual;
+						cluster_optimizer_params.m_pForce_selectors = &force_selectors[0];
+						// Only the maximum compression level selects cETCQualityUber; every other level uses cETCQualitySlow.
+						if (m_params.m_compression_level == BASISU_MAX_COMPRESSION_LEVEL)
+							cluster_optimizer_params.m_quality = cETCQualityUber;
+						else
+							cluster_optimizer_params.m_quality = cETCQualitySlow;
+
+						etc1_optimizer::results cluster_optimizer_results;
+
+						basisu::vector<uint8_t> cluster_selectors(total_pixels);
+						cluster_optimizer_results.m_n = total_pixels;
+						cluster_optimizer_results.m_pSelectors = &cluster_selectors[0];
+
+						optimizer.init(cluster_optimizer_params, cluster_optimizer_results);
+
+						if (!optimizer.compute())
+							BASISU_FRONTEND_VERIFY(false);
+
+						new_endpoint_cluster_etc_params.m_color_unscaled[0] = cluster_optimizer_results.m_block_color_unscaled;
+						new_endpoint_cluster_etc_params.m_inten_table[0] = cluster_optimizer_results.m_block_inten_table;
+						new_endpoint_cluster_etc_params.m_color_error[0] = cluster_optimizer_results.m_error;
+						new_endpoint_cluster_etc_params.m_color_used[0] = true;
+						new_endpoint_cluster_etc_params.m_valid = true;
+					}
+					// Adopt the refit only when it strictly lowers the cluster's summed error.
+					if (new_endpoint_cluster_etc_params.m_color_error[0] < cur_err)
+					{
+						m_endpoint_cluster_etc_params[cluster_index] = new_endpoint_cluster_etc_params;
+				
+						cluster_improved[cluster_index] = true;
+					}
+
+					cluster_valid[cluster_index] = true;
+
+				} // cluster_index
+
+#ifndef __EMSCRIPTEN__
+			} );
+#endif
+
+		} // cluster_index_iter
+
+#ifndef __EMSCRIPTEN__
+		m_params.m_pJob_pool->wait_for_all();
+#endif
+				
+		uint32_t total_unused_clusters = 0;
+		uint32_t total_improved_clusters = 0;
+		// Give the clusters that are still in use consecutive new indices, in their original order; unused ones keep -1.
+		old_to_new_endpoint_cluster_indices.resize(m_endpoint_clusters.size());
+		vector_set_all(old_to_new_endpoint_cluster_indices, -1);
+				
+		int total_new_endpoint_clusters = 0;
+
+		for (uint32_t old_cluster_index = 0; old_cluster_index < m_endpoint_clusters.size(); old_cluster_index++)
+		{
+			if (!cluster_valid[old_cluster_index])
+				total_unused_clusters++;
+			else
+				old_to_new_endpoint_cluster_indices[old_cluster_index] = total_new_endpoint_clusters++;
+
+			if (cluster_improved[old_cluster_index])
+				total_improved_clusters++;
+		}
+
+		debug_printf("Total unused clusters: %u\n", total_unused_clusters);
+		debug_printf("Total improved_clusters: %u\n", total_improved_clusters);
+		debug_printf("Total endpoint clusters: %u\n", total_new_endpoint_clusters);
+		// Optionally rebuild the codebook from the surviving clusters and update all per-block state to match.
+		if (optimize_final_codebook)
+		{
+			cluster_subblock_etc_params_vec new_endpoint_cluster_etc_params(total_new_endpoint_clusters);
+
+			for (uint32_t old_cluster_index = 0; old_cluster_index < m_endpoint_clusters.size(); old_cluster_index++)
+			{
+				if (old_to_new_endpoint_cluster_indices[old_cluster_index] >= 0)
+					new_endpoint_cluster_etc_params[old_to_new_endpoint_cluster_indices[old_cluster_index]] = m_endpoint_cluster_etc_params[old_cluster_index];
+			}
+
+			debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 1\n");
+
+			basisu::vector<uint_vec> new_endpoint_clusters(total_new_endpoint_clusters);
+			// Each block contributes both of its subblocks (block_index * 2 + 0 and + 1) to the same new cluster.
+			for (uint32_t block_index = 0; block_index < new_block_endpoints.size(); block_index++)
+			{
+				const uint32_t old_endpoint_cluster_index = new_block_endpoints[block_index];
+			
+				const int new_endpoint_cluster_index = old_to_new_endpoint_cluster_indices[old_endpoint_cluster_index];
+				BASISU_FRONTEND_VERIFY(new_endpoint_cluster_index >= 0);
+
+				BASISU_FRONTEND_VERIFY(new_endpoint_cluster_index < (int)new_endpoint_clusters.size());
+
+				new_endpoint_clusters[new_endpoint_cluster_index].push_back(block_index * 2 + 0);
+				new_endpoint_clusters[new_endpoint_cluster_index].push_back(block_index * 2 + 1);
+
+				BASISU_FRONTEND_VERIFY(new_endpoint_cluster_index < (int)new_endpoint_cluster_etc_params.size());
+
+				new_endpoint_cluster_etc_params[new_endpoint_cluster_index].m_subblocks.push_back(block_index * 2 + 0);
+				new_endpoint_cluster_etc_params[new_endpoint_cluster_index].m_subblocks.push_back(block_index * 2 + 1);
+									
+				m_block_endpoint_clusters_indices[block_index][0] = new_endpoint_cluster_index;
+				m_block_endpoint_clusters_indices[block_index][1] = new_endpoint_cluster_index;
+			}
+
+			debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 2\n");
+			// Install the rebuilt codebook, then run eliminate_redundant_or_empty_endpoint_clusters() on it (defined elsewhere).
+			m_endpoint_clusters = new_endpoint_clusters;
+			m_endpoint_cluster_etc_params = new_endpoint_cluster_etc_params;
+
+			eliminate_redundant_or_empty_endpoint_clusters();
+
+			debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 3\n");
+			// Refresh the per-block cluster indices and the old->new map from the resulting m_endpoint_clusters.
+			for (uint32_t new_cluster_index = 0; new_cluster_index < m_endpoint_clusters.size(); new_cluster_index++)
+			{
+				for (uint32_t cluster_block_iter = 0; cluster_block_iter < m_endpoint_clusters[new_cluster_index].size(); cluster_block_iter++)
+				{
+					const uint32_t subblock_index = m_endpoint_clusters[new_cluster_index][cluster_block_iter];
+					const uint32_t block_index = subblock_index >> 1;
+
+					m_block_endpoint_clusters_indices[block_index][0] = new_cluster_index;
+					m_block_endpoint_clusters_indices[block_index][1] = new_cluster_index;
+
+					const uint32_t old_cluster_index = new_block_endpoints[block_index];
+
+					old_to_new_endpoint_cluster_indices[old_cluster_index] = new_cluster_index;
+				}
+			}
+
+			debug_printf("basisu_frontend::reoptimize_remapped_endpoints: stage 4\n");
+			// Rewrite every encoded block's color and intensity tables from the parameters of the cluster it now references.
+			for (uint32_t block_index = 0; block_index < m_encoded_blocks.size(); block_index++)
+			{
+				const uint32_t endpoint_cluster_index = get_subblock_endpoint_cluster_index(block_index, 0);
+
+				m_encoded_blocks[block_index].set_block_color5_etc1s(get_endpoint_cluster_unscaled_color(endpoint_cluster_index, false));
+				m_encoded_blocks[block_index].set_inten_tables_etc1s(get_endpoint_cluster_inten_table(endpoint_cluster_index, false));
+			}
+
+			debug_printf("Final (post-RDO) endpoint clusters: %u\n", m_endpoint_clusters.size());
+		}
+						
+		//debug_printf("validate_output: %u\n", validate_output());
+	}
+	
+	bool basisu_frontend::validate_output() const // Cross-checks every output block against the endpoint and selector codebooks; returns true when the end of the function is reached.
+	{
+		debug_printf("validate_output\n");
+		// A failing check_etc1s_constraints() is the only explicit `return false` in this body; every other check goes through BASISU_FRONTEND_VERIFY (defined elsewhere).
+		if (!check_etc1s_constraints())
+			return false;
+
+		for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++)
+		{
+//#define CHECK(x) do { if (!(x)) { DebugBreak(); return false; } } while(0)
+#define CHECK(x) BASISU_FRONTEND_VERIFY(x);
+			// The stored output block must have its flip bit set and be in differential mode.
+			CHECK(get_output_block(block_index).get_flip_bit() == true);
+			
+			const bool diff_flag = get_diff_flag(block_index);
+			CHECK(diff_flag == true);
+			// Reconstruct the expected block from the codebooks, starting from all-zero bits.
+			etc_block blk;
+			memset(&blk, 0, sizeof(blk));
+			blk.set_flip_bit(true);
+			blk.set_diff_bit(true);
+
+			const uint32_t endpoint_cluster0_index = get_subblock_endpoint_cluster_index(block_index, 0);
+			const uint32_t endpoint_cluster1_index = get_subblock_endpoint_cluster_index(block_index, 1);
+
+			// basisu only supports ETC1S, so these must be equal.
+			CHECK(endpoint_cluster0_index == endpoint_cluster1_index);
+			// set_block_color5_check() must report success for the cluster's [0] color, and that color slot must be flagged as used.
+			CHECK(blk.set_block_color5_check(get_endpoint_cluster_unscaled_color(endpoint_cluster0_index, false), get_endpoint_cluster_unscaled_color(endpoint_cluster1_index, false)));
+
+			CHECK(get_endpoint_cluster_color_is_used(endpoint_cluster0_index, false));
+			
+			blk.set_inten_table(0, get_endpoint_cluster_inten_table(endpoint_cluster0_index, false));
+			blk.set_inten_table(1, get_endpoint_cluster_inten_table(endpoint_cluster1_index, false));
+			// The selector cluster index must be in range, and that cluster's block list must contain this block.
+			const uint32_t selector_cluster_index = get_block_selector_cluster_index(block_index);
+			CHECK(selector_cluster_index < get_total_selector_clusters());
+
+			CHECK(vector_find(get_selector_cluster_block_indices(selector_cluster_index), block_index) != -1);
+
+			blk.set_raw_selector_bits(get_selector_cluster_selector_bits(selector_cluster_index).get_raw_selector_bits());
+
+			const etc_block &rdo_output_block = get_output_block(block_index);
+			// Field-by-field comparison of the stored output block against the reconstruction.
+			CHECK(rdo_output_block.get_flip_bit() == blk.get_flip_bit());
+			CHECK(rdo_output_block.get_diff_bit() == blk.get_diff_bit());
+			CHECK(rdo_output_block.get_inten_table(0) == blk.get_inten_table(0));
+			CHECK(rdo_output_block.get_inten_table(1) == blk.get_inten_table(1));
+			CHECK(rdo_output_block.get_base5_color() == blk.get_base5_color());
+			CHECK(rdo_output_block.get_delta3_color() == blk.get_delta3_color());
+			CHECK(rdo_output_block.get_raw_selector_bits() == blk.get_raw_selector_bits());
+			// With a global selector codebook (in hybrid mode only for clusters flagged as using it), the selectors must equal the referenced palette entry.
+			if (m_params.m_pGlobal_sel_codebook)
+			{
+				bool used_global_cb = true;
+				if (m_params.m_use_hybrid_selector_codebooks)
+					used_global_cb = m_selector_cluster_uses_global_cb[selector_cluster_index];
+
+				if (used_global_cb)
+				{
+					basist::etc1_global_selector_codebook_entry_id pal_id(get_selector_cluster_global_selector_entry_ids()[selector_cluster_index]);
+
+					basist::etc1_selector_palette_entry pal_entry(m_params.m_pGlobal_sel_codebook->get_entry(pal_id));
+
+					for (uint32_t y = 0; y < 4; y++)
+					{
+						for (uint32_t x = 0; x < 4; x++)
+						{
+							CHECK(pal_entry(x, y) == blk.get_selector(x, y));
+						}
+					}
+				}
+			}
+
+#undef CHECK
+		}
+
+		return true;
+	}
+
+	// Builds a num_blocks_x by num_blocks_y ETC1 image from consecutive blocks starting at first_block (row-major)
+	// and saves the unpacked result to pFilename as a PNG.
+	// output_blocks == true : each block is copied from get_output_block().
+	// output_blocks == false: each block is reassembled from its endpoint cluster colors and intensity tables
+	//                         plus the selector bits of its selector cluster.
+	void basisu_frontend::dump_debug_image(const char *pFilename, uint32_t first_block, uint32_t num_blocks_x, uint32_t num_blocks_y, bool output_blocks)
+	{
+		gpu_image etc1_tex;
+		etc1_tex.init(texture_format::cETC1, num_blocks_x * 4, num_blocks_y * 4);
+
+		for (uint32_t by = 0; by < num_blocks_y; by++)
+		{
+			for (uint32_t bx = 0; bx < num_blocks_x; bx++)
+			{
+				const uint32_t src_index = first_block + bx + by * num_blocks_x;
+
+				etc_block &dst = *(etc_block *)etc1_tex.get_block_ptr(bx, by);
+
+				if (output_blocks)
+				{
+					dst = get_output_block(src_index);
+					continue;
+				}
+
+				// Differential blocks read slot [0] of the cluster parameters, individual-mode blocks read slot [1].
+				const bool is_diff = get_diff_flag(src_index);
+				const bool individual_mode = !is_diff;
+
+				dst.set_diff_bit(is_diff);
+				dst.set_flip_bit(true);
+
+				const uint32_t cluster0 = get_subblock_endpoint_cluster_index(src_index, 0);
+				const uint32_t cluster1 = get_subblock_endpoint_cluster_index(src_index, 1);
+
+				const color_rgba &color0 = get_endpoint_cluster_unscaled_color(cluster0, individual_mode);
+				const color_rgba &color1 = get_endpoint_cluster_unscaled_color(cluster1, individual_mode);
+
+				if (individual_mode)
+					dst.set_block_color4(color0, color1);
+				else
+					dst.set_block_color5(color0, color1);
+
+				dst.set_inten_table(0, get_endpoint_cluster_inten_table(cluster0, individual_mode));
+				dst.set_inten_table(1, get_endpoint_cluster_inten_table(cluster1, individual_mode));
+
+				const etc_block &selector_src = get_selector_cluster_selector_bits(get_block_selector_cluster_index(src_index));
+				dst.set_raw_selector_bits(selector_src.get_raw_selector_bits());
+			}
+		}
+
+		image unpacked;
+		etc1_tex.unpack(unpacked);
+
+		save_png(pFilename, unpacked);
+	}
+
+} // namespace basisu
+
diff --git a/thirdparty/basis_universal/encoder/basisu_frontend.h b/thirdparty/basis_universal/encoder/basisu_frontend.h
new file mode 100644
index 0000000000..4ff6d40466
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_frontend.h
@@ -0,0 +1,360 @@
+// basisu_frontend.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_enc.h"
+#include "basisu_etc.h"
+#include "basisu_gpu_texture.h"
+#include "basisu_global_selector_palette_helpers.h"
+#include "../transcoder/basisu_file_headers.h"
+#include "../transcoder/basisu_transcoder.h"
+
+namespace basisu
+{
+	// Pair of unsigned 32-bit values addressed by index 0 or 1.
+	// basisu_frontend stores one per block to hold the endpoint cluster index of each of the block's two subblocks.
+	struct vec2U
+	{
+		uint32_t m_comps[2];
+
+		// Both components start at zero so a default-constructed value never exposes indeterminate data.
+		vec2U() { m_comps[0] = 0; m_comps[1] = 0; }
+		vec2U(uint32_t a, uint32_t b) { set(a, b); }
+
+		// Assigns component 0 from a and component 1 from b.
+		void set(uint32_t a, uint32_t b) { m_comps[0] = a; m_comps[1] = b; }
+
+		// Indexed access; i must be 0 or 1 (checked by assert only).
+		uint32_t operator[] (uint32_t i) const { assert(i < 2); return m_comps[i]; }
+		uint32_t &operator[] (uint32_t i) { assert(i < 2); return m_comps[i]; }
+	};
+
+	const uint32_t BASISU_DEFAULT_COMPRESSION_LEVEL = 2; // Initial value of basisu_frontend::params::m_compression_level.
+	const uint32_t BASISU_MAX_COMPRESSION_LEVEL = 6; // At this level reoptimize_remapped_endpoints() runs etc1_optimizer at cETCQualityUber instead of cETCQualitySlow.
+
+	class basisu_frontend
+	{
+		BASISU_NO_EQUALS_OR_COPY_CONSTRUCT(basisu_frontend);
+
+	public:
+
+		basisu_frontend() : // Starts with zero block/pixel counts, every feature flag off and zero codebook iterations.
+			m_total_blocks(0),
+			m_total_pixels(0),
+			m_endpoint_refinement(false),
+			m_use_hierarchical_endpoint_codebooks(false),
+			m_use_hierarchical_selector_codebooks(false),
+			m_num_endpoint_codebook_iterations(0),
+			m_num_selector_codebook_iterations(0)
+		{
+		}
+
+		enum // NOTE(review): presumably the upper bounds for params::m_max_endpoint_clusters / m_max_selector_clusters; the enforcement is not visible here, confirm against init().
+		{
+			cMaxEndpointClusters = 16128,
+						
+			cMaxSelectorClusters = 16128,
+		};
+
+		// Configuration handed to basisu_frontend::init(). A default-constructed instance holds the defaults
+		// written next to each member below.
+		struct params
+		{
+			// Every default comes from the member initializers; nothing else is done here.
+			params()
+			{
+			}
+
+			uint32_t m_num_source_blocks = 0;
+			pixel_block *m_pSource_blocks = NULL;
+
+			uint32_t m_max_endpoint_clusters = 256;
+			uint32_t m_max_selector_clusters = 256;
+
+			// Compared against BASISU_MAX_COMPRESSION_LEVEL when the optimizer quality is chosen.
+			uint32_t m_compression_level = BASISU_DEFAULT_COMPRESSION_LEVEL;
+
+			// Forwarded to the color distance / ETC1 optimizer calls.
+			bool m_perceptual = true;
+			// Enables the extra statistics printed through debug_printf().
+			bool m_debug_stats = false;
+			bool m_debug_images = false;
+			bool m_dump_endpoint_clusterization = true;
+			bool m_validate = false;
+			bool m_multithreaded = false;
+			bool m_disable_hierarchical_endpoint_codebooks = false;
+
+			// Optional global selector codebook; validate_output() cross-checks selectors against it when non-null.
+			const basist::etc1_global_selector_codebook *m_pGlobal_sel_codebook = NULL;
+			uint32_t m_num_global_sel_codebook_pal_bits = 0;
+			uint32_t m_num_global_sel_codebook_mod_bits = 0;
+			bool m_use_hybrid_selector_codebooks = false;
+			float m_hybrid_codebook_quality_thresh = 0.0f;
+			basist::basis_texture_type m_tex_type = basist::cBASISTexType2D;
+			const basist::basisu_lowlevel_etc1s_transcoder *m_pGlobal_codebooks = nullptr;
+
+			// Job pool that the frontend submits its batched work to.
+			job_pool *m_pJob_pool = nullptr;
+		};
+
+		bool init(const params &p);
+
+		bool compress();
+		// Read-only access to the stored parameter set (m_params).
+		const params &get_params() const { return m_params; }
+		// Source pixels of block i (m_source_blocks); this accessor performs no explicit range check.
+		const pixel_block &get_source_pixel_block(uint32_t i) const { return m_source_blocks[i]; }
+
+		// RDO output blocks (m_encoded_blocks)
+		uint32_t get_total_output_blocks() const { return static_cast<uint32_t>(m_encoded_blocks.size()); }
+
+		const etc_block &get_output_block(uint32_t block_index) const { return m_encoded_blocks[block_index]; }
+		const etc_block_vec &get_output_blocks() const { return m_encoded_blocks; }
+
+		// "Best" ETC1S blocks (m_etc1_blocks_etc1s)
+		const etc_block &get_etc1s_block(uint32_t block_index) const { return m_etc1_blocks_etc1s[block_index]; }
+
+		// Per-block flags: the diff bit is read from the block's entry in m_encoded_blocks.
+		bool get_diff_flag(uint32_t block_index) const { return m_encoded_blocks[block_index].get_diff_bit(); }
+
+		// Endpoint clusters. The individual_mode argument selects slot [1] (true) or slot [0] (false) of the per-cluster arrays.
+		uint32_t get_total_endpoint_clusters() const { return static_cast<uint32_t>(m_endpoint_clusters.size()); }
+		uint32_t get_subblock_endpoint_cluster_index(uint32_t block_index, uint32_t subblock_index) const { return m_block_endpoint_clusters_indices[block_index][subblock_index]; }
+
+		const color_rgba &get_endpoint_cluster_unscaled_color(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_color_unscaled[individual_mode]; }
+		uint32_t get_endpoint_cluster_inten_table(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_inten_table[individual_mode]; }
+
+		bool get_endpoint_cluster_color_is_used(uint32_t cluster_index, bool individual_mode) const { return m_endpoint_cluster_etc_params[cluster_index].m_color_used[individual_mode]; }
+
+		// Selector clusters. The total is the number of entries in m_selector_cluster_block_indices.
+		uint32_t get_total_selector_clusters() const { return static_cast<uint32_t>(m_selector_cluster_block_indices.size()); }
+		uint32_t get_block_selector_cluster_index(uint32_t block_index) const { return m_block_selector_cluster_index[block_index]; }
+		const etc_block &get_selector_cluster_selector_bits(uint32_t cluster_index) const { return m_optimized_cluster_selectors[cluster_index]; }
+
+		const basist::etc1_global_selector_codebook_entry_id_vec &get_selector_cluster_global_selector_entry_ids() const { return m_optimized_cluster_selector_global_cb_ids; }
+		const bool_vec &get_selector_cluster_uses_global_cb_vec() const { return m_selector_cluster_uses_global_cb; }
+
+		// Returns block indices using each selector cluster
+		const uint_vec &get_selector_cluster_block_indices(uint32_t selector_cluster_index) const { return m_selector_cluster_block_indices[selector_cluster_index]; }
+
+		void dump_debug_image(const char *pFilename, uint32_t first_block, uint32_t num_blocks_x, uint32_t num_blocks_y, bool output_blocks);
+		// Called by the backend after it has remapped block endpoints; pBlock_selector_indices may be left null to use the frontend's own selector assignment.
+		void reoptimize_remapped_endpoints(const uint_vec &new_block_endpoints, int_vec &old_to_new_endpoint_cluster_indices, bool optimize_final_codebook, uint_vec *pBlock_selector_indices = nullptr);
+
+	private:
+		params m_params;
+		uint32_t m_total_blocks;
+		uint32_t m_total_pixels;
+
+		bool m_endpoint_refinement;
+		bool m_use_hierarchical_endpoint_codebooks;
+		bool m_use_hierarchical_selector_codebooks;
+
+		uint32_t m_num_endpoint_codebook_iterations;
+		uint32_t m_num_selector_codebook_iterations;
+
+		// Source pixels for each block
+		pixel_block_vec m_source_blocks;
+
+		// The quantized ETC1S texture.
+		etc_block_vec m_encoded_blocks;
+		
+		// Quantized blocks after endpoint quant, but before selector quant
+		etc_block_vec m_orig_encoded_blocks; 
+				
+		// Full quality ETC1S texture
+		etc_block_vec m_etc1_blocks_etc1s;
+				
+		typedef vec<6, float> vec6F;
+		
+		// Endpoint clusterizer
+		typedef tree_vector_quant<vec6F> vec6F_quantizer;
+		vec6F_quantizer m_endpoint_clusterizer;
+
+		// For each endpoint cluster: An array of which subblock indices (block_index*2+subblock) are located in that cluster.
+		// Array of block indices for each endpoint cluster
+		basisu::vector<uint_vec> m_endpoint_clusters;
+
+		// Array of block indices for each parent endpoint cluster
+		basisu::vector<uint_vec> m_endpoint_parent_clusters;
+		
+		// Each block's parent cluster index
+		uint8_vec m_block_parent_endpoint_cluster; 
+
+		// Array of endpoint cluster indices for each parent endpoint cluster
+		basisu::vector<uint_vec> m_endpoint_clusters_within_each_parent_cluster;
+				
+		// ETC1 parameters of one endpoint cluster. The two-element arrays are indexed by [use_individual_mode].
+		// Equality and ordering look only at the unscaled colors and the intensity tables.
+		struct endpoint_cluster_etc_params
+		{
+			endpoint_cluster_etc_params()
+			{
+				clear();
+			}
+
+			// Zeroes all parameters, empties the subblock list and marks the entry invalid.
+			void clear()
+			{
+				m_valid = false;
+				m_subblocks.clear();
+
+				clear_obj(m_color_used);
+				clear_obj(m_color_error);
+				clear_obj(m_inten_table);
+				clear_obj(m_color_unscaled);
+			}
+
+			// TODO: basisu doesn't use individual mode.
+			color_rgba m_color_unscaled[2]; // [use_individual_mode]
+			uint32_t m_inten_table[2];
+
+			uint64_t m_color_error[2];
+
+			// Subblocks in this cluster, stored as block_index * 2 + subblock_index.
+			uint_vec m_subblocks;
+
+			bool m_color_used[2];
+
+			bool m_valid;
+
+			// True when both colors and both intensity tables match.
+			bool operator== (const endpoint_cluster_etc_params &other) const
+			{
+				const bool colors_differ =
+					(m_color_unscaled[0] != other.m_color_unscaled[0]) ||
+					(m_color_unscaled[1] != other.m_color_unscaled[1]);
+				if (colors_differ)
+					return false;
+
+				return (m_inten_table[0] == other.m_inten_table[0]) && (m_inten_table[1] == other.m_inten_table[1]);
+			}
+
+			// Lexicographic order over (color[0], color[1], inten_table[0], inten_table[1]).
+			bool operator< (const endpoint_cluster_etc_params &other) const
+			{
+				for (uint32_t slot = 0; slot < 2; slot++)
+				{
+					const color_rgba &lhs = m_color_unscaled[slot];
+					const color_rgba &rhs = other.m_color_unscaled[slot];
+
+					if (lhs < rhs)
+						return true;
+					if (lhs != rhs)
+						return false;
+				}
+
+				if (m_inten_table[0] != other.m_inten_table[0])
+					return m_inten_table[0] < other.m_inten_table[0];
+
+				return m_inten_table[1] < other.m_inten_table[1];
+			}
+		};
+
+		typedef basisu::vector<endpoint_cluster_etc_params> cluster_subblock_etc_params_vec;
+		
+		// Each endpoint cluster's ETC1S parameters 
+		cluster_subblock_etc_params_vec m_endpoint_cluster_etc_params;
+
+		// The endpoint cluster index used by each ETC1 subblock.
+		basisu::vector<vec2U> m_block_endpoint_clusters_indices;
+				
+		// The block(s) within each selector cluster
+		// Note: If you add anything here that uses selector cluster indices, be sure to update optimize_selector_codebook()!
+		basisu::vector<uint_vec> m_selector_cluster_block_indices;
+
+		// The selector bits for each selector cluster.
+		basisu::vector<etc_block> m_optimized_cluster_selectors;
+
+		// The block(s) within each parent selector cluster.
+		basisu::vector<uint_vec> m_selector_parent_cluster_block_indices;
+		
+		// Each block's parent selector cluster
+		uint8_vec m_block_parent_selector_cluster;
+
+		// Array of selector cluster indices for each parent selector cluster
+		basisu::vector<uint_vec> m_selector_clusters_within_each_parent_cluster;
+
+		basist::etc1_global_selector_codebook_entry_id_vec m_optimized_cluster_selector_global_cb_ids;
+		bool_vec m_selector_cluster_uses_global_cb;
+
+		// Each block's selector cluster index
+		basisu::vector<uint32_t> m_block_selector_cluster_index;
+
+		struct subblock_endpoint_quant_err
+		{
+			// Endpoint quantization error record for one subblock.
+			uint64_t m_total_err;
+			uint32_t m_cluster_index;
+			uint32_t m_cluster_subblock_index;
+			uint32_t m_block_index;
+			uint32_t m_subblock_index;
+
+			// Ascending by total error; ties fall back to block index, then subblock index.
+			// The two cluster fields are not part of the ordering.
+			bool operator< (const subblock_endpoint_quant_err &rhs) const
+			{
+				if (m_total_err != rhs.m_total_err)
+					return m_total_err < rhs.m_total_err;
+
+				if (m_block_index != rhs.m_block_index)
+					return m_block_index < rhs.m_block_index;
+
+				return m_subblock_index < rhs.m_subblock_index;
+			}
+		};
+
+		// The sorted subblock endpoint quant error for each endpoint cluster
+		basisu::vector<subblock_endpoint_quant_err> m_subblock_endpoint_quant_err_vec;
+
+		std::mutex m_lock;
+
+		//-----------------------------------------------------------------------------
+
+		void init_etc1_images();
+		bool init_global_codebooks();
+		void init_endpoint_training_vectors();
+		void dump_endpoint_clusterization_visualization(const char *pFilename, bool vis_endpoint_colors);
+		void generate_endpoint_clusters();
+		void compute_endpoint_subblock_error_vec();
+		void introduce_new_endpoint_clusters();
+		void generate_endpoint_codebook(uint32_t step);
+		uint32_t refine_endpoint_clusterization();
+		void eliminate_redundant_or_empty_endpoint_clusters();
+		void generate_block_endpoint_clusters();
+		void compute_endpoint_clusters_within_each_parent_cluster();
+		void compute_selector_clusters_within_each_parent_cluster();
+		void create_initial_packed_texture();
+		void generate_selector_clusters();
+		void create_optimized_selector_codebook(uint32_t iter);
+		void find_optimal_selector_clusters_for_each_block();
+		uint32_t refine_block_endpoints_given_selectors();
+		void finalize();
+		bool validate_output() const;
+		void introduce_special_selector_clusters();
+		void optimize_selector_codebook();
+		bool check_etc1s_constraints() const;
+	};
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.cpp b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.cpp
new file mode 100644
index 0000000000..102fc24980
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.cpp
@@ -0,0 +1,71 @@
+// basisu_global_selector_palette_helpers.cpp
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_global_selector_palette_helpers.h"
+
+namespace basisu
+{
+	uint64_t etc1_global_selector_codebook_find_best_entry(const basist::etc1_global_selector_codebook &codebook,
+		uint32_t num_src_pixel_blocks, const pixel_block *pSrc_pixel_blocks, const etc_block *pBlock_endpoints,
+		uint32_t &palette_index, basist::etc1_global_palette_entry_modifier &palette_modifier,
+		bool perceptual, uint32_t max_pal_entries, uint32_t max_modifiers)
+	{
+		// Brute-force search over every (palette entry, modifier) pair: each pair's selectors are applied
+		// to all source blocks, and the pair with the smallest summed ETC1 error is stored in
+		// palette_index / palette_modifier. Returns that error. A zero limit means "use the full range".
+		if (!max_pal_entries)
+			max_pal_entries = codebook.size();
+		if (!max_modifiers)
+			max_modifiers = basist::etc1_global_palette_entry_modifier::cTotalValues;
+
+		uint64_t lowest_err = UINT64_MAX;
+		uint32_t winning_index = 0;
+		basist::etc1_global_palette_entry_modifier winning_modifier;
+
+		for (uint32_t entry = 0; entry < max_pal_entries; entry++)
+		{
+			for (uint32_t mod = 0; mod < max_modifiers; mod++)
+			{
+				const basist::etc1_global_palette_entry_modifier candidate_modifier(mod);
+				const basist::etc1_selector_palette_entry candidate(codebook.get_entry(entry, candidate_modifier));
+
+				uint64_t err_sum = 0;
+				for (uint32_t blk = 0; blk < num_src_pixel_blocks; blk++)
+				{
+					// Start from the block's endpoints and overwrite all 16 selectors.
+					etc_block trial(pBlock_endpoints[blk]);
+					for (uint32_t texel = 0; texel < 16; texel++)
+						trial.set_selector(texel & 3, texel >> 2, candidate(texel & 3, texel >> 2));
+
+					const basisu::color_rgba *pSrc = reinterpret_cast<const basisu::color_rgba *>(pSrc_pixel_blocks[blk].get_ptr());
+					err_sum += trial.evaluate_etc1_error(pSrc, perceptual);
+
+					if (err_sum >= lowest_err)
+						break; // this candidate can no longer beat the current best
+				}
+
+				if (err_sum < lowest_err)
+				{
+					lowest_err = err_sum;
+					winning_index = entry;
+					winning_modifier = candidate_modifier;
+				}
+			}
+		}
+		palette_index = winning_index;
+		palette_modifier = winning_modifier;
+		return lowest_err;
+	}
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.h b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.h
new file mode 100644
index 0000000000..7c35439df8
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_global_selector_palette_helpers.h
@@ -0,0 +1,46 @@
+// File: basisu_global_selector_palette_helpers.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "../transcoder/basisu.h"
+#include "basisu_etc.h"
+#include "../transcoder/basisu_global_selector_palette.h"
+
+namespace basisu
+{
+	const uint32_t cPixelBlockWidth = 4;
+	const uint32_t cPixelBlockHeight = 4;
+	const uint32_t cPixelBlockTotalPixels = cPixelBlockWidth * cPixelBlockHeight;
+
+	struct pixel_block
+	{
+		color_rgba m_pixels[cPixelBlockHeight][cPixelBlockWidth]; // [y][x]
+		// Texel access by column x and row y; the coordinates are range-checked with assert().
+		const color_rgba &operator() (uint32_t x, uint32_t y) const { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); const color_rgba *pRow = m_pixels[y]; return pRow[x]; }
+		color_rgba &operator() (uint32_t x, uint32_t y) { assert((x < cPixelBlockWidth) && (y < cPixelBlockHeight)); color_rgba *pRow = m_pixels[y]; return pRow[x]; }
+		// Pointer to the first texel; the rows follow each other contiguously.
+		const color_rgba *get_ptr() const { return m_pixels[0]; }
+		color_rgba *get_ptr() { return m_pixels[0]; }
+		// Resets the whole block through clear_obj().
+		void clear() { clear_obj(*this); }
+	};
+	typedef basisu::vector<pixel_block> pixel_block_vec;
+
+	uint64_t etc1_global_selector_codebook_find_best_entry(const basist::etc1_global_selector_codebook &codebook,
+		uint32_t num_src_pixel_blocks, const pixel_block *pSrc_pixel_blocks, const etc_block *pBlock_endpoints,
+		uint32_t &palette_index, basist::etc1_global_palette_entry_modifier &palette_modifier,
+		bool perceptual, uint32_t max_pal_entries, uint32_t max_modifiers);
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
new file mode 100644
index 0000000000..3f9fb67bdd
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp
@@ -0,0 +1,1622 @@
+// basisu_gpu_texture.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_gpu_texture.h"
+#include "basisu_enc.h"
+#include "basisu_pvrtc1_4.h"
+#include "basisu_astc_decomp.h"
+#include "basisu_bc7enc.h"
+
+namespace basisu
+{
+	void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels)
+	{
+		// Decodes one 8-byte EAC alpha block into the alpha channel of 16 pixels.
+		//   pBlock_bits: the packed block, read as an eac_a8_block.
+		//   pPixels:     16 pixels in row-major order (index = y * 4 + x); only .a is written,
+		//                the color channels are left untouched.
+
+		static_assert(sizeof(eac_a8_block) == 8, "sizeof(eac_a8_block) == 8");
+
+		const eac_a8_block &blk = *static_cast<const eac_a8_block *>(pBlock_bits);
+
+		// Modifier table chosen by the block's table index.
+		const int8_t *pModifiers = g_etc2_eac_tables[blk.m_table];
+
+		// Packed selectors, fetched once and shared by every get_selector() call below.
+		const uint64_t packed_selectors = blk.get_selector_bits();
+
+		const int32_t base_alpha = blk.m_base;
+		const int32_t multiplier = blk.m_multiplier;
+
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			color_rgba *pRow = pPixels + (y * 4);
+
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				// alpha = clamp255(base + table[selector] * multiplier)
+				const int32_t modifier = pModifiers[blk.get_selector(x, y, packed_selectors)];
+
+				pRow[x].a = clamp255(base_alpha + modifier * multiplier);
+			}
+		}
+	}
+
+	struct bc1_block
+	{
+		enum { cTotalEndpointBytes = 2, cTotalSelectorBytes = 4 };
+
+		uint8_t m_low_color[cTotalEndpointBytes];
+		uint8_t m_high_color[cTotalEndpointBytes];
+		uint8_t m_selectors[cTotalSelectorBytes];
+		// Each 16-bit endpoint is stored low byte first.
+		inline uint32_t get_high_color() const { const uint32_t lsb = m_high_color[0], msb = m_high_color[1]; return lsb | (msb << 8U); }
+		inline uint32_t get_low_color() const { const uint32_t lsb = m_low_color[0], msb = m_low_color[1]; return lsb | (msb << 8U); }
+		// Splits a 5:6:5 color and widens each field to 8 bits by repeating its top bits.
+		static void unpack_color(uint32_t c, uint32_t &r, uint32_t &g, uint32_t &b)
+		{
+			const uint32_t r5 = (c >> 11) & 31;
+			const uint32_t g6 = (c >> 5) & 63;
+			const uint32_t b5 = c & 31;
+
+			r = (r5 << 3) | (r5 >> 2);
+			g = (g6 << 2) | (g6 >> 4);
+			b = (b5 << 3) | (b5 >> 2);
+		}
+		// One selector byte per row, two bits per texel starting at bit 0.
+		inline uint32_t get_selector(uint32_t x, uint32_t y) const { assert((x < 4U) && (y < 4U)); const uint32_t row = m_selectors[y]; return (row >> (x * 2)) & 3; }
+	};
+
+	// Returns true if the block uses 3 color punchthrough alpha mode.
+	bool unpack_bc1(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha)
+	{
+		// Decodes one BC1 block into 16 row-major pixels.
+		//   set_alpha: true copies whole palette entries (alpha included); false writes
+		//              only RGB through set_rgb().
+		// The result is true when the 3-color palette with a transparent entry was used.
+		static_assert(sizeof(bc1_block) == 8, "sizeof(bc1_block) == 8");
+
+		const bc1_block &blk = *static_cast<const bc1_block *>(pBlock_bits);
+
+		const uint32_t low = blk.get_low_color();
+		const uint32_t high = blk.get_high_color();
+
+		// Expand both 5:6:5 endpoints to 8 bits per channel.
+		uint32_t lr, lg, lb;
+		uint32_t hr, hg, hb;
+		bc1_block::unpack_color(low, lr, lg, lb);
+		bc1_block::unpack_color(high, hr, hg, hb);
+
+		color_rgba palette[4];
+		palette[0].set_noclamp_rgba(lr, lg, lb, 255);
+		palette[1].set_noclamp_rgba(hr, hg, hb, 255);
+
+		// low > high selects four opaque colors; otherwise entry 3 becomes transparent black.
+		const bool used_punchthrough = !(low > high);
+
+		if (used_punchthrough)
+		{
+			palette[2].set_noclamp_rgba((lr + hr) / 2, (lg + hg) / 2, (lb + hb) / 2, 255);
+			palette[3].set_noclamp_rgba(0, 0, 0, 0);
+		}
+		else
+		{
+			palette[2].set_noclamp_rgba((lr * 2 + hr) / 3, (lg * 2 + hg) / 3, (lb * 2 + hb) / 3, 255);
+			palette[3].set_noclamp_rgba((hr * 2 + lr) / 3, (hg * 2 + lg) / 3, (hb * 2 + lb) / 3, 255);
+		}
+
+		// Every texel's selector indexes the 4-entry palette.
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			color_rgba *pRow = pPixels + (y * 4);
+
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				const uint32_t sel = blk.get_selector(x, y);
+
+				if (set_alpha)
+					pRow[x] = palette[sel];
+				else
+					pRow[x].set_rgb(palette[sel]);
+			}
+		}
+
+		return used_punchthrough;
+	}
+
+	bool unpack_bc1_nv(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha)
+	{
+		// BC1 decoder variant with its own fixed-point endpoint expansion and interpolation
+		// (NOTE(review): the "nv" suffix presumably refers to NVIDIA hardware - confirm).
+		//   set_alpha: true copies whole palette entries (alpha included); false writes
+		//              only RGB through set_rgb().
+		// The result is true when the 3-color palette with a transparent entry was used.
+		static_assert(sizeof(bc1_block) == 8, "sizeof(bc1_block) == 8");
+
+		const bc1_block &blk = *static_cast<const bc1_block *>(pBlock_bits);
+
+		const uint32_t low = blk.get_low_color();
+		const uint32_t high = blk.get_high_color();
+
+		const int r0 = (low >> 11) & 31;
+		const int g0 = (low >> 5) & 63;
+		const int b0 = low & 31;
+		const int r1 = (high >> 11) & 31;
+		const int g1 = (high >> 5) & 63;
+		const int b1 = high & 31;
+
+		color_rgba palette[4];
+		// Endpoints: red/blue are scaled by 3 * 22 / 8, green uses bit replication.
+		palette[0].r = (uint8_t)((3 * r0 * 22) / 8);
+		palette[0].g = (uint8_t)((g0 << 2) | (g0 >> 4));
+		palette[0].b = (uint8_t)((3 * b0 * 22) / 8);
+		palette[0].a = 0xFF;
+
+		palette[1].r = (uint8_t)((3 * r1 * 22) / 8);
+		palette[1].g = (uint8_t)((g1 << 2) | (g1 >> 4));
+		palette[1].b = (uint8_t)((3 * b1 * 22) / 8);
+		palette[1].a = 0xFF;
+
+		// Green is interpolated from the expanded 8-bit endpoint values.
+		const int green0 = palette[0].g;
+		const int green1 = palette[1].g;
+		const int gdiff = green1 - green0;
+
+		const bool used_punchthrough = !(low > high);
+
+		if (used_punchthrough)
+		{
+			palette[2].r = (uint8_t)(((r0 + r1) * 33) / 8);
+			palette[2].g = (uint8_t)((256 * green0 + gdiff/4 + 128 + gdiff * 128) / 256);
+			palette[2].b = (uint8_t)(((b0 + b1) * 33) / 8);
+			palette[2].a = 0xFF;
+			palette[3].set_noclamp_rgba(0, 0, 0, 0);
+		}
+		else
+		{
+			palette[2].r = (uint8_t)(((2 * r0 + r1) * 22) / 8);
+			palette[2].g = (uint8_t)(((256 * green0 + gdiff/4 + 128 + gdiff * 80) / 256));
+			palette[2].b = (uint8_t)(((2 * b0 + b1) * 22) / 8);
+			palette[2].a = 0xFF;
+
+			palette[3].r = (uint8_t)(((2 * r1 + r0) * 22) / 8);
+			palette[3].g = (uint8_t)((256 * green1 - gdiff/4 + 128 - gdiff * 80) / 256);
+			palette[3].b = (uint8_t)(((2 * b1 + b0) * 22) / 8);
+			palette[3].a = 0xFF;
+		}
+
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			color_rgba *pRow = pPixels + (y * 4);
+
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				const uint32_t sel = blk.get_selector(x, y);
+
+				if (set_alpha)
+					pRow[x] = palette[sel];
+				else
+					pRow[x].set_rgb(palette[sel]);
+			}
+		}
+
+		return used_punchthrough;
+	}
+
+	static inline int interp_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); const int weighted_sum = 43 * c0 + 21 * c1 + 32; return weighted_sum >> 6; } // (43 * c0 + 21 * c1 + 32) >> 6, weighted toward c0
+	static inline int interp_half_5_6_amd(int c0, int c1) { assert(c0 < 256 && c1 < 256); const int biased_sum = c0 + c1 + 1; return biased_sum >> 1; } // (c0 + c1 + 1) >> 1
+
+	bool unpack_bc1_amd(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha)
+	{
+		// BC1 decoder variant whose intermediate colors come from interp_5_6_amd() and
+		// interp_half_5_6_amd() instead of plain thirds / halves.
+		//   set_alpha: true copies whole palette entries (alpha included); false writes
+		//              only RGB through set_rgb().
+		// The result is true when the 3-color palette with a transparent entry was used.
+		const bc1_block &blk = *static_cast<const bc1_block *>(pBlock_bits);
+
+		const uint32_t low = blk.get_low_color();
+		const uint32_t high = blk.get_high_color();
+
+		// Expand both 5:6:5 endpoints to 8 bits per channel.
+		uint32_t lr, lg, lb;
+		uint32_t hr, hg, hb;
+		bc1_block::unpack_color(low, lr, lg, lb);
+		bc1_block::unpack_color(high, hr, hg, hb);
+
+		color_rgba palette[4];
+		palette[0].set_noclamp_rgba(lr, lg, lb, 255);
+		palette[1].set_noclamp_rgba(hr, hg, hb, 255);
+
+		// low > high selects four opaque colors; otherwise entry 3 becomes transparent black.
+		const bool used_punchthrough = !(low > high);
+
+		if (used_punchthrough)
+		{
+			palette[2].set_noclamp_rgba(interp_half_5_6_amd(lr, hr), interp_half_5_6_amd(lg, hg), interp_half_5_6_amd(lb, hb), 255);
+			palette[3].set_noclamp_rgba(0, 0, 0, 0);
+		}
+		else
+		{
+			palette[2].set_noclamp_rgba(interp_5_6_amd(lr, hr), interp_5_6_amd(lg, hg), interp_5_6_amd(lb, hb), 255);
+			palette[3].set_noclamp_rgba(interp_5_6_amd(hr, lr), interp_5_6_amd(hg, lg), interp_5_6_amd(hb, lb), 255);
+		}
+
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			color_rgba *pRow = pPixels + (y * 4);
+
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				const uint32_t sel = blk.get_selector(x, y);
+
+				if (set_alpha)
+					pRow[x] = palette[sel];
+				else
+					pRow[x].set_rgb(palette[sel]);
+			}
+		}
+
+		return used_punchthrough;
+	}
+
+	struct bc4_block
+	{
+		enum { cBC4SelectorBits = 3, cTotalSelectorBytes = 6, cMaxSelectorValues = 8 };
+		uint8_t m_endpoints[2];
+
+		uint8_t m_selectors[cTotalSelectorBytes];
+
+		inline uint32_t get_low_alpha() const { return m_endpoints[0]; }
+		inline uint32_t get_high_alpha() const { return m_endpoints[1]; }
+		// True when the 6-value palette (with explicit 0 and 255 entries) applies.
+		inline bool is_alpha6_block() const { return !(get_low_alpha() > get_high_alpha()); }
+
+		// Assembles the 6 selector bytes into one 48-bit value, m_selectors[0] in the lowest bits.
+		inline uint64_t get_selector_bits() const
+		{
+			uint64_t packed = 0;
+			for (uint32_t i = 0; i < cTotalSelectorBytes; i++)
+				packed |= ((uint64_t)m_selectors[i]) << (i * 8U);
+			return packed;
+		}
+
+		// Extracts the 3-bit selector of texel (x, y) from the value returned by get_selector_bits().
+		inline uint32_t get_selector(uint32_t x, uint32_t y, uint64_t selector_bits) const
+		{
+			assert((x < 4U) && (y < 4U));
+			const uint32_t shift = ((y * 4) + x) * cBC4SelectorBits;
+			return (uint32_t)((selector_bits >> shift) & (cMaxSelectorValues - 1));
+		}
+
+		// 6-value palette: both endpoints, four interpolated steps, then literal 0 and 255.
+		// Always returns 6.
+		static inline uint32_t get_block_values6(uint8_t *pDst, uint32_t l, uint32_t h)
+		{
+			pDst[0] = static_cast<uint8_t>(l);
+			pDst[1] = static_cast<uint8_t>(h);
+			for (uint32_t step = 1; step <= 4; step++)
+				pDst[1 + step] = static_cast<uint8_t>((l * (5 - step) + h * step) / 5);
+			pDst[6] = 0;
+			pDst[7] = 255;
+			return 6;
+		}
+
+		// 8-value palette: both endpoints followed by six interpolated steps. Always returns 8.
+		static inline uint32_t get_block_values8(uint8_t *pDst, uint32_t l, uint32_t h)
+		{
+			pDst[0] = static_cast<uint8_t>(l);
+			pDst[1] = static_cast<uint8_t>(h);
+			for (uint32_t step = 1; step <= 6; step++)
+				pDst[1 + step] = static_cast<uint8_t>((l * (7 - step) + h * step) / 7);
+			return 8;
+		}
+
+		// Fills pDst[0..7]: the 8-value palette when l > h, the 6-value palette otherwise.
+		static inline uint32_t get_block_values(uint8_t *pDst, uint32_t l, uint32_t h)
+		{
+			return (l > h) ? get_block_values8(pDst, l, h) : get_block_values6(pDst, l, h);
+		}
+	};
+
+	void unpack_bc4(const void *pBlock_bits, uint8_t *pPixels, uint32_t stride)
+	{
+		// Decodes one 8-byte BC4 block into a single 8-bit channel of a 4x4 pixel block.
+		//   pPixels: address of the channel in the first pixel.
+		//   stride:  distance in bytes between the same channel of consecutive pixels.
+		static_assert(sizeof(bc4_block) == 8, "sizeof(bc4_block) == 8");
+
+		const bc4_block &blk = *static_cast<const bc4_block *>(pBlock_bits);
+
+		uint8_t palette[8];
+		bc4_block::get_block_values(palette, blk.get_low_alpha(), blk.get_high_alpha());
+
+		const uint64_t packed_selectors = blk.get_selector_bits();
+
+		// Each row advances by four pixels, each column by one pixel.
+		for (uint32_t y = 0; y < 4; y++, pPixels += (stride * 4U))
+			for (uint32_t x = 0; x < 4; x++)
+				pPixels[stride * x] = palette[blk.get_selector(x, y, packed_selectors)];
+	}
+	
+	// Returns false if the block uses 3-color punchthrough alpha mode, which isn't supported on some GPUs for BC3.
+	bool unpack_bc3(const void *pBlock_bits, color_rgba *pPixels)
+	{
+		// Layout: a BC4-style alpha block first, followed by a BC1 color block.
+		const uint8_t *pColor_block = static_cast<const uint8_t *>(pBlock_bits) + sizeof(bc4_block);
+
+		// Color is decoded first (it also writes alpha); the BC4 pass then overwrites every .a.
+		const bool used_punchthrough = unpack_bc1(pColor_block, pPixels, true);
+		unpack_bc4(pBlock_bits, &pPixels[0].a, sizeof(color_rgba));
+
+		return !used_punchthrough;
+	}
+
+	// writes RG
+	void unpack_bc5(const void *pBlock_bits, color_rgba *pPixels)
+	{
+		unpack_bc4(static_cast<const bc4_block *>(pBlock_bits), &pPixels->r, sizeof(color_rgba)); // first BC4 block -> red
+		unpack_bc4(static_cast<const bc4_block *>(pBlock_bits) + 1, &pPixels->g, sizeof(color_rgba)); // second BC4 block -> green
+	}
+
+	// ATC isn't officially documented, so I'm assuming these references:
+	// http://www.guildsoftware.com/papers/2012.Converting.DXTC.to.ATC.pdf
+	// https://github.com/Triang3l/S3TConv/blob/master/s3tconv_atitc.c
+	// The paper incorrectly says the ATC lerp factors are 1/3 and 2/3, but they are actually 3/8 and 5/8.
+	void unpack_atc(const void* pBlock_bits, color_rgba* pPixels)
+	{
+		// Decodes one 8-byte ATC RGB block into 16 opaque pixels; selector i picks pPixels[i].
+		const uint8_t* pBytes = static_cast<const uint8_t*>(pBlock_bits);
+
+		const uint16_t color0 = pBytes[0] | (pBytes[1] << 8U);
+		const uint16_t color1 = pBytes[2] | (pBytes[3] << 8U);
+		// Widen each byte first: pBytes[7] << 24 on a promoted int overflows when bit 7 is set.
+		uint32_t sels = (uint32_t)pBytes[4] | ((uint32_t)pBytes[5] << 8U) | ((uint32_t)pBytes[6] << 16U) | ((uint32_t)pBytes[7] << 24U);
+
+		// Bit 15 of color0 switches between the two palette constructions below.
+		const bool mode = (color0 & 0x8000) != 0;
+
+		color_rgba c[4];
+
+		// color0 carries 5:5:5 and color1 5:6:5 components; both are expanded to 8 bits.
+		c[0].set((color0 >> 10) & 31, (color0 >> 5) & 31, color0 & 31, 255);
+		c[0].r = (c[0].r << 3) | (c[0].r >> 2);
+		c[0].g = (c[0].g << 3) | (c[0].g >> 2);
+		c[0].b = (c[0].b << 3) | (c[0].b >> 2);
+
+		c[3].set((color1 >> 11) & 31, (color1 >> 5) & 63, color1 & 31, 255);
+		c[3].r = (c[3].r << 3) | (c[3].r >> 2);
+		c[3].g = (c[3].g << 2) | (c[3].g >> 4);
+		c[3].b = (c[3].b << 3) | (c[3].b >> 2);
+
+		if (mode)
+		{
+			c[1].set(basisu::maximum(0, c[0].r - (c[3].r >> 2)), basisu::maximum(0, c[0].g - (c[3].g >> 2)), basisu::maximum(0, c[0].b - (c[3].b >> 2)), 255);
+			c[2] = c[0];
+			c[0].set(0, 0, 0, 255);
+		}
+		else
+		{
+			// 5/8 : 3/8 blends; alpha must be set here since nothing else assigns c[1].a / c[2].a.
+			c[1].r = (c[0].r * 5 + c[3].r * 3) >> 3;
+			c[1].g = (c[0].g * 5 + c[3].g * 3) >> 3;
+			c[1].b = (c[0].b * 5 + c[3].b * 3) >> 3;
+			c[1].a = 255;
+			c[2].r = (c[0].r * 3 + c[3].r * 5) >> 3;
+			c[2].g = (c[0].g * 3 + c[3].g * 5) >> 3;
+			c[2].b = (c[0].b * 3 + c[3].b * 5) >> 3;
+			c[2].a = 255;
+		}
+
+		for (uint32_t i = 0; i < 16; i++, sels >>= 2)
+			pPixels[i] = c[sels & 3];
+	}
+
+	// BC7 mode 0-7 decompression.
+	// Instead of one monster routine to unpack all the BC7 modes, we're lumping the 3 subset, 2 subset, 1 subset, and dual plane modes together into simple shared routines.
+
+	static inline uint32_t bc7_dequant(uint32_t val, uint32_t pbit, uint32_t val_bits) { assert(val < (1U << val_bits)); assert(pbit < 2); assert(val_bits >= 4 && val_bits <= 8); const uint32_t total_bits = val_bits + 1; val = ((val << 1) | pbit) << (8 - total_bits); val |= (val >> total_bits); assert(val <= 255); return val; } // append the p-bit, left-align to 8 bits, then copy the top bits into the vacated low bits
+	static inline uint32_t bc7_dequant(uint32_t val, uint32_t val_bits) { assert(val < (1U << val_bits)); assert(val_bits >= 4 && val_bits <= 8); val = (val << (8 - val_bits)) | ((val << (8 - val_bits)) >> val_bits); assert(val <= 255); return val; } // left-align to 8 bits, then copy the top bits into the vacated low bits
+
+	static inline uint32_t bc7_interp2(uint32_t l, uint32_t h, uint32_t w) { assert(w < 4); const uint32_t wt = basist::g_bc7_weights2[w]; return (l * (64 - wt) + h * wt + 32) >> 6; } // blend of l and h using the 2-bit weight table, rounded
+	static inline uint32_t bc7_interp3(uint32_t l, uint32_t h, uint32_t w) { assert(w < 8); const uint32_t wt = basist::g_bc7_weights3[w]; return (l * (64 - wt) + h * wt + 32) >> 6; } // blend of l and h using the 3-bit weight table, rounded
+	static inline uint32_t bc7_interp4(uint32_t l, uint32_t h, uint32_t w) { assert(w < 16); const uint32_t wt = basist::g_bc7_weights4[w]; return (l * (64 - wt) + h * wt + 32) >> 6; } // blend of l and h using the 4-bit weight table, rounded
+	static inline uint32_t bc7_interp(uint32_t l, uint32_t h, uint32_t w, uint32_t bits)
+	{
+		// Dispatches to the interpolator for 2-, 3- or 4-bit weights; any other width yields 0.
+		assert(l <= 255 && h <= 255);
+
+		if (bits == 2)
+			return bc7_interp2(l, h, w);
+		if (bits == 3)
+			return bc7_interp3(l, h, w);
+		if (bits == 4)
+			return bc7_interp4(l, h, w);
+		return 0;
+	}
+		
+	bool unpack_bc7_mode0_2(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels)
+	{
+		//const uint32_t SUBSETS = 3;
+		const uint32_t ENDPOINTS = 6;
+		const uint32_t COMPS = 3;
+		const uint32_t WEIGHT_BITS = (mode == 0) ? 3 : 2;
+		const uint32_t ENDPOINT_BITS = (mode == 0) ? 4 : 5;
+		const uint32_t PBITS = (mode == 0) ? 6 : 0;
+		const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
+		// Three-subset, RGB-only BC7 decode (modes 0 and 2) into 16 opaque pixels. bit_offset is the running read position.
+		uint32_t bit_offset = 0;
+		const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits);
+		// Returns false unless the first (mode + 1) bits equal 1 << mode.
+		if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false;
+		// Partition index: 4 bits in mode 0, 6 bits in mode 2.
+		const uint32_t part = read_bits32(pBuf, bit_offset, (mode == 0) ? 4 : 6);
+		// Raw endpoint values are read component by component: one component of all endpoints, then the next.
+		color_rgba endpoints[ENDPOINTS];
+		for (uint32_t c = 0; c < COMPS; c++)
+			for (uint32_t e = 0; e < ENDPOINTS; e++)
+				endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS);
+		// Mode 0 reads one p-bit per endpoint; mode 2 reads none and never consults pbits[].
+		uint32_t pbits[6];
+		for (uint32_t p = 0; p < PBITS; p++)
+			pbits[p] = read_bits32(pBuf, bit_offset, 1);
+		// Weight 0 and the two anchor positions looked up for this partition are read with one bit less.
+		uint32_t weights[16];
+		for (uint32_t i = 0; i < 16; i++)
+			weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == basist::g_bc7_table_anchor_index_third_subset_1[part]) || (i == basist::g_bc7_table_anchor_index_third_subset_2[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS);
+		// All fields together must account for exactly 128 bits.
+		assert(bit_offset == 128);
+		// Expand the color components to 8 bits (with the p-bit when the mode has one) and set alpha to 255.
+		for (uint32_t e = 0; e < ENDPOINTS; e++)
+			for (uint32_t c = 0; c < 4; c++)
+				endpoints[e][c] = (uint8_t)((c == 3) ? 255 : (PBITS ? bc7_dequant(endpoints[e][c], pbits[e], ENDPOINT_BITS) : bc7_dequant(endpoints[e][c], ENDPOINT_BITS)));
+		// Interpolated colors for every weight value of each subset; subset s uses endpoints 2s and 2s + 1.
+		color_rgba block_colors[3][8];
+		for (uint32_t s = 0; s < 3; s++)
+			for (uint32_t i = 0; i < WEIGHT_VALS; i++)
+			{
+				for (uint32_t c = 0; c < 3; c++)
+					block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS);
+				block_colors[s][i][3] = 255;
+			}
+		// Each pixel: the partition table picks the subset, the pixel's weight picks the color.
+		for (uint32_t i = 0; i < 16; i++)
+			pPixels[i] = block_colors[basist::g_bc7_partition3[part * 16 + i]][weights[i]];
+
+		return true;
+	}
+
+	bool unpack_bc7_mode1_3_7(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels)
+	{
+		//const uint32_t SUBSETS = 2;
+		const uint32_t ENDPOINTS = 4;
+		const uint32_t COMPS = (mode == 7) ? 4 : 3;
+		const uint32_t WEIGHT_BITS = (mode == 1) ? 3 : 2;
+		const uint32_t ENDPOINT_BITS = (mode == 7) ? 5 : ((mode == 1) ? 6 : 7);
+		const uint32_t PBITS = (mode == 1) ? 2 : 4;
+		const uint32_t SHARED_PBITS = (mode == 1) ? true : false;
+		const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
+		// Two-subset BC7 decode (modes 1, 3 and 7); only mode 7 reads alpha endpoints. bit_offset is the running read position.
+		uint32_t bit_offset = 0;
+		const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits);
+		// Returns false unless the first (mode + 1) bits equal 1 << mode.
+		if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false;
+		// 6-bit partition index.
+		const uint32_t part = read_bits32(pBuf, bit_offset, 6);
+		// Raw endpoint values are read component by component: one component of all endpoints, then the next.
+		color_rgba endpoints[ENDPOINTS];
+		for (uint32_t c = 0; c < COMPS; c++)
+			for (uint32_t e = 0; e < ENDPOINTS; e++)
+				endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, ENDPOINT_BITS);
+		// Mode 1 reads 2 p-bits, each shared by an endpoint pair; modes 3 and 7 read one p-bit per endpoint.
+		uint32_t pbits[4];
+		for (uint32_t p = 0; p < PBITS; p++)
+			pbits[p] = read_bits32(pBuf, bit_offset, 1);
+		// Weight 0 and the second subset's anchor position for this partition are read with one bit less.
+		uint32_t weights[16];
+		for (uint32_t i = 0; i < 16; i++)
+			weights[i] = read_bits32(pBuf, bit_offset, ((!i) || (i == basist::g_bc7_table_anchor_index_second_subset[part])) ? (WEIGHT_BITS - 1) : WEIGHT_BITS);
+		
+		assert(bit_offset == 128);
+		// Modes 1 and 3 force component 3 (alpha) to 255; in mode 7 the compared index is 4, which c never reaches, so alpha is dequantized too.
+		for (uint32_t e = 0; e < ENDPOINTS; e++)
+			for (uint32_t c = 0; c < 4; c++)
+				endpoints[e][c] = (uint8_t)((c == ((mode == 7U) ? 4U : 3U)) ? 255 : bc7_dequant(endpoints[e][c], pbits[SHARED_PBITS ? (e >> 1) : e], ENDPOINT_BITS));
+		// Interpolated colors for every weight value of each subset; subset s uses endpoints 2s and 2s + 1.
+		color_rgba block_colors[2][8];
+		for (uint32_t s = 0; s < 2; s++)
+			for (uint32_t i = 0; i < WEIGHT_VALS; i++)
+			{
+				for (uint32_t c = 0; c < COMPS; c++)
+					block_colors[s][i][c] = (uint8_t)bc7_interp(endpoints[s * 2 + 0][c], endpoints[s * 2 + 1][c], i, WEIGHT_BITS);
+				block_colors[s][i][3] = (COMPS == 3) ? 255 : block_colors[s][i][3];
+			}
+		// Each pixel: the partition table picks the subset, the pixel's weight picks the color.
+		for (uint32_t i = 0; i < 16; i++)
+			pPixels[i] = block_colors[basist::g_bc7_partition2[part * 16 + i]][weights[i]];
+
+		return true;
+	}
+
+	bool unpack_bc7_mode4_5(uint32_t mode, const void* pBlock_bits, color_rgba* pPixels)
+	{
+		const uint32_t ENDPOINTS = 2;
+		const uint32_t COMPS = 4;
+		const uint32_t WEIGHT_BITS = 2;
+		const uint32_t A_WEIGHT_BITS = (mode == 4) ? 3 : 2;
+		const uint32_t ENDPOINT_BITS = (mode == 4) ? 5 : 7;
+		const uint32_t A_ENDPOINT_BITS = (mode == 4) ? 6 : 8;
+		//const uint32_t WEIGHT_VALS = 1 << WEIGHT_BITS;
+		//const uint32_t A_WEIGHT_VALS = 1 << A_WEIGHT_BITS;
+		// BC7 decode for modes 4 and 5: one endpoint pair, with separate weight arrays for color and alpha.
+		uint32_t bit_offset = 0;
+		const uint8_t* pBuf = static_cast<const uint8_t*>(pBlock_bits);
+		// Returns false unless the first (mode + 1) bits equal 1 << mode.
+		if (read_bits32(pBuf, bit_offset, mode + 1) != (1U << mode)) return false;
+		// 2-bit component rotation, then a 1-bit index mode that is only read in mode 4 (0 in mode 5).
+		const uint32_t comp_rot = read_bits32(pBuf, bit_offset, 2);
+		const uint32_t index_mode = (mode == 4) ? read_bits32(pBuf, bit_offset, 1) : 0;
+		// Raw endpoint values, read component by component; component 3 (alpha) uses A_ENDPOINT_BITS.
+		color_rgba endpoints[ENDPOINTS];
+		for (uint32_t c = 0; c < COMPS; c++)
+			for (uint32_t e = 0; e < ENDPOINTS; e++)
+				endpoints[e][c] = (uint8_t)read_bits32(pBuf, bit_offset, (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS);
+		// weight_bits[0] is the color weight width and weight_bits[1] the alpha weight width; index_mode swaps the two.
+		const uint32_t weight_bits[2] = { index_mode ? A_WEIGHT_BITS : WEIGHT_BITS,  index_mode ? WEIGHT_BITS : A_WEIGHT_BITS };
+		
+		uint32_t weights[16], a_weights[16];
+		// The first stored array fills weights[] (a_weights[] when index_mode is set), the second fills the other; entry 0 of each has one bit less.
+		for (uint32_t i = 0; i < 16; i++)
+			(index_mode ? a_weights : weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[index_mode] - ((!i) ? 1 : 0));
+
+		for (uint32_t i = 0; i < 16; i++)
+			(index_mode ? weights : a_weights)[i] = read_bits32(pBuf, bit_offset, weight_bits[1 - index_mode] - ((!i) ? 1 : 0));
+
+		assert(bit_offset == 128);
+		// Endpoints are expanded with the overload of bc7_dequant() that takes no p-bit.
+		for (uint32_t e = 0; e < ENDPOINTS; e++)
+			for (uint32_t c = 0; c < 4; c++)
+				endpoints[e][c] = (uint8_t)bc7_dequant(endpoints[e][c], (c == 3) ? A_ENDPOINT_BITS : ENDPOINT_BITS);
+		// The color and alpha ramps share block_colors[] but are filled independently, each with its own weight width.
+		color_rgba block_colors[8];
+		for (uint32_t i = 0; i < (1U << weight_bits[0]); i++)
+			for (uint32_t c = 0; c < 3; c++)
+				block_colors[i][c] = (uint8_t)bc7_interp(endpoints[0][c], endpoints[1][c], i, weight_bits[0]);
+
+		for (uint32_t i = 0; i < (1U << weight_bits[1]); i++)
+			block_colors[i][3] = (uint8_t)bc7_interp(endpoints[0][3], endpoints[1][3], i, weight_bits[1]);
+		// Combine color and alpha per pixel; a non-zero comp_rot then swaps alpha with component (comp_rot - 1).
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			pPixels[i] = block_colors[weights[i]];
+			pPixels[i].a = block_colors[a_weights[i]].a;
+			if (comp_rot >= 1)
+				std::swap(pPixels[i].a, pPixels[i].m_comps[comp_rot - 1]);
+		}
+
+		return true;
+	}
+
+	struct bc7_mode_6 // Bitfield overlay of a BC7 mode 6 block as two 64-bit words; unpack_bc7_mode6() static_asserts that sizeof is 16.
+	{
+		struct
+		{
+			uint64_t m_mode : 7; // mode field; unpack_bc7_mode6() requires it to equal 1 << 6
+			uint64_t m_r0 : 7; // 7-bit endpoint components: suffix 0 = first endpoint, suffix 1 = second endpoint
+			uint64_t m_r1 : 7;
+			uint64_t m_g0 : 7;
+			uint64_t m_g1 : 7;
+			uint64_t m_b0 : 7;
+			uint64_t m_b1 : 7;
+			uint64_t m_a0 : 7;
+			uint64_t m_a1 : 7;
+			uint64_t m_p0 : 1; // p-bit appended as the LSB of every component of endpoint 0
+		} m_lo;
+
+		union
+		{
+			struct
+			{
+				uint64_t m_p1 : 1; // p-bit appended as the LSB of every component of endpoint 1
+				uint64_t m_s00 : 3; // selectors are named m_s<x><y>; the first one is stored with 3 bits, all others with 4
+				uint64_t m_s10 : 4;
+				uint64_t m_s20 : 4;
+				uint64_t m_s30 : 4;
+
+				uint64_t m_s01 : 4;
+				uint64_t m_s11 : 4;
+				uint64_t m_s21 : 4;
+				uint64_t m_s31 : 4;
+
+				uint64_t m_s02 : 4;
+				uint64_t m_s12 : 4;
+				uint64_t m_s22 : 4;
+				uint64_t m_s32 : 4;
+
+				uint64_t m_s03 : 4;
+				uint64_t m_s13 : 4;
+				uint64_t m_s23 : 4;
+				uint64_t m_s33 : 4;
+
+			} m_hi;
+
+			uint64_t m_hi_bits; // raw view of the same 64 bits as m_hi
+		};
+	};
+
+	bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels)
+	{
+		// Decodes one BC7 mode 6 block (7-bit RGBA endpoints, one p-bit per endpoint, 4-bit selectors)
+		// into the 16 pixels at pPixels, written in row-major order.
+		// Returns false without writing any pixel if the 7-bit mode field is not 1 << 6.
+		static_assert(sizeof(bc7_mode_6) == 16, "sizeof(bc7_mode_6) == 16");
+
+		const bc7_mode_6 &blk = *static_cast<const bc7_mode_6 *>(pBlock_bits);
+		if (blk.m_lo.m_mode != (1 << 6))
+			return false;
+
+		// Each 7-bit endpoint component becomes 8 bits by appending its endpoint's p-bit as the new LSB.
+		const uint32_t p0 = (uint32_t)blk.m_lo.m_p0;
+		const uint32_t p1 = (uint32_t)blk.m_hi.m_p1;
+		const uint32_t lo[4] =
+		{
+			((uint32_t)blk.m_lo.m_r0 << 1) | p0, ((uint32_t)blk.m_lo.m_g0 << 1) | p0,
+			((uint32_t)blk.m_lo.m_b0 << 1) | p0, ((uint32_t)blk.m_lo.m_a0 << 1) | p0
+		};
+		const uint32_t hi[4] =
+		{
+			((uint32_t)blk.m_lo.m_r1 << 1) | p1, ((uint32_t)blk.m_lo.m_g1 << 1) | p1,
+			((uint32_t)blk.m_lo.m_b1 << 1) | p1, ((uint32_t)blk.m_lo.m_a1 << 1) | p1
+		};
+
+		// All 16 colors a 4-bit selector can address: weights sum to 64, with +32 for round-to-nearest.
+		color_rgba palette[16];
+		for (uint32_t s = 0; s < 16; s++)
+		{
+			const uint32_t w_hi = basist::g_bc7_weights4[s];
+			const uint32_t w_lo = 64 - w_hi;
+
+			uint32_t comps[4];
+			for (uint32_t c = 0; c < 4; c++)
+				comps[c] = (lo[c] * w_lo + hi[c] * w_hi + 32) >> 6;
+
+			palette[s].set_noclamp_rgba(comps[0], comps[1], comps[2], comps[3]);
+		}
+
+		// Selectors in output pixel order; the fields are named m_s<x><y>, so x advances fastest.
+		const uint32_t selectors[16] =
+		{
+			(uint32_t)blk.m_hi.m_s00, (uint32_t)blk.m_hi.m_s10, (uint32_t)blk.m_hi.m_s20, (uint32_t)blk.m_hi.m_s30,
+			(uint32_t)blk.m_hi.m_s01, (uint32_t)blk.m_hi.m_s11, (uint32_t)blk.m_hi.m_s21, (uint32_t)blk.m_hi.m_s31,
+			(uint32_t)blk.m_hi.m_s02, (uint32_t)blk.m_hi.m_s12, (uint32_t)blk.m_hi.m_s22, (uint32_t)blk.m_hi.m_s32,
+			(uint32_t)blk.m_hi.m_s03, (uint32_t)blk.m_hi.m_s13, (uint32_t)blk.m_hi.m_s23, (uint32_t)blk.m_hi.m_s33
+		};
+		for (uint32_t i = 0; i < 16; i++)
+			pPixels[i] = palette[selectors[i]];
+
+		return true;
+	}
+
+	bool unpack_bc7(const void *pBlock, color_rgba *pPixels)
+	{
+		// Dispatches one BC7 block to the unpacker for its mode.
+		// The mode is the position of the lowest set bit in the first byte of the block.
+		// Returns the chosen unpacker's result, or false if the first byte has no bit set.
+		const uint32_t mode_byte = *static_cast<const uint8_t*>(pBlock);
+
+		uint32_t mode = 0;
+		while ((mode < 8) && (((mode_byte >> mode) & 1U) == 0))
+			mode++;
+
+		switch (mode)
+		{
+		case 0:
+		case 2:
+			return unpack_bc7_mode0_2(mode, pBlock, pPixels);
+		case 1:
+		case 3:
+		case 7:
+			return unpack_bc7_mode1_3_7(mode, pBlock, pPixels);
+		case 4:
+		case 5:
+			return unpack_bc7_mode4_5(mode, pBlock, pPixels);
+		case 6:
+			return unpack_bc7_mode6(pBlock, pPixels);
+		default:
+			// mode == 8: no mode bit was set in the first byte
+			return false;
+		}
+	}
+	
+	struct fxt1_block // Bitfield overlay of one FXT1 block (two 64-bit words) as decoded by unpack_fxt1().
+	{
+		union
+		{
+			struct
+			{
+				uint64_t m_t00 : 2; // 32 two-bit texel selectors; bit 1 of m_t00 also feeds color 0's green LSB in unpack_fxt1()
+				uint64_t m_t01 : 2;
+				uint64_t m_t02 : 2;
+				uint64_t m_t03 : 2;
+				uint64_t m_t04 : 2;
+				uint64_t m_t05 : 2;
+				uint64_t m_t06 : 2;
+				uint64_t m_t07 : 2;
+				uint64_t m_t08 : 2;
+				uint64_t m_t09 : 2;
+				uint64_t m_t10 : 2;
+				uint64_t m_t11 : 2;
+				uint64_t m_t12 : 2;
+				uint64_t m_t13 : 2;
+				uint64_t m_t14 : 2;
+				uint64_t m_t15 : 2;
+				uint64_t m_t16 : 2; // bit 1 of m_t16 also feeds color 2's green LSB in unpack_fxt1()
+				uint64_t m_t17 : 2;
+				uint64_t m_t18 : 2;
+				uint64_t m_t19 : 2;
+				uint64_t m_t20 : 2;
+				uint64_t m_t21 : 2;
+				uint64_t m_t22 : 2;
+				uint64_t m_t23 : 2;
+				uint64_t m_t24 : 2;
+				uint64_t m_t25 : 2;
+				uint64_t m_t26 : 2;
+				uint64_t m_t27 : 2;
+				uint64_t m_t28 : 2;
+				uint64_t m_t29 : 2;
+				uint64_t m_t30 : 2;
+				uint64_t m_t31 : 2;
+			} m_lo;
+			uint64_t m_lo_bits; // raw view of the same 64 bits
+			uint8_t m_sels[8]; // byte view: four selectors per byte; unpack_fxt1() reads bytes 0-3 for x < 4 and bytes 4-7 for x >= 4
+		};
+
+		union
+		{
+			struct
+			{
+#ifdef BASISU_USE_ORIGINAL_3DFX_FXT1_ENCODING
+				// This is the format that 3DFX's DECOMP.EXE tool expects, which I'm assuming is what the actual 3DFX hardware wanted.
+				// Unfortunately, color0/color1 and color2/color3 are flipped relative to the official OpenGL extension and Intel's documentation!
+				uint64_t m_b1 : 5;
+				uint64_t m_g1 : 5;
+				uint64_t m_r1 : 5;
+				uint64_t m_b0 : 5;
+				uint64_t m_g0 : 5;
+				uint64_t m_r0 : 5;
+				uint64_t m_b3 : 5;
+				uint64_t m_g3 : 5;
+				uint64_t m_r3 : 5;
+				uint64_t m_b2 : 5;
+				uint64_t m_g2 : 5;
+				uint64_t m_r2 : 5;
+#else
+				// Intel's encoding, and the encoding in the OpenGL FXT1 spec.
+				uint64_t m_b0 : 5;
+				uint64_t m_g0 : 5;
+				uint64_t m_r0 : 5;
+				uint64_t m_b1 : 5;
+				uint64_t m_g1 : 5;
+				uint64_t m_r1 : 5;
+				uint64_t m_b2 : 5;
+				uint64_t m_g2 : 5;
+				uint64_t m_r2 : 5;
+				uint64_t m_b3 : 5;
+				uint64_t m_g3 : 5;
+				uint64_t m_r3 : 5;
+#endif
+				uint64_t m_alpha : 1; // unpack_fxt1() rejects blocks with this bit set
+				uint64_t m_glsb : 2; // green LSB bits: bit 0 is used for colors 0/1, bit 1 for colors 2/3
+				uint64_t m_mode : 1; // unpack_fxt1() rejects blocks with this bit clear
+			} m_hi;
+
+			uint64_t m_hi_bits; // raw view of the same 64 bits as m_hi
+		};
+	};
+
+	static color_rgba expand_565(const color_rgba& c) // Widens raw 5:6:5 channel values to 8 bits each by copying their top bits into the new low bits; alpha becomes 255.
+	{
+		return color_rgba((c.r >> 2) | (c.r << 3), (c.g >> 4) | (c.g << 2), (c.b >> 2) | (c.b << 3), 255);
+	}
+
+	// We only support CC_MIXED non-alpha blocks here because that's the only mode the transcoder uses at the moment.
+	bool unpack_fxt1(const void *p, color_rgba *pPixels)
+	{
+		// Decodes one FXT1 block into an 8x4 tile written to pPixels in row-major order with a row pitch of 8 pixels.
+		// Returns false, writing nothing, when the block's mode bit is 0 or its alpha bit is 1.
+		const fxt1_block& blk = *static_cast<const fxt1_block*>(p);
+
+		if ((blk.m_hi.m_mode == 0) || (blk.m_hi.m_alpha == 1))
+			return false;
+
+		// Builds a raw 5:6:5 color: the lowest green bit is not stored with the color and is supplied separately.
+		auto make_565 = [](uint32_t r5, uint32_t g5, uint32_t g_lsb, uint32_t b5) -> color_rgba
+		{
+			color_rgba c;
+			c.r = (uint8_t)r5;
+			c.g = (uint8_t)((g5 << 1) | g_lsb);
+			c.b = (uint8_t)b5;
+			c.a = 255;
+			return c;
+		};
+
+		// Blend weighted 2:1 towards 'a', with rounding; the result is always opaque.
+		auto blend_2_1 = [](const color_rgba& a, const color_rgba& b) -> color_rgba
+		{
+			return color_rgba((a.r * 2 + b.r + 1) / 3, (a.g * 2 + b.g + 1) / 3, (a.b * 2 + b.b + 1) / 3, 255);
+		};
+
+		// glsb bit 0 belongs to the left color pair (0/1), bit 1 to the right pair (2/3).
+		const uint32_t glsb_left = (uint32_t)(blk.m_hi.m_glsb & 1);
+		const uint32_t glsb_right = (uint32_t)(blk.m_hi.m_glsb >> 1);
+
+		// For the first color of each pair, the green LSB is glsb XORed with the top bit of that half's first selector.
+		const uint32_t sel_top_left = (uint32_t)(blk.m_lo.m_t00 >> 1);
+		const uint32_t sel_top_right = (uint32_t)(blk.m_lo.m_t16 >> 1);
+
+		color_rgba endpoints[4];
+		endpoints[0] = expand_565(make_565(blk.m_hi.m_r0, blk.m_hi.m_g0, sel_top_left ^ glsb_left, blk.m_hi.m_b0));
+		endpoints[1] = expand_565(make_565(blk.m_hi.m_r1, blk.m_hi.m_g1, glsb_left, blk.m_hi.m_b1));
+		endpoints[2] = expand_565(make_565(blk.m_hi.m_r2, blk.m_hi.m_g2, sel_top_right ^ glsb_right, blk.m_hi.m_b2));
+		endpoints[3] = expand_565(make_565(blk.m_hi.m_r3, blk.m_hi.m_g3, glsb_right, blk.m_hi.m_b3));
+
+		// One 4-entry palette per 4x4 half: first endpoint, the two 2:1 blends, second endpoint.
+		color_rgba palettes[2][4];
+		for (uint32_t half = 0; half < 2; half++)
+		{
+			const color_rgba& first = endpoints[half * 2 + 0];
+			const color_rgba& second = endpoints[half * 2 + 1];
+
+			palettes[half][0] = first;
+			palettes[half][1] = blend_2_1(first, second);
+			palettes[half][2] = blend_2_1(second, first);
+			palettes[half][3] = second;
+		}
+
+		// Selector bytes 0-3 hold the rows of the left half, bytes 4-7 the rows of the right half.
+		for (uint32_t half = 0; half < 2; half++)
+		{
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				const uint32_t row_sels = blk.m_sels[half * 4 + y];
+				for (uint32_t x = 0; x < 4; x++)
+					pPixels[half * 4 + x + y * 8] = palettes[half][(row_sels >> (x * 2)) & 3];
+			}
+		}
+
+		return true;
+	}
+
+	struct pvrtc2_block // One PVRTC2 block as read by unpack_pvrtc2(): 4 modulation bytes followed by 32 bits of color data.
+	{
+		uint8_t m_modulation[4]; // sixteen 2-bit selectors, four per byte, pixel i in bits ((i & 3) * 2) of byte (i >> 2)
+
+		union
+		{
+			union
+			{
+				// Opaque mode: RGB colora=554 and colorb=555
+				struct
+				{
+					uint32_t m_mod_flag : 1; // unpack_pvrtc2() only handles blocks with this flag clear
+					uint32_t m_blue_a : 4;
+					uint32_t m_green_a : 5;
+					uint32_t m_red_a : 5;
+					uint32_t m_hard_flag : 1; // unpack_pvrtc2() only handles blocks with this flag set
+					uint32_t m_blue_b : 5;
+					uint32_t m_green_b : 5;
+					uint32_t m_red_b : 5;
+					uint32_t m_opaque_flag : 1; // 1 selects this layout, 0 selects m_trans_color_data
+
+				} m_opaque_color_data;
+
+				// Transparent mode: RGBA colora=4433 and colorb=4443
+				struct
+				{
+					uint32_t m_mod_flag : 1;
+					uint32_t m_blue_a : 3;
+					uint32_t m_green_a : 4;
+					uint32_t m_red_a : 4;
+					uint32_t m_alpha_a : 3;
+					uint32_t m_hard_flag : 1;
+					uint32_t m_blue_b : 4;
+					uint32_t m_green_b : 4;
+					uint32_t m_red_b : 4;
+					uint32_t m_alpha_b : 3;
+					uint32_t m_opaque_flag : 1;
+
+				} m_trans_color_data;
+			};
+
+			uint32_t m_color_data_bits; // raw view of the same 32 bits
+		};
+	};
+
+	static color_rgba convert_rgb_555_to_888(const color_rgba& col) // Widens 5-bit components 0..2 to 8 bits by replicating their top 3 bits; component 3 of the result is 255.
+	{
+		return color_rgba((col[0] >> 2) | (col[0] << 3), (col[1] >> 2) | (col[1] << 3), (col[2] >> 2) | (col[2] << 3), 255);
+	}
+	
+	static color_rgba convert_rgba_5554_to_8888(const color_rgba& col) // Widens 5-bit components 0..2 and the 4-bit component 3 to 8 bits each by bit replication.
+	{
+		return color_rgba((col[0] >> 2) | (col[0] << 3), (col[1] >> 2) | (col[1] << 3), (col[2] >> 2) | (col[2] << 3), col[3] | (col[3] << 4));
+	}
+
+	// PVRTC2 is currently limited to only what our transcoder outputs (non-interpolated, hard_flag=1 modulation=0). In this mode, PVRTC2 looks much like BC1/ATC.
+	bool unpack_pvrtc2(const void *p, color_rgba *pPixels)
+	{
+		// Decodes one PVRTC2 block into the 16 pixels at pPixels: each 2-bit modulation value picks one of 4 colors.
+		// Returns false, writing nothing, if the block's hard flag is clear or its modulation flag is set.
+		const pvrtc2_block* pBlock = static_cast<const pvrtc2_block*>(p);
+
+		if ((!pBlock->m_opaque_color_data.m_hard_flag) || (pBlock->m_opaque_color_data.m_mod_flag))
+		{
+			// This mode isn't supported by the transcoder, so we aren't bothering with it here.
+			return false;
+		}
+
+		color_rgba colors[4];
+
+		if (pBlock->m_opaque_color_data.m_opaque_flag)
+		{
+			// colora=554
+			color_rgba color_a(pBlock->m_opaque_color_data.m_red_a, pBlock->m_opaque_color_data.m_green_a, (pBlock->m_opaque_color_data.m_blue_a << 1) | (pBlock->m_opaque_color_data.m_blue_a >> 3), 255);
+
+			// colorb=555
+			color_rgba color_b(pBlock->m_opaque_color_data.m_red_b, pBlock->m_opaque_color_data.m_green_b, pBlock->m_opaque_color_data.m_blue_b, 255);
+
+			colors[0] = convert_rgb_555_to_888(color_a);
+			colors[3] = convert_rgb_555_to_888(color_b);
+		}
+		else
+		{
+			// colora=4433 
+			color_rgba color_a(
+				(pBlock->m_trans_color_data.m_red_a << 1) | (pBlock->m_trans_color_data.m_red_a >> 3), 
+				(pBlock->m_trans_color_data.m_green_a << 1) | (pBlock->m_trans_color_data.m_green_a >> 3),
+				(pBlock->m_trans_color_data.m_blue_a << 2) | (pBlock->m_trans_color_data.m_blue_a >> 1), 
+				pBlock->m_trans_color_data.m_alpha_a << 1);
+
+			//colorb=4443
+			color_rgba color_b(
+				(pBlock->m_trans_color_data.m_red_b << 1) | (pBlock->m_trans_color_data.m_red_b >> 3),
+				(pBlock->m_trans_color_data.m_green_b << 1) | (pBlock->m_trans_color_data.m_green_b >> 3),
+				(pBlock->m_trans_color_data.m_blue_b << 1) | (pBlock->m_trans_color_data.m_blue_b >> 3),
+				(pBlock->m_trans_color_data.m_alpha_b << 1) | 1);
+
+			colors[0] = convert_rgba_5554_to_8888(color_a);
+			colors[3] = convert_rgba_5554_to_8888(color_b);
+		}
+
+		// In-between colors are 5:3 and 3:5 blends of the endpoints. In opaque mode both endpoint alphas are 255, so the blended alpha is 255 too.
+		colors[1].set((colors[0].r * 5 + colors[3].r * 3) / 8, (colors[0].g * 5 + colors[3].g * 3) / 8, (colors[0].b * 5 + colors[3].b * 3) / 8, (colors[0].a * 5 + colors[3].a * 3) / 8);
+		colors[2].set((colors[0].r * 3 + colors[3].r * 5) / 8, (colors[0].g * 3 + colors[3].g * 5) / 8, (colors[0].b * 3 + colors[3].b * 5) / 8, (colors[0].a * 3 + colors[3].a * 5) / 8);
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint32_t sel = (pBlock->m_modulation[i >> 2] >> ((i & 3) * 2)) & 3;
+			pPixels[i] = colors[sel];
+		}
+
+		return true;
+	}
+
+	struct etc2_eac_r11 // One 64-bit EAC R11 block: base value, table index, multiplier and six selector bytes.
+	{
+		uint64_t m_base	: 8;
+		uint64_t m_table	: 4;
+		uint64_t m_mul		: 4;
+		uint64_t m_sels_0 : 8;
+		uint64_t m_sels_1 : 8;
+		uint64_t m_sels_2 : 8;
+		uint64_t m_sels_3 : 8;
+		uint64_t m_sels_4 : 8;
+		uint64_t m_sels_5 : 8;
+		// Returns the six selector bytes as one 48-bit value, with m_sels_0 as the most significant byte.
+		uint64_t get_sels() const
+		{
+			const uint64_t upper = ((uint64_t)m_sels_0 << 16U) | ((uint64_t)m_sels_1 << 8U) | (uint64_t)m_sels_2;
+			return (upper << 24U) | ((uint64_t)m_sels_3 << 16U) | ((uint64_t)m_sels_4 << 8U) | (uint64_t)m_sels_5;
+		}
+		void set_sels(uint64_t v) // Stores the low 48 bits of v into the selector bytes (inverse of get_sels); higher bits are ignored.
+		{
+			m_sels_5 = (uint8_t)v;
+			m_sels_4 = (uint8_t)(v >> 8U);
+			m_sels_3 = (uint8_t)(v >> 16U);
+			m_sels_2 = (uint8_t)(v >> 24U);
+			m_sels_1 = (uint8_t)(v >> 32U);
+			m_sels_0 = (uint8_t)(v >> 40U);
+		}
+	};
+
+	struct etc2_eac_rg11 // Two consecutive EAC R11 blocks; unpack_etc2_eac_rg() decodes m_c[0] into pixel component 0 and m_c[1] into component 1.
+	{
+		etc2_eac_r11 m_c[2];
+	};
+
+	void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c)
+	{
+		// Decodes one EAC R11 block (etc2_eac_r11) and writes the result, reduced to 8 bits, into component c
+		// of the 16 pixels at pPixels (row-major 4x4). The other components of each pixel are left untouched.
+		const etc2_eac_r11& blk = *static_cast<const etc2_eac_r11*>(p);
+		const uint64_t sel_bits = blk.get_sels();
+
+		const int table_index = (int)blk.m_table;
+		const int center = (int)blk.m_base * 8 + 4;
+
+		// A stored multiplier of 0 scales the table entries by 1, any other value by 8 times itself.
+		int scale = 1;
+		if (blk.m_mul)
+			scale = (int)blk.m_mul * 8;
+
+		// Sixteen 3-bit selectors, most significant first, stored column by column: texel (x, y) is entry y + x * 4.
+		for (uint32_t entry = 0; entry < 16; entry++)
+		{
+			const uint32_t x = entry >> 2, y = entry & 3;
+			const uint32_t sel = (uint32_t)((sel_bits >> (45 - entry * 3)) & 7);
+
+			const int val11 = clamp<int>(center + g_etc2_eac_tables[table_index][sel] * scale, 0, 2047);
+
+			// Convert to 8-bits with rounding
+			pPixels[x + y * 4].m_comps[c] = static_cast<uint8_t>((val11 * 255 + 1023) / 2047);
+		}
+	}
+
+	void unpack_etc2_eac_rg(const void* p, color_rgba* pPixels)
+	{
+		// Decodes an EAC RG11 block: two consecutive R11 blocks, the first unpacked into
+		// pixel component 0 and the second into pixel component 1.
+		const etc2_eac_rg11& rg = *static_cast<const etc2_eac_rg11*>(p);
+
+		unpack_etc2_eac_r(&rg.m_c[0], pPixels, 0);
+		unpack_etc2_eac_r(&rg.m_c[1], pPixels, 1);
+	}
+	
+	void unpack_uastc(const void* p, color_rgba* pPixels) // Unpacks the UASTC block at p into pPixels by forwarding to basist::unpack_uastc().
+	{
+		basist::unpack_uastc(*static_cast<const basist::uastc_block*>(p), (basist::color32 *)pPixels, false); // pPixels is reinterpreted as basist::color32; the meaning of the trailing 'false' is defined by the transcoder (not visible here)
+	}
+	
+	// Unpacks to RGBA, R, RG, or A. Decodes the single block at pBlock (format fmt) into pPixels; returns false if fmt has no unpacker here or if that format's unpacker reports failure.
+	bool unpack_block(texture_format fmt, const void* pBlock, color_rgba* pPixels)
+	{
+		switch (fmt)
+		{
+		case texture_format::cBC1:
+		{
+			unpack_bc1(pBlock, pPixels, true);
+			break;
+		}
+		case texture_format::cBC1_NV:
+		{
+			unpack_bc1_nv(pBlock, pPixels, true);
+			break;
+		}
+		case texture_format::cBC1_AMD:
+		{
+			unpack_bc1_amd(pBlock, pPixels, true);
+			break;
+		}
+		case texture_format::cBC3:
+		{
+			return unpack_bc3(pBlock, pPixels);
+		}
+		case texture_format::cBC4:
+		{
+			// Unpack to R
+			unpack_bc4(pBlock, &pPixels[0].r, sizeof(color_rgba));
+			break;
+		}
+		case texture_format::cBC5:
+		{
+			unpack_bc5(pBlock, pPixels);
+			break;
+		}
+		case texture_format::cBC7:
+		{
+			return unpack_bc7(pBlock, pPixels);
+		}
+		// Full ETC2 color blocks (planar/T/H modes) is currently unsupported in basisu, but we do support ETC2 with alpha (using ETC1 for color)
+		case texture_format::cETC2_RGB:
+		case texture_format::cETC1:
+		case texture_format::cETC1S:
+		{
+			return unpack_etc1(*static_cast<const etc_block*>(pBlock), pPixels);
+		}
+		case texture_format::cETC2_RGBA:
+		{
+			if (!unpack_etc1(static_cast<const etc_block*>(pBlock)[1], pPixels))
+				return false;
+			unpack_etc2_eac(pBlock, pPixels);
+			break;
+		}
+		case texture_format::cETC2_ALPHA:
+		{
+			// Unpack to A
+			unpack_etc2_eac(pBlock, pPixels);
+			break;
+		}
+		case texture_format::cASTC4x4:
+		{
+			const bool astc_srgb = false;
+			basisu_astc::astc::decompress(reinterpret_cast<uint8_t*>(pPixels), static_cast<const uint8_t*>(pBlock), astc_srgb, 4, 4);
+			break;
+		}
+		case texture_format::cATC_RGB:
+		{
+			unpack_atc(pBlock, pPixels);
+			break;
+		}
+		case texture_format::cATC_RGBA_INTERPOLATED_ALPHA:
+		{
+			unpack_atc(static_cast<const uint8_t*>(pBlock) + 8, pPixels);
+			unpack_bc4(pBlock, &pPixels[0].a, sizeof(color_rgba));
+			break;
+		}
+		case texture_format::cFXT1_RGB:
+		{
+			// Propagate failure: unpack_fxt1() rejects block modes it doesn't decode and then leaves pPixels untouched. Writes an 8x4 tile.
+			return unpack_fxt1(pBlock, pPixels);
+		}
+		case texture_format::cPVRTC2_4_RGBA:
+		{
+			// Propagate failure: unpack_pvrtc2() rejects block modes it doesn't decode and then leaves pPixels untouched.
+			return unpack_pvrtc2(pBlock, pPixels);
+		}
+		case texture_format::cETC2_R11_EAC:
+		{
+			unpack_etc2_eac_r(static_cast<const etc2_eac_r11 *>(pBlock), pPixels, 0);
+			break;
+		}
+		case texture_format::cETC2_RG11_EAC:
+		{
+			unpack_etc2_eac_rg(pBlock, pPixels);
+			break;
+		}
+		case texture_format::cUASTC4x4:
+		{
+			unpack_uastc(pBlock, pPixels);
+			break;
+		}
+		default:
+		{
+			assert(0);
+			// TODO
+			return false;
+		}
+		}
+		return true;
+	}
+
+	bool gpu_image::unpack(image& img) const
+	{
+		// Decodes this GPU image into img, which is resized to the pixel dimensions and cleared to black first.
+		// Returns false if the PVRTC1 block counts disagree or any block failed to unpack (the remaining blocks are still decoded).
+		img.resize(get_pixel_width(), get_pixel_height());
+		img.set_all(g_black_color);
+
+		if ((img.get_width() == 0) || (img.get_height() == 0))
+			return true;
+
+		const bool is_pvrtc1 = (m_fmt == texture_format::cPVRTC1_4_RGB) || (m_fmt == texture_format::cPVRTC1_4_RGBA);
+		if (is_pvrtc1)
+		{
+			// PVRTC1 is not decoded block by block here: the data is copied into a pvrtc4_image, deswizzled and unpacked as a whole.
+			pvrtc4_image pvrtc_img(m_width, m_height);
+			if (pvrtc_img.get_total_blocks() != get_total_blocks())
+				return false;
+
+			memcpy(&pvrtc_img.get_blocks()[0], get_ptr(), get_size_in_bytes());
+			pvrtc_img.deswizzle();
+			pvrtc_img.unpack_all_pixels(img);
+			return true;
+		}
+
+		assert((m_block_width <= cMaxBlockSize) && (m_block_height <= cMaxBlockSize));
+
+		// Scratch pixels for one block, reused for every block and initially black.
+		color_rgba block_pixels[cMaxBlockSize * cMaxBlockSize];
+		for (uint32_t i = 0; i < cMaxBlockSize * cMaxBlockSize; i++)
+			block_pixels[i] = g_black_color;
+
+		bool all_ok = true;
+		for (uint32_t block_y = 0; block_y < m_blocks_y; block_y++)
+		{
+			for (uint32_t block_x = 0; block_x < m_blocks_x; block_x++)
+			{
+				// A failed block is recorded, but whatever is in the scratch buffer is still copied to the image.
+				if (!unpack_block(m_fmt, get_block_ptr(block_x, block_y), block_pixels))
+					all_ok = false;
+
+				img.set_block_clipped(block_pixels, block_x * m_block_width, block_y * m_block_height, m_block_width, m_block_height);
+			}
+		}
+
+		return all_ok;
+	}
+		
+	static const uint8_t g_ktx_file_id[12] = { 0xAB, 0x4B, 0x54, 0x58, 0x20, 0x31, 0x31, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A }; // KTX file identifier: 0xAB, "KTX 11", 0xBB, "\r\n", 0x1A, "\n"
+
+	// KTX/GL enums: header endianness markers plus the GL format values create_ktx_texture_file() writes into glInternalFormat / glBaseInternalFormat
+	enum
+	{
+		KTX_ENDIAN = 0x04030201, // value stored in ktx_header::m_endianness by create_ktx_texture_file()
+		KTX_OPPOSITE_ENDIAN = 0x01020304,
+		KTX_ETC1_RGB8_OES = 0x8D64,
+		KTX_RED = 0x1903,
+		KTX_RG = 0x8227,
+		KTX_RGB = 0x1907,
+		KTX_RGBA = 0x1908,
+		KTX_COMPRESSED_RGB_S3TC_DXT1_EXT = 0x83F0,
+		KTX_COMPRESSED_RGBA_S3TC_DXT5_EXT = 0x83F3,
+		KTX_COMPRESSED_RED_RGTC1_EXT = 0x8DBB,
+		KTX_COMPRESSED_RED_GREEN_RGTC2_EXT = 0x8DBD,
+		KTX_COMPRESSED_RGB8_ETC2 = 0x9274,
+		KTX_COMPRESSED_RGBA8_ETC2_EAC = 0x9278,
+		KTX_COMPRESSED_RGBA_BPTC_UNORM = 0x8E8C,
+		KTX_COMPRESSED_SRGB_ALPHA_BPTC_UNORM = 0x8E8D,
+		KTX_COMPRESSED_RGB_PVRTC_4BPPV1_IMG = 0x8C00,
+		KTX_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG = 0x8C02,
+		KTX_COMPRESSED_RGBA_ASTC_4x4_KHR = 0x93B0,
+		KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR = 0x93D0,
+		KTX_COMPRESSED_RGBA_UASTC_4x4_KHR = 0x94CC, // TODO - Use proper value!
+		KTX_ATC_RGB_AMD = 0x8C92,
+		KTX_ATC_RGBA_INTERPOLATED_ALPHA_AMD = 0x87EE,
+		KTX_COMPRESSED_RGB_FXT1_3DFX = 0x86B0,
+		KTX_COMPRESSED_RGBA_FXT1_3DFX = 0x86B1,
+		KTX_COMPRESSED_RGBA_PVRTC_4BPPV2_IMG = 0x9138,
+		KTX_COMPRESSED_R11_EAC = 0x9270,
+		KTX_COMPRESSED_RG11_EAC = 0x9272
+	};
+		
+	struct ktx_header // KTX file header; create_ktx_texture_file() appends it byte-for-byte (sizeof(ktx_header)) at the start of its output.
+	{
+		uint8_t m_identifier[12]; // filled from g_ktx_file_id
+		packed_uint<4> m_endianness; // set to KTX_ENDIAN
+		packed_uint<4> m_glType;
+		packed_uint<4> m_glTypeSize;
+		packed_uint<4> m_glFormat;
+		packed_uint<4> m_glInternalFormat; // one of the KTX_* compressed format values
+		packed_uint<4> m_glBaseInternalFormat; // KTX_RED, KTX_RG, KTX_RGB or KTX_RGBA
+		packed_uint<4> m_pixelWidth;
+		packed_uint<4> m_pixelHeight;
+		packed_uint<4> m_pixelDepth;
+		packed_uint<4> m_numberOfArrayElements; // written as 0 when there is only a single array element
+		packed_uint<4> m_numberOfFaces; // 6 for cubemaps, otherwise 1
+		packed_uint<4> m_numberOfMipmapLevels;
+		packed_uint<4> m_bytesOfKeyValueData;
+
+		void clear() { clear_obj(*this);	}
+	};
+
+	// Input is a texture array of mipmapped gpu_image's: gpu_images[array_index][level_index]
+	// Appends a KTX container holding those images to ktx_data (existing contents of ktx_data are kept).
+	// With cubemap_flag set, every 6 consecutive entries of gpu_images are the faces of one cubemap.
+	// Returns false (after assert(0)) if the input is empty or inconsistent, or if the format has no KTX mapping here.
+	bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag)
+	{
+		if (!gpu_images.size())
+		{
+			assert(0);
+			return false;
+		}
+
+		uint32_t width = 0, height = 0, total_levels = 0;
+		basisu::texture_format fmt = texture_format::cInvalidTextureFormat;
+
+		if (cubemap_flag)
+		{
+			if ((gpu_images.size() % 6) != 0)
+			{
+				assert(0);
+				return false;
+			}
+		}
+
+		for (uint32_t array_index = 0; array_index < gpu_images.size(); array_index++)
+		{
+			const gpu_image_vec &levels = gpu_images[array_index];
+
+			if (!levels.size())
+			{
+				// Empty mip chain
+				assert(0);
+				return false;
+			}
+
+			if (!array_index)
+			{
+				width = levels[0].get_pixel_width();
+				height = levels[0].get_pixel_height();
+				total_levels = (uint32_t)levels.size();
+				fmt = levels[0].get_format();
+			}
+			else
+			{
+				if ((width != levels[0].get_pixel_width()) ||
+				    (height != levels[0].get_pixel_height()) ||
+				    (total_levels != levels.size()))
+				{
+					// All cubemap/texture array faces must be the same dimension
+					assert(0);
+					return false;
+				}
+			}
+
+			for (uint32_t level_index = 0; level_index < levels.size(); level_index++)
+			{
+				if (level_index)
+				{
+					if ( (levels[level_index].get_pixel_width() != maximum<uint32_t>(1, levels[0].get_pixel_width() >> level_index)) ||
+							(levels[level_index].get_pixel_height() != maximum<uint32_t>(1, levels[0].get_pixel_height() >> level_index)) )
+					{
+						// Malformed mipmap chain
+						assert(0);
+						return false;
+					}
+				}
+
+				if (fmt != levels[level_index].get_format())
+				{
+					// All input textures must use the same GPU format
+					assert(0);
+					return false;
+				}
+			}
+		}
+
+		uint32_t internal_fmt = KTX_ETC1_RGB8_OES, base_internal_fmt = KTX_RGB;
+
+		switch (fmt)
+		{
+		case texture_format::cBC1:
+		case texture_format::cBC1_NV:
+		case texture_format::cBC1_AMD:
+		{
+			internal_fmt = KTX_COMPRESSED_RGB_S3TC_DXT1_EXT;
+			break;
+		}
+		case texture_format::cBC3:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA_S3TC_DXT5_EXT;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		case texture_format::cBC4:
+		{
+			internal_fmt = KTX_COMPRESSED_RED_RGTC1_EXT;// KTX_COMPRESSED_LUMINANCE_LATC1_EXT;
+			base_internal_fmt = KTX_RED;
+			break;
+		}
+		case texture_format::cBC5:
+		{
+			internal_fmt = KTX_COMPRESSED_RED_GREEN_RGTC2_EXT;
+			base_internal_fmt = KTX_RG;
+			break;
+		}
+		case texture_format::cETC1:
+		case texture_format::cETC1S:
+		{
+			internal_fmt = KTX_ETC1_RGB8_OES;
+			break;
+		}
+		case texture_format::cETC2_RGB:
+		{
+			internal_fmt = KTX_COMPRESSED_RGB8_ETC2;
+			break;
+		}
+		case texture_format::cETC2_RGBA:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA8_ETC2_EAC;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		case texture_format::cBC7:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA_BPTC_UNORM;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		case texture_format::cPVRTC1_4_RGB:
+		{
+			internal_fmt = KTX_COMPRESSED_RGB_PVRTC_4BPPV1_IMG;
+			break;
+		}
+		case texture_format::cPVRTC1_4_RGBA:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		case texture_format::cASTC4x4:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA_ASTC_4x4_KHR;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		case texture_format::cATC_RGB:
+		{
+			internal_fmt = KTX_ATC_RGB_AMD;
+			break;
+		}
+		case texture_format::cATC_RGBA_INTERPOLATED_ALPHA:
+		{
+			internal_fmt = KTX_ATC_RGBA_INTERPOLATED_ALPHA_AMD;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		case texture_format::cETC2_R11_EAC:
+		{
+			internal_fmt = KTX_COMPRESSED_R11_EAC;
+			base_internal_fmt = KTX_RED;
+			break;
+		}
+		case texture_format::cETC2_RG11_EAC:
+		{
+			internal_fmt = KTX_COMPRESSED_RG11_EAC;
+			base_internal_fmt = KTX_RG;
+			break;
+		}
+		case texture_format::cUASTC4x4:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA_UASTC_4x4_KHR;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		case texture_format::cFXT1_RGB:
+		{
+			internal_fmt = KTX_COMPRESSED_RGB_FXT1_3DFX;
+			break;
+		}
+		case texture_format::cPVRTC2_4_RGBA:
+		{
+			internal_fmt = KTX_COMPRESSED_RGBA_PVRTC_4BPPV2_IMG;
+			base_internal_fmt = KTX_RGBA;
+			break;
+		}
+		default:
+		{
+			// TODO
+			assert(0);
+			return false;
+		}
+		}
+		
+		ktx_header header;
+		header.clear();
+		memcpy(&header.m_identifier, g_ktx_file_id, sizeof(g_ktx_file_id));
+		header.m_endianness = KTX_ENDIAN;
+		
+		header.m_pixelWidth = width;
+		header.m_pixelHeight = height;
+		
+		// glType and glFormat stay 0 (compressed data). KTX 1.1 requires glTypeSize to be 1 for compressed data.
+		header.m_glTypeSize = 1;
+		header.m_glInternalFormat = internal_fmt;
+		header.m_glBaseInternalFormat = base_internal_fmt;
+
+		header.m_numberOfArrayElements = (uint32_t)(cubemap_flag ? (gpu_images.size() / 6) : gpu_images.size());
+		if (header.m_numberOfArrayElements == 1)
+			header.m_numberOfArrayElements = 0; // 0 marks a non-array texture
+
+		header.m_numberOfMipmapLevels = total_levels;
+		header.m_numberOfFaces = cubemap_flag ? 6 : 1;
+
+		append_vector(ktx_data, (uint8_t *)&header, sizeof(header));
+
+		for (uint32_t level_index = 0; level_index < total_levels; level_index++)
+		{
+			uint32_t img_size = gpu_images[0][level_index].get_size_in_bytes();
+			
+			if ((header.m_numberOfFaces == 1) || (header.m_numberOfArrayElements > 1)) // for a non-array cubemap the stored size stays that of a single face
+			{
+				img_size = img_size * header.m_numberOfFaces * maximum<uint32_t>(1, header.m_numberOfArrayElements);
+			}
+
+			assert(img_size && ((img_size & 3) == 0)); // no padding is written, so each level must already be a multiple of 4 bytes
+
+			packed_uint<4> packed_img_size(img_size);
+			append_vector(ktx_data, (uint8_t *)&packed_img_size, sizeof(packed_img_size));
+
+			for (uint32_t array_index = 0; array_index < maximum<uint32_t>(1, header.m_numberOfArrayElements); array_index++)
+			{
+				for (uint32_t face_index = 0; face_index < header.m_numberOfFaces; face_index++)
+				{
+					const gpu_image& img = gpu_images[cubemap_flag ? (array_index * 6 + face_index) : array_index][level_index];
+
+					append_vector(ktx_data, (uint8_t *)img.get_ptr(), img.get_size_in_bytes());
+				}
+			} // array_index
+
+		} // level_index
+
+		return true;
+	}
+
+	bool write_compressed_texture_file(const char* pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag)
+	{
+		// Serializes the texture array g (same layout as create_ktx_texture_file's input) to pFilename.
+		// The container is picked from the lower-cased filename extension; only "ktx" is implemented.
+		// Returns false for "pvr"/"dds" (not implemented), for any other extension (after assert(0)),
+		// or if the KTX data can't be built; otherwise returns the result of write_vec_to_file().
+		const std::string ext(string_tolower(string_get_extension(pFilename)));
+
+		if ((ext == "pvr") || (ext == "dds"))
+		{
+			// TODO
+			return false;
+		}
+
+		if (ext != "ktx")
+		{
+			// unsupported texture format
+			assert(0);
+			return false;
+		}
+
+		// Build the whole file in memory first, then write it out in one go.
+		uint8_vec file_bytes;
+
+		if (!create_ktx_texture_file(file_bytes, g, cubemap_flag))
+			return false;
+
+		return basisu::write_vec_to_file(pFilename, file_bytes);
+	}
+
+	bool write_compressed_texture_file(const char* pFilename, const gpu_image& g) // Convenience overload: writes the single image g as a one-element, one-level, non-cubemap texture.
+	{
+		basisu::vector<gpu_image_vec> single_image_array;
+		enlarge_vector(single_image_array, 1)->push_back(g);
+		return write_compressed_texture_file(pFilename, single_image_array, false);
+	}
+
+	//const uint32_t OUT_FILE_MAGIC = 'TEXC';
+	struct out_file_header // Header written at the start of a .OUT file by write_3dfx_out_file().
+	{
+		packed_uint<4> m_magic; // filled byte by byte with 67, 88, 69, 84 ("CXET")
+		packed_uint<4> m_pad; // written as 0
+		packed_uint<4> m_width; // blocks_x * 8
+		packed_uint<4> m_height; // blocks_y * 4
+	};
+
+	// As no modern tool supports FXT1 format .KTX files, let's write .OUT files and make sure 3DFX's original tools shipped in 1999 can decode our encoded output. Returns true only if the file was opened, the header and block data were completely written and the file closed cleanly.
+	bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi)
+	{
+		out_file_header hdr;
+		// Magic: the bytes "CXET" in file order (the multi-character constant 'TEXC' read as a little-endian 32-bit value).
+		hdr.m_magic.m_bytes[0] = 67;
+		hdr.m_magic.m_bytes[1] = 88;
+		hdr.m_magic.m_bytes[2] = 69;
+		hdr.m_magic.m_bytes[3] = 84;
+		hdr.m_pad = 0;
+		hdr.m_width = gi.get_blocks_x() * 8; // dimensions are stored in pixels, derived from the block counts (8x4 pixels per block)
+		hdr.m_height = gi.get_blocks_y() * 4;
+
+		FILE* pFile = nullptr;
+#ifdef _WIN32
+		fopen_s(&pFile, pFilename, "wb");
+#else
+		pFile = fopen(pFilename, "wb");
+#endif
+		if (!pFile)
+			return false;
+
+		bool ok = (fwrite(&hdr, sizeof(hdr), 1, pFile) == 1); // a short write must not be reported as success
+		if (ok && (gi.get_size_in_bytes() != 0)) // fwrite() returns 0 for a zero-sized item, so an empty payload is skipped rather than treated as an error
+			ok = (fwrite(gi.get_ptr(), gi.get_size_in_bytes(), 1, pFile) == 1);
+		return (fclose(pFile) != EOF) && ok; // the file is always closed, even after a failed write
+	}
+} // basisu
+
diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.h b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
new file mode 100644
index 0000000000..619926f5f9
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.h
@@ -0,0 +1,154 @@
+// basisu_gpu_texture.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu.h"
+#include "basisu_etc.h"
+
+namespace basisu
+{
+	// GPU texture "image": a 2D grid of fixed-size compressed blocks, each block stored as one or more 64-bit words.
+	class gpu_image
+	{
+	public:
+		enum { cMaxBlockSize = 12 };
+		// Constructs an empty image (see clear()).
+		gpu_image()
+		{
+			clear();
+		}
+		// Constructs an image of the given format and pixel dimensions (see init()).
+		gpu_image(texture_format fmt, uint32_t width, uint32_t height)
+		{
+			init(fmt, width, height);
+		}
+		// Resets to the empty state: invalid format, every dimension zero, block storage cleared.
+		void clear()
+		{
+			m_fmt = texture_format::cInvalidTextureFormat;
+			m_width = 0;
+			m_height = 0;
+			m_block_width = 0;
+			m_block_height = 0;
+			m_blocks_x = 0;
+			m_blocks_y = 0;
+			m_qwords_per_block = 0;
+			m_blocks.clear();
+		}
+
+		inline texture_format get_format() const { return m_fmt; }
+
+		// Width/height in pixels
+		inline uint32_t get_pixel_width() const { return m_width; }
+		inline uint32_t get_pixel_height() const { return m_height; }
+
+		// Width/height in blocks, row pitch is assumed to be m_blocks_x.
+		inline uint32_t get_blocks_x() const { return m_blocks_x; }
+		inline uint32_t get_blocks_y() const { return m_blocks_y; }
+
+		// Size of each block in pixels
+		inline uint32_t get_block_width() const { return m_block_width; }
+		inline uint32_t get_block_height() const { return m_block_height; }
+		// Storage sizes: every block occupies get_qwords_per_block() 64-bit words.
+		inline uint32_t get_qwords_per_block() const { return m_qwords_per_block; }
+		inline uint32_t get_total_blocks() const { return m_blocks_x * m_blocks_y; }
+		inline uint32_t get_bytes_per_block() const { return get_qwords_per_block() * sizeof(uint64_t); }
+		inline uint32_t get_row_pitch_in_bytes() const { return get_bytes_per_block() * get_blocks_x(); }
+
+		inline const uint64_vec &get_blocks() const { return m_blocks; }
+		// get_ptr() takes the address of element 0, so only call it on an image that actually has blocks.
+		inline const uint64_t *get_ptr() const { return &m_blocks[0]; }
+		inline uint64_t *get_ptr() { return &m_blocks[0]; }
+
+		inline uint32_t get_size_in_bytes() const { return get_total_blocks() * get_qwords_per_block() * sizeof(uint64_t); }
+		// Returns the address of 64-bit word element_index within block (block_x, block_y); blocks are laid out row by row. All three indices are asserted in range.
+		inline const void *get_block_ptr(uint32_t block_x, uint32_t block_y, uint32_t element_index = 0) const
+		{
+			assert(block_x < m_blocks_x && block_y < m_blocks_y && element_index < m_qwords_per_block);
+			return &m_blocks[(block_x + block_y * m_blocks_x) * m_qwords_per_block + element_index];
+		}
+		// Mutable overload of get_block_ptr(); same indexing and range checks.
+		inline void *get_block_ptr(uint32_t block_x, uint32_t block_y, uint32_t element_index = 0)
+		{
+			assert(block_x < m_blocks_x && block_y < m_blocks_y && element_index < m_qwords_per_block);
+			return &m_blocks[(block_x + block_y * m_blocks_x) * m_qwords_per_block + element_index];
+		}
+		// (Re)initializes for fmt and the given pixel size; block counts are rounded up so partial edge blocks are covered.
+		void init(texture_format fmt, uint32_t width, uint32_t height)
+		{
+			m_fmt = fmt;
+			m_width = width;
+			m_height = height;
+			m_block_width = basisu::get_block_width(m_fmt);
+			m_block_height = basisu::get_block_height(m_fmt);
+			m_blocks_x = (m_width + m_block_width - 1) / m_block_width;
+			m_blocks_y = (m_height + m_block_height - 1) / m_block_height;
+			m_qwords_per_block = basisu::get_qwords_per_block(m_fmt);
+			// Shrink to zero first so no old block data survives the resize (presumably the new elements are freshly initialized - confirm against basisu::vector).
+			m_blocks.resize(0);
+			m_blocks.resize(m_blocks_x * m_blocks_y * m_qwords_per_block);
+		}
+		// Unpacks this image into img; implemented out of line.
+		bool unpack(image& img) const;
+		// Replaces only the stored pixel dimensions; block counts and block storage are left untouched.
+		void override_dimensions(uint32_t w, uint32_t h)
+		{
+			m_width = w;
+			m_height = h;
+		}
+
+	private:
+		texture_format m_fmt;
+		uint32_t m_width, m_height, m_blocks_x, m_blocks_y, m_block_width, m_block_height, m_qwords_per_block;
+		uint64_vec m_blocks;
+	};
+
+	typedef basisu::vector<gpu_image> gpu_image_vec;
+
+	// KTX file writing
+
+	bool create_ktx_texture_file(uint8_vec &ktx_data, const basisu::vector<gpu_image_vec>& gpu_images, bool cubemap_flag);
+		
+	bool write_compressed_texture_file(const char *pFilename, const basisu::vector<gpu_image_vec>& g, bool cubemap_flag);
+	// Convenience overload: writes a single image list as a non-cubemap file.
+	inline bool write_compressed_texture_file(const char *pFilename, const gpu_image_vec &g)
+	{
+		basisu::vector<gpu_image_vec> wrapped;
+		wrapped.push_back(g); // the caller's list becomes the only entry
+		return write_compressed_texture_file(pFilename, wrapped, /* cubemap_flag */ false);
+	}
+
+	bool write_compressed_texture_file(const char *pFilename, const gpu_image &g);
+	
+	bool write_3dfx_out_file(const char* pFilename, const gpu_image& gi);
+
+	// GPU texture block unpacking
+	void unpack_etc2_eac(const void *pBlock_bits, color_rgba *pPixels);
+	bool unpack_bc1(const void *pBlock_bits, color_rgba *pPixels, bool set_alpha);
+	void unpack_bc4(const void *pBlock_bits, uint8_t *pPixels, uint32_t stride);
+	bool unpack_bc3(const void *pBlock_bits, color_rgba *pPixels);
+	void unpack_bc5(const void *pBlock_bits, color_rgba *pPixels);
+	bool unpack_bc7_mode6(const void *pBlock_bits, color_rgba *pPixels);
+	bool unpack_bc7(const void* pBlock_bits, color_rgba* pPixels);
+	void unpack_atc(const void* pBlock_bits, color_rgba* pPixels);
+	bool unpack_fxt1(const void* p, color_rgba* pPixels);
+	bool unpack_pvrtc2(const void* p, color_rgba* pPixels);
+	void unpack_etc2_eac_r(const void *p, color_rgba* pPixels, uint32_t c);
+	void unpack_etc2_eac_rg(const void* p, color_rgba* pPixels);
+
+	// unpack_block() is primarily intended to unpack texture data created by the transcoder.
+	// For some texture formats (like ETC2 RGB, PVRTC2, FXT1) it's not a complete implementation.
+	bool unpack_block(texture_format fmt, const void *pBlock, color_rgba *pPixels);
+			
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_declares.h b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
new file mode 100644
index 0000000000..e24bdd7978
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_declares.h
@@ -0,0 +1,25 @@
+// basisu_kernels_declares.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#if BASISU_SUPPORT_SSE
+void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+
+void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err);
+
+void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error);
+void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const basisu::color_rgba* pBlock_colors, const basisu::color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error);
+#endif
diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_imp.h b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
new file mode 100644
index 0000000000..046880517b
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_imp.h
@@ -0,0 +1,584 @@
+// basisu_kernels_imp.h - Do not directly include
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+using namespace CPPSPMD;
+
+namespace CPPSPMD_NAME(basisu_kernels_namespace)
+{
+   struct perceptual_distance_rgb_4_N : spmd_kernel // sums a weighted RGB error between n source pixels and the block colors picked by pSelectors
+   {
+      void _call(int64_t* pDistance,
+         const uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+         // *pDistance receives the running total; it is only a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Cache the 4 block colors both as packed vectors and as per-channel vints (store_all presumably fills every lane).
+         __m128i block_colors[4];
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            block_colors[i] = load_rgba32(&pBlock_colors[i]);
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+         for (i = 0; (i + 4) <= n; i += 4) // vector path: 4 pixels per iteration
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+            // Rearrange the 4 loaded pixels into per-channel r/g/b/a vectors.
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
+
+            vint base_r, base_g, base_b, base_a;
+            if ((s0 == s1) && (s0 == s2) && (s0 == s3)) // all 4 pixels share one selector: reuse the cached per-channel vectors (base_a stays unset and is never read)
+            {
+               store_all(base_r, block_colors_r[s0]);
+               store_all(base_g, block_colors_g[s0]);
+               store_all(base_b, block_colors_b[s0]);
+            }
+            else
+            {
+               __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
+               transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
+            }
+
+            vint dr = base_r - r;
+            vint dg = base_g - g;
+            vint db = base_b - b;
+            // Weighted delta (27/92/9, summing to 128) plus two deltas of red and blue relative to it.
+            vint delta_l = dr * 27 + dg * 92 + db * 9;
+            vint delta_cr = dr * 128 - delta_l;
+            vint delta_cb = db * 128 - delta_l;
+
+            vint id = ((delta_l * delta_l) >> 7) +
+               ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+               ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+
+            *pDistance += reduce_add(id);
+            if (*pDistance >= early_out_err) // stop as soon as the total reaches the caller's threshold
+               return;
+         }
+         // Scalar tail: processes the remaining (fewer than 4) pixels with the same formula.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int sel = pSelectors[i];
+            int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+            int dr = base_r - r;
+            int dg = base_g - g;
+            int db = base_b - b;
+
+            int delta_l = dr * 27 + dg * 92 + db * 9;
+            int delta_cr = dr * 128 - delta_l;
+            int delta_cb = db * 128 - delta_l;
+            // NOTE(review): if channels are 8-bit, |delta_cb| can reach 60690, so delta_cb * delta_cb may exceed INT_MAX - confirm this is acceptable.
+            int id = ((delta_l * delta_l) >> 7) +
+               ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+               ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+
+            *pDistance += id;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct linear_distance_rgb_4_N : spmd_kernel // sums the squared RGB error between n source pixels and the block colors picked by pSelectors
+   {
+      void _call(int64_t* pDistance,
+         const uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+         // *pDistance receives the running total; it is only a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Cache the 4 block colors both as packed vectors and as per-channel vints (store_all presumably fills every lane).
+         __m128i block_colors[4];
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            block_colors[i] = load_rgba32(&pBlock_colors[i]);
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+         for (i = 0; (i + 4) <= n; i += 4) // vector path: 4 pixels per iteration
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+            // Rearrange the 4 loaded pixels into per-channel r/g/b/a vectors.
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+
+            int s0 = pSelectors[i], s1 = pSelectors[i + 1], s2 = pSelectors[i + 2], s3 = pSelectors[i + 3];
+
+            vint base_r, base_g, base_b, base_a;
+            if ((s0 == s1) && (s0 == s2) && (s0 == s3)) // all 4 pixels share one selector: reuse the cached per-channel vectors (base_a stays unset and is never read)
+            {
+               store_all(base_r, block_colors_r[s0]);
+               store_all(base_g, block_colors_g[s0]);
+               store_all(base_b, block_colors_b[s0]);
+            }
+            else
+            {
+               __m128i k0 = block_colors[s0], k1 = block_colors[s1], k2 = block_colors[s2], k3 = block_colors[s3];
+               transpose4x4(base_r.m_value, base_g.m_value, base_b.m_value, base_a.m_value, k0, k1, k2, k3);
+            }
+
+            vint dr = base_r - r;
+            vint dg = base_g - g;
+            vint db = base_b - b;
+            // Unweighted squared distance over the three color channels.
+            vint id = dr * dr + dg * dg + db * db;
+
+            *pDistance += reduce_add(id);
+            if (*pDistance >= early_out_err) // stop as soon as the total reaches the caller's threshold
+               return;
+         }
+         // Scalar tail: processes the remaining (fewer than 4) pixels with the same formula.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int sel = pSelectors[i];
+            int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+            int dr = base_r - r;
+            int dg = base_g - g;
+            int db = base_b - b;
+
+            int id = dr * dr + dg * dg + db * db;
+
+            *pDistance += id;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_selectors_perceptual_rgb_4_N : spmd_kernel // for each of n pixels, writes the index (0-3) of the lowest-error block color and sums those errors
+   {
+      inline vint compute_dist( // per-lane weighted error between one block color (base_*) and the source channels
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+         // Weighted delta (27/92/9, summing to 128) plus two deltas of red and blue relative to it.
+         vint delta_l = dr * 27 + dg * 92 + db * 9;
+         vint delta_cr = dr * 128 - delta_l;
+         vint delta_cb = db * 128 - delta_l;
+
+         vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+         // *pDistance receives the running total; on an early-out it is partial and selectors past the current position are not written.
+         *pDistance = 0;
+         // Cache each block color's channels as vints (store_all presumably fills every lane).
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+         // Byte-shuffle control that moves source bytes 0, 4, 8, 12 into the low 4 bytes; -128 entries presumably zero the rest (pshufb convention - confirm shuffle_epi8).
+         const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4) // vector path: 4 pixels per iteration
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+            // Rearrange the 4 loaded pixels into per-channel r/g/b/a vectors.
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+            // Error of every pixel against each of the 4 candidate block colors.
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+            // Candidates are tested in order 0, 1, 2, so a tie resolves to the lowest selector index (spmd_ternaryi is presumably a per-lane select).
+            vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
+            // Pack the 4 lane results into 4 consecutive bytes and store them at pSelectors[i..i+3].
+            __m128i vsels = shuffle_epi8(sels.m_value, shuf);
+            storeu_si32((void *)(pSelectors + i), vsels);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance >= early_out_err) // stop as soon as the total reaches the caller's threshold
+               return;
+         }
+         // Scalar tail: processes the remaining (fewer than 4) pixels.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX, best_sel = 0;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int delta_l = dr * 27 + dg * 92 + db * 9;
+               int delta_cr = dr * 128 - delta_l;
+               int delta_cb = db * 128 - delta_l;
+               // NOTE(review): if channels are 8-bit, |delta_cb| can reach 60690, so delta_cb * delta_cb may exceed INT_MAX - confirm this is acceptable.
+               int id = ((delta_l * delta_l) >> 7) +
+                  ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+                  ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+               if (id < best_err) // strict '<' keeps the lowest index on ties
+               {
+                  best_err = id;
+                  best_sel = sel;
+               }
+            }
+
+            pSelectors[i] = (uint8_t)best_sel;
+
+            *pDistance += best_err;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_selectors_linear_rgb_4_N : spmd_kernel // for each of n pixels, writes the index (0-3) of the block color with the lowest squared RGB error and sums those errors
+   {
+      inline vint compute_dist( // per-lane squared RGB distance between one block color (base_*) and the source channels
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint id = dr * dr + dg * dg + db * db;
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         uint8_t* pSelectors,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_err)
+      {
+         assert(early_out_err >= 0);
+         // *pDistance receives the running total; on an early-out it is partial and selectors past the current position are not written.
+         *pDistance = 0;
+         // Cache each block color's channels as vints (store_all presumably fills every lane).
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+         // Byte-shuffle control that moves source bytes 0, 4, 8, 12 into the low 4 bytes; -128 entries presumably zero the rest (pshufb convention - confirm shuffle_epi8).
+         const __m128i shuf = _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 12, 8, 4, 0);
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4) // vector path: 4 pixels per iteration
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+            // Rearrange the 4 loaded pixels into per-channel r/g/b/a vectors.
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+            // Error of every pixel against each of the 4 candidate block colors.
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+            // Candidates are tested in order 0, 1, 2, so a tie resolves to the lowest selector index (spmd_ternaryi is presumably a per-lane select).
+            vint sels = spmd_ternaryi(min_dist == dist0, 0, spmd_ternaryi(min_dist == dist1, 1, spmd_ternaryi(min_dist == dist2, 2, 3)));
+            // Pack the 4 lane results into 4 consecutive bytes and store them at pSelectors[i..i+3].
+            __m128i vsels = shuffle_epi8(sels.m_value, shuf);
+            storeu_si32((void *)(pSelectors + i), vsels);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance >= early_out_err) // stop as soon as the total reaches the caller's threshold
+               return;
+         }
+         // Scalar tail: processes the remaining (fewer than 4) pixels.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX, best_sel = 0;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int id = dr * dr + dg * dg + db * db;
+               if (id < best_err) // strict '<' keeps the lowest index on ties
+               {
+                  best_err = id;
+                  best_sel = sel;
+               }
+            }
+
+            pSelectors[i] = (uint8_t)best_sel;
+
+            *pDistance += best_err;
+            if (*pDistance >= early_out_err)
+               return;
+         }
+      }
+   };
+
+   struct find_lowest_error_perceptual_rgb_4_N : spmd_kernel // sums, over n pixels, the lowest weighted error against any of the 4 block colors (no selectors are written)
+   {
+      inline vint compute_dist( // per-lane weighted error between one block color (base_*) and the source channels
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+         // Weighted delta (27/92/9, summing to 128) plus two deltas of red and blue relative to it.
+         vint delta_l = dr * 27 + dg * 92 + db * 9;
+         vint delta_cr = dr * 128 - delta_l;
+         vint delta_cb = db * 128 - delta_l;
+
+         vint id = VINT_SHIFT_RIGHT(delta_l * delta_l, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cr * delta_cr, 7) * 26, 7) +
+            VINT_SHIFT_RIGHT(VINT_SHIFT_RIGHT(delta_cb * delta_cb, 7) * 3, 7);
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n, 
+         int64_t early_out_error)
+      {
+         assert(early_out_error >= 0);
+         // *pDistance receives the running total; it is only a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Cache each block color's channels as vints (store_all presumably fills every lane).
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4) // vector path: 4 pixels per iteration
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+            // Rearrange the 4 loaded pixels into per-channel r/g/b/a vectors.
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+            // Error of every pixel against each of the 4 candidate block colors.
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance > early_out_error) // strict '>' here: a total equal to the threshold keeps going
+               return;
+         }
+         // Scalar tail: processes the remaining (fewer than 4) pixels.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int delta_l = dr * 27 + dg * 92 + db * 9;
+               int delta_cr = dr * 128 - delta_l;
+               int delta_cb = db * 128 - delta_l;
+               // NOTE(review): if channels are 8-bit, |delta_cb| can reach 60690, so delta_cb * delta_cb may exceed INT_MAX - confirm this is acceptable.
+               int id = ((delta_l * delta_l) >> 7) +
+                  ((((delta_cr * delta_cr) >> 7) * 26) >> 7) +
+                  ((((delta_cb * delta_cb) >> 7) * 3) >> 7);
+               
+               if (id < best_err)
+               {
+                  best_err = id;
+               }
+            }
+
+            *pDistance += best_err;
+            if (*pDistance > early_out_error)
+               return;
+         }
+      }
+   };
+
+   struct find_lowest_error_linear_rgb_4_N : spmd_kernel // sums, over n pixels, the lowest squared RGB error against any of the 4 block colors (no selectors are written)
+   {
+      inline vint compute_dist( // per-lane squared RGB distance between one block color (base_*) and the source channels
+         const vint& base_r, const vint& base_g, const vint& base_b,
+         const vint& r, const vint& g, const vint& b)
+      {
+         vint dr = base_r - r;
+         vint dg = base_g - g;
+         vint db = base_b - b;
+
+         vint id = dr * dr + dg * dg + db * db;
+
+         return id;
+      }
+
+      void _call(int64_t* pDistance,
+         const color_rgba* pBlock_colors,
+         const color_rgba* pSrc_pixels, uint32_t n,
+         int64_t early_out_error)
+      {
+         assert(early_out_error >= 0);
+         // *pDistance receives the running total; it is only a partial sum if an early-out below fires.
+         *pDistance = 0;
+         // Cache each block color's channels as vints (store_all presumably fills every lane).
+         vint block_colors_r[4], block_colors_g[4], block_colors_b[4];
+         for (uint32_t i = 0; i < 4; i++)
+         {
+            store_all(block_colors_r[i], (int)pBlock_colors[i].r);
+            store_all(block_colors_g[i], (int)pBlock_colors[i].g);
+            store_all(block_colors_b[i], (int)pBlock_colors[i].b);
+         }
+
+         uint32_t i;
+
+         for (i = 0; (i + 4) <= n; i += 4) // vector path: 4 pixels per iteration
+         {
+            __m128i c0 = load_rgba32(&pSrc_pixels[i + 0]), c1 = load_rgba32(&pSrc_pixels[i + 1]), c2 = load_rgba32(&pSrc_pixels[i + 2]), c3 = load_rgba32(&pSrc_pixels[i + 3]);
+            // Rearrange the 4 loaded pixels into per-channel r/g/b/a vectors.
+            vint r, g, b, a;
+            transpose4x4(r.m_value, g.m_value, b.m_value, a.m_value, c0, c1, c2, c3);
+            // Error of every pixel against each of the 4 candidate block colors.
+            vint dist0 = compute_dist(block_colors_r[0], block_colors_g[0], block_colors_b[0], r, g, b);
+            vint dist1 = compute_dist(block_colors_r[1], block_colors_g[1], block_colors_b[1], r, g, b);
+            vint dist2 = compute_dist(block_colors_r[2], block_colors_g[2], block_colors_b[2], r, g, b);
+            vint dist3 = compute_dist(block_colors_r[3], block_colors_g[3], block_colors_b[3], r, g, b);
+
+            vint min_dist = min(min(min(dist0, dist1), dist2), dist3);
+
+            *pDistance += reduce_add(min_dist);
+            if (*pDistance > early_out_error) // strict '>' here: a total equal to the threshold keeps going
+               return;
+         }
+         // Scalar tail: processes the remaining (fewer than 4) pixels.
+         for (; i < n; i++)
+         {
+            int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b;
+
+            int best_err = INT_MAX;
+            for (int sel = 0; sel < 4; sel++)
+            {
+               int base_r = pBlock_colors[sel].r, base_g = pBlock_colors[sel].g, base_b = pBlock_colors[sel].b;
+
+               int dr = base_r - r;
+               int dg = base_g - g;
+               int db = base_b - b;
+
+               int id = dr * dr + dg * dg + db * db;
+
+               if (id < best_err)
+               {
+                  best_err = id;
+               }
+            }
+
+            *pDistance += best_err;
+            if (*pDistance > early_out_error)
+               return;
+         }
+      }
+   };
+
+} // namespace
+
+using namespace CPPSPMD_NAME(basisu_kernels_namespace);
+// Free-function entry point: forwards its arguments unchanged to the perceptual_distance_rgb_4_N kernel through spmd_call.
+void CPPSPMD_NAME(perceptual_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{
+   spmd_call< perceptual_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+// Free-function entry point: forwards its arguments unchanged to the linear_distance_rgb_4_N kernel through spmd_call.
+void CPPSPMD_NAME(linear_distance_rgb_4_N)(int64_t* pDistance, const uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{
+   spmd_call< linear_distance_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+// Free-function entry point: forwards its arguments unchanged to the find_selectors_perceptual_rgb_4_N kernel through spmd_call.
+void CPPSPMD_NAME(find_selectors_perceptual_rgb_4_N)(int64_t *pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{
+   spmd_call< find_selectors_perceptual_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+// Free-function entry point: forwards its arguments unchanged to the find_selectors_linear_rgb_4_N kernel through spmd_call.
+void CPPSPMD_NAME(find_selectors_linear_rgb_4_N)(int64_t* pDistance, uint8_t* pSelectors, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_err)
+{
+   spmd_call< find_selectors_linear_rgb_4_N >(pDistance, pSelectors, pBlock_colors, pSrc_pixels, n, early_out_err);
+}
+// Free-function entry point: forwards its arguments unchanged to the find_lowest_error_perceptual_rgb_4_N kernel through spmd_call.
+void CPPSPMD_NAME(find_lowest_error_perceptual_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
+{
+   spmd_call< find_lowest_error_perceptual_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
+}
+// Free-function entry point: forwards its arguments unchanged to the find_lowest_error_linear_rgb_4_N kernel through spmd_call.
+void CPPSPMD_NAME(find_lowest_error_linear_rgb_4_N)(int64_t* pDistance, const color_rgba* pBlock_colors, const color_rgba* pSrc_pixels, uint32_t n, int64_t early_out_error)
+{
+   spmd_call< find_lowest_error_linear_rgb_4_N >(pDistance, pBlock_colors, pSrc_pixels, n, early_out_error);
+}
+
diff --git a/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
new file mode 100644
index 0000000000..12d2321f20
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_kernels_sse.cpp
@@ -0,0 +1,161 @@
+// basisu_kernels_sse.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_enc.h"
+
+#if BASISU_SUPPORT_SSE
+
+#define CPPSPMD_SSE2 0
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#if !defined(_MSC_VER)
+	#if __AVX__ || __AVX2__ || __AVX512F__
+		#error Please check your compiler options
+	#endif
+	
+	#if CPPSPMD_SSE2
+		#if __SSE4_1__ || __SSE3__ || __SSE4_2__ || __SSSE3__
+			#error SSE4.1/SSE3/SSE4.2/SSSE3 cannot be enabled to use this file
+		#endif
+	#else
+		#if !__SSE4_1__ || !__SSE3__ || __SSE4_2__ || !__SSSE3__
+			#error Please check your compiler options
+		#endif
+	#endif
+#endif
+
+#include "cppspmd_sse.h"
+
+#include "cppspmd_type_aliases.h"
+
+using namespace basisu;
+
+#include "basisu_kernels_declares.h"
+#include "basisu_kernels_imp.h"
+
+namespace basisu
+{
+
+struct cpu_info // x86 feature flags; every flag is false until one of the extract_* helpers sets it
+{
+	cpu_info() : m_has_fpu(false), m_has_mmx(false), m_has_sse(false), m_has_sse2(false), m_has_sse3(false), m_has_ssse3(false), m_has_sse41(false), m_has_sse42(false), m_has_avx(false), m_has_avx2(false), m_has_pclmulqdq(false) {}
+
+	bool m_has_fpu;
+	bool m_has_mmx;
+	bool m_has_sse;
+	bool m_has_sse2;
+	bool m_has_sse3;
+	bool m_has_ssse3;
+	bool m_has_sse41;
+	bool m_has_sse42;
+	bool m_has_avx;
+	bool m_has_avx2;
+	bool m_has_pclmulqdq;
+};
+
+static void extract_x86_flags(cpu_info &info, uint32_t ecx, uint32_t edx) // decodes individual feature bits from the ecx/edx words passed in
+{
+	info.m_has_fpu = ((edx >> 0) & 1U) != 0; // edx bit 0
+	info.m_has_mmx = ((edx >> 23) & 1U) != 0; // edx bit 23
+	info.m_has_sse = ((edx >> 25) & 1U) != 0; // edx bit 25
+	info.m_has_sse2 = ((edx >> 26) & 1U) != 0; // edx bit 26
+	info.m_has_sse3 = ((ecx >> 0) & 1U) != 0; // ecx bit 0
+	info.m_has_pclmulqdq = ((ecx >> 1) & 1U) != 0; // ecx bit 1
+	info.m_has_ssse3 = ((ecx >> 9) & 1U) != 0; // ecx bit 9
+	info.m_has_sse41 = ((ecx >> 19) & 1U) != 0; // ecx bit 19
+	info.m_has_sse42 = ((ecx >> 20) & 1U) != 0; // ecx bit 20
+	info.m_has_avx = ((ecx >> 28) & 1U) != 0; // ecx bit 28
+}
+
+static void extract_x86_extended_flags(cpu_info &info, uint32_t ebx) // decodes the AVX2 flag from the ebx word passed in
+{
+	info.m_has_avx2 = ((ebx >> 5) & 1U) != 0; // ebx bit 5
+}
+
+#ifndef _MSC_VER
+static void do_cpuid(uint32_t eax, uint32_t ecx, uint32_t* regs) // executes CPUID with the given eax/ecx inputs and stores eax, ebx, ecx, edx into regs[0..3]
+{
+	uint32_t ebx = 0, edx = 0;
+
+#if defined(__PIC__) && defined(__i386__) // variant that leaves ebx unchanged for the surrounding code (presumably because i386 PIC code reserves ebx -- TODO confirm)
+	__asm__("movl %%ebx, %%edi;" // stash the incoming ebx in edi
+		"cpuid;"
+		"xchgl %%ebx, %%edi;" // swap back: ebx is restored, edi now holds the ebx value cpuid produced
+		: "=D"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx)); // the local named ebx is read out of edi here
+#else
+	__asm__("cpuid;" : "+b"(ebx), "+a"(eax), "+c"(ecx), "=d"(edx));
+#endif
+
+	regs[0] = eax; regs[1] = ebx; regs[2] = ecx; regs[3] = edx;
+}
+#endif
+
+static void get_cpuinfo(cpu_info &info) // queries CPUID leaves 0, 1 and 7 and records the decoded feature flags in info
+{
+	int cpuid_out[4]; // register values from the most recent CPUID query, indexed eax, ebx, ecx, edx
+
+#ifdef _MSC_VER
+	__cpuid(cpuid_out, 0);
+#else
+	do_cpuid(0, 0, reinterpret_cast<uint32_t *>(cpuid_out));
+#endif
+
+	const uint32_t highest_leaf = static_cast<uint32_t>(cpuid_out[0]); // eax of leaf 0 bounds which leaves are queried below
+
+	if (1U <= highest_leaf)
+	{
+#ifdef _MSC_VER
+		__cpuid(cpuid_out, 1);
+#else
+		do_cpuid(1, 0, reinterpret_cast<uint32_t *>(cpuid_out));
+#endif
+		extract_x86_flags(info, cpuid_out[2], cpuid_out[3]);
+	}
+
+	if (7U <= highest_leaf)
+	{
+#ifdef _MSC_VER
+		__cpuidex(cpuid_out, 7, 0);
+#else
+		do_cpuid(7, 0, reinterpret_cast<uint32_t *>(cpuid_out));
+#endif
+
+		extract_x86_extended_flags(info, cpuid_out[1]);
+	}
+}
+
+void detect_sse41() // publishes the CPUID-based SSE4.1 verdict in g_cpu_supports_sse41
+{
+	cpu_info cpu;
+	get_cpuinfo(cpu);
+	// SSE4.1 only counts as supported when every level from SSE up to SSE4.1 is reported.
+	const bool has_sse_through_sse41 = cpu.m_has_sse && cpu.m_has_sse2 && cpu.m_has_sse3 && cpu.m_has_ssse3 && cpu.m_has_sse41;
+	g_cpu_supports_sse41 = has_sse_through_sse41;
+}
+
+} // namespace basisu
+#else // #if BASISU_SUPPORT_SSE
+namespace basisu
+{
+
+void detect_sse41() // intentionally empty: this definition is used when BASISU_SUPPORT_SSE is off, so no detection is performed
+{
+}
+
+} // namespace basisu
+#endif // #if BASISU_SUPPORT_SSE
+
diff --git a/thirdparty/basis_universal/encoder/basisu_miniz.h b/thirdparty/basis_universal/encoder/basisu_miniz.h
new file mode 100644
index 0000000000..8627abe893
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_miniz.h
@@ -0,0 +1,2514 @@
+/* miniz.c v1.15 - deflate/inflate, zlib-subset, ZIP reading/writing/appending, PNG writing
+   Implements RFC 1950: http://www.ietf.org/rfc/rfc1950.txt and RFC 1951: http://www.ietf.org/rfc/rfc1951.txt
+  
+   Forked from the public domain/unlicense version at: https://code.google.com/archive/p/miniz/ 
+   
+   Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+#ifndef MINIZ_HEADER_INCLUDED
+#define MINIZ_HEADER_INCLUDED
+
+#include <stdlib.h>
+
+// Defines to completely disable specific portions of miniz.c:
+// If all macros here are defined the only functionality remaining will be CRC-32, adler-32, tinfl, and tdefl.
+
+// Define MINIZ_NO_STDIO to disable all usage and any functions which rely on stdio for file I/O.
+//#define MINIZ_NO_STDIO
+
+// If MINIZ_NO_TIME is specified then the ZIP archive functions will not be able to get the current time, or
+// get/set file times, and the C run-time funcs that get/set times won't be called.
+// The current downside is the times written to your archives will be from 1979.
+//#define MINIZ_NO_TIME
+
+// Define MINIZ_NO_ARCHIVE_APIS to disable all ZIP archive API's.
+//#define MINIZ_NO_ARCHIVE_APIS
+
+// Define MINIZ_NO_ARCHIVE_WRITING_APIS to disable all writing related ZIP archive API's.
+//#define MINIZ_NO_ARCHIVE_WRITING_APIS
+
+// Define MINIZ_NO_ZLIB_APIS to remove all ZLIB-style compression/decompression API's.
+//#define MINIZ_NO_ZLIB_APIS
+
+// Define MINIZ_NO_ZLIB_COMPATIBLE_NAMES to disable zlib names, to prevent conflicts against stock zlib.
+//#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+
+// Define MINIZ_NO_MALLOC to disable all calls to malloc, free, and realloc.
+// Note if MINIZ_NO_MALLOC is defined then the user must always provide custom user alloc/free/realloc
+// callbacks to the zlib and archive API's, and a few stand-alone helper API's which don't provide custom user
+// functions (such as tdefl_compress_mem_to_heap() and tinfl_decompress_mem_to_heap()) won't work.
+//#define MINIZ_NO_MALLOC
+
+#if defined(__TINYC__) && (defined(__linux) || defined(__linux__))
+  // TODO: Work around "error: include file 'sys\utime.h'" when compiling with tcc on Linux
+  #define MINIZ_NO_TIME
+#endif
+
+#if !defined(MINIZ_NO_TIME) && !defined(MINIZ_NO_ARCHIVE_APIS)
+  #include <time.h>
+#endif
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__i386) || defined(__i486__) || defined(__i486) || defined(i386) || defined(__ia64__) || defined(__x86_64__)
+// MINIZ_X86_OR_X64_CPU is only used to help set the below macros.
+#define MINIZ_X86_OR_X64_CPU 1
+#endif
+
+#if (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__)) || MINIZ_X86_OR_X64_CPU
+// Set MINIZ_LITTLE_ENDIAN to 1 if the processor is little endian. The defined() guards are required: identifiers that are not macros evaluate to 0 inside #if, so an unguarded comparison is 0==0 (always true) on compilers that lack these macros.
+#define MINIZ_LITTLE_ENDIAN 1
+#endif
+
+#if MINIZ_X86_OR_X64_CPU
+// Set MINIZ_USE_UNALIGNED_LOADS_AND_STORES to 1 on CPU's that permit efficient integer loads and stores from unaligned addresses.
+#define MINIZ_USE_UNALIGNED_LOADS_AND_STORES 1
+#endif
+
+#if defined(_M_X64) || defined(_WIN64) || defined(__MINGW64__) || defined(_LP64) || defined(__LP64__) || defined(__ia64__) || defined(__x86_64__)
+// Set MINIZ_HAS_64BIT_REGISTERS to 1 if operations on 64-bit integers are reasonably fast (and don't involve compiler generated calls to helper functions).
+#define MINIZ_HAS_64BIT_REGISTERS 1
+#endif
+
+namespace buminiz {
+
+// ------------------- zlib-style API Definitions.
+
+// For more compatibility with zlib, miniz.c uses unsigned long for some parameters/struct members. Beware: mz_ulong can be either 32 or 64-bits!
+typedef unsigned long mz_ulong;
+
+// mz_free() internally uses the MZ_FREE() macro (which by default calls free() unless you've modified the MZ_MALLOC macro) to release a block allocated from the heap.
+void mz_free(void *p);
+
+#define MZ_ADLER32_INIT (1)
+// mz_adler32() returns the initial adler-32 value to use when called with ptr==NULL.
+mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len);
+
+#define MZ_CRC32_INIT (0)
+// mz_crc32() returns the initial CRC-32 value to use when called with ptr==NULL.
+mz_ulong mz_crc32(mz_ulong crc, const unsigned char *ptr, size_t buf_len);
+
+// Compression strategies.
+enum { MZ_DEFAULT_STRATEGY = 0, MZ_FILTERED = 1, MZ_HUFFMAN_ONLY = 2, MZ_RLE = 3, MZ_FIXED = 4 };
+
+// Method
+#define MZ_DEFLATED 8
+
+#ifndef MINIZ_NO_ZLIB_APIS
+
+// Heap allocation callbacks.
+// Note that mz_alloc_func parameter types purposely differ from zlib's: items/size is size_t, not unsigned long.
+typedef void *(*mz_alloc_func)(void *opaque, size_t items, size_t size);
+typedef void (*mz_free_func)(void *opaque, void *address);
+typedef void *(*mz_realloc_func)(void *opaque, void *address, size_t items, size_t size);
+
+#define MZ_VERSION          "9.1.15"
+#define MZ_VERNUM           0x91F0
+#define MZ_VER_MAJOR        9
+#define MZ_VER_MINOR        1
+#define MZ_VER_REVISION     15
+#define MZ_VER_SUBREVISION  0
+
+// Flush values. For typical usage you only need MZ_NO_FLUSH and MZ_FINISH. The other values are for advanced use (refer to the zlib docs).
+enum { MZ_NO_FLUSH = 0, MZ_PARTIAL_FLUSH = 1, MZ_SYNC_FLUSH = 2, MZ_FULL_FLUSH = 3, MZ_FINISH = 4, MZ_BLOCK = 5 };
+
+// Return status codes. MZ_PARAM_ERROR is non-standard.
+enum { MZ_OK = 0, MZ_STREAM_END = 1, MZ_NEED_DICT = 2, MZ_ERRNO = -1, MZ_STREAM_ERROR = -2, MZ_DATA_ERROR = -3, MZ_MEM_ERROR = -4, MZ_BUF_ERROR = -5, MZ_VERSION_ERROR = -6, MZ_PARAM_ERROR = -10000 };
+
+// Compression levels: 0-9 are the standard zlib-style levels, 10 is best possible compression (not zlib compatible, and may be very slow), MZ_DEFAULT_COMPRESSION=MZ_DEFAULT_LEVEL.
+enum { MZ_NO_COMPRESSION = 0, MZ_BEST_SPEED = 1, MZ_BEST_COMPRESSION = 9, MZ_UBER_COMPRESSION = 10, MZ_DEFAULT_LEVEL = 6, MZ_DEFAULT_COMPRESSION = -1 };
+
+// Window bits
+#define MZ_DEFAULT_WINDOW_BITS 15
+
+struct mz_internal_state;
+
+// Compression/decompression stream struct.
+typedef struct mz_stream_s
+{
+  const unsigned char *next_in;     // pointer to next byte to read
+  unsigned int avail_in;            // number of bytes available at next_in
+  mz_ulong total_in;                // total number of bytes consumed so far
+
+  unsigned char *next_out;          // pointer to next byte to write
+  unsigned int avail_out;           // number of bytes that can be written to next_out
+  mz_ulong total_out;               // total number of bytes produced so far
+
+  char *msg;                        // error msg (unused)
+  struct mz_internal_state *state;  // internal state, allocated by zalloc/zfree
+
+  mz_alloc_func zalloc;             // optional heap allocation function (defaults to malloc)
+  mz_free_func zfree;               // optional heap free function (defaults to free)
+  void *opaque;                     // heap alloc function user pointer
+
+  int data_type;                    // data_type (unused)
+  mz_ulong adler;                   // adler32 of the source or uncompressed data
+  mz_ulong reserved;                // not used
+} mz_stream;
+
+typedef mz_stream *mz_streamp;
+
+// Returns the version string of miniz.c.
+const char *mz_version(void);
+
+// mz_deflateInit() initializes a compressor with default options:
+// Parameters:
+//  pStream must point to an initialized mz_stream struct.
+//  level must be between [MZ_NO_COMPRESSION, MZ_BEST_COMPRESSION].
+//  level 1 enables a specially optimized compression function that's been optimized purely for performance, not ratio.
+//  (This special func. is currently only enabled when MINIZ_USE_UNALIGNED_LOADS_AND_STORES and MINIZ_LITTLE_ENDIAN are defined.)
+// Return values:
+//  MZ_OK on success.
+//  MZ_STREAM_ERROR if the stream is bogus.
+//  MZ_PARAM_ERROR if the input parameters are bogus.
+//  MZ_MEM_ERROR on out of memory.
+int mz_deflateInit(mz_streamp pStream, int level);
+
+// mz_deflateInit2() is like mz_deflateInit(), except with more control:
+// Additional parameters:
+//   method must be MZ_DEFLATED
+//   window_bits must be MZ_DEFAULT_WINDOW_BITS (to wrap the deflate stream with zlib header/adler-32 footer) or -MZ_DEFAULT_WINDOW_BITS (raw deflate/no header or footer)
+//   mem_level must be between [1, 9] (it's checked but ignored by miniz.c)
+int mz_deflateInit2(mz_streamp pStream, int level, int method, int window_bits, int mem_level, int strategy);
+
+// Quickly resets a compressor without having to reallocate anything. Same as calling mz_deflateEnd() followed by mz_deflateInit()/mz_deflateInit2().
+int mz_deflateReset(mz_streamp pStream);
+
+// mz_deflate() compresses the input to output, consuming as much of the input and producing as much output as possible.
+// Parameters:
+//   pStream is the stream to read from and write to. You must initialize/update the next_in, avail_in, next_out, and avail_out members.
+//   flush may be MZ_NO_FLUSH, MZ_PARTIAL_FLUSH/MZ_SYNC_FLUSH, MZ_FULL_FLUSH, or MZ_FINISH.
+// Return values:
+//   MZ_OK on success (when flushing, or if more input is needed but not available, and/or there's more output to be written but the output buffer is full).
+//   MZ_STREAM_END if all input has been consumed and all output bytes have been written. Don't call mz_deflate() on the stream anymore.
+//   MZ_STREAM_ERROR if the stream is bogus.
+//   MZ_PARAM_ERROR if one of the parameters is invalid.
+//   MZ_BUF_ERROR if no forward progress is possible because the input and/or output buffers are empty. (Fill up the input buffer or free up some output space and try again.)
+int mz_deflate(mz_streamp pStream, int flush);
+
+// mz_deflateEnd() deinitializes a compressor:
+// Return values:
+//  MZ_OK on success.
+//  MZ_STREAM_ERROR if the stream is bogus.
+int mz_deflateEnd(mz_streamp pStream);
+
+// mz_deflateBound() returns a (very) conservative upper bound on the amount of data that could be generated by deflate(), assuming flush is set to only MZ_NO_FLUSH or MZ_FINISH.
+mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len);
+
+// Single-call compression functions mz_compress() and mz_compress2():
+// Returns MZ_OK on success, or one of the error codes from mz_deflate() on failure.
+int mz_compress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len);
+int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len, int level);
+
+// mz_compressBound() returns a (very) conservative upper bound on the amount of data that could be generated by calling mz_compress().
+mz_ulong mz_compressBound(mz_ulong source_len);
+
+// Initializes a decompressor.
+int mz_inflateInit(mz_streamp pStream);
+
+// mz_inflateInit2() is like mz_inflateInit() with an additional option that controls the window size and whether or not the stream has been wrapped with a zlib header/footer:
+// window_bits must be MZ_DEFAULT_WINDOW_BITS (to parse zlib header/footer) or -MZ_DEFAULT_WINDOW_BITS (raw deflate).
+int mz_inflateInit2(mz_streamp pStream, int window_bits);
+
+// Decompresses the input stream to the output, consuming only as much of the input as needed, and writing as much to the output as possible.
+// Parameters:
+//   pStream is the stream to read from and write to. You must initialize/update the next_in, avail_in, next_out, and avail_out members.
+//   flush may be MZ_NO_FLUSH, MZ_SYNC_FLUSH, or MZ_FINISH.
+//   On the first call, if flush is MZ_FINISH it's assumed the input and output buffers are both sized large enough to decompress the entire stream in a single call (this is slightly faster).
+//   MZ_FINISH implies that there are no more source bytes available beside what's already in the input buffer, and that the output buffer is large enough to hold the rest of the decompressed data.
+// Return values:
+//   MZ_OK on success. Either more input is needed but not available, and/or there's more output to be written but the output buffer is full.
+//   MZ_STREAM_END if all needed input has been consumed and all output bytes have been written. For zlib streams, the adler-32 of the decompressed data has also been verified.
+//   MZ_STREAM_ERROR if the stream is bogus.
+//   MZ_DATA_ERROR if the deflate stream is invalid.
+//   MZ_PARAM_ERROR if one of the parameters is invalid.
+//   MZ_BUF_ERROR if no forward progress is possible because the input buffer is empty but the inflater needs more input to continue, or if the output buffer is not large enough. Call mz_inflate() again
+//   with more input data, or with more room in the output buffer (except when using single call decompression, described above).
+int mz_inflate(mz_streamp pStream, int flush);
+
+// Deinitializes a decompressor.
+int mz_inflateEnd(mz_streamp pStream);
+
+// Single-call decompression.
+// Returns MZ_OK on success, or one of the error codes from mz_inflate() on failure.
+int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len);
+
+// Returns a string description of the specified error code, or NULL if the error code is invalid.
+const char *mz_error(int err);
+
+// Redefine zlib-compatible names to miniz equivalents, so miniz.c can be used as a drop-in replacement for the subset of zlib that miniz.c supports.
+// Define MINIZ_NO_ZLIB_COMPATIBLE_NAMES to disable zlib-compatibility if you use zlib in the same project.
+#ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+  typedef unsigned char Byte;
+  typedef unsigned int uInt;
+  typedef mz_ulong uLong;
+  typedef Byte Bytef;
+  typedef uInt uIntf;
+  typedef char charf;
+  typedef int intf;
+  typedef void *voidpf;
+  typedef uLong uLongf;
+  typedef void *voidp;
+  typedef void *const voidpc;
+  #define Z_NULL                0
+  #define Z_NO_FLUSH            MZ_NO_FLUSH
+  #define Z_PARTIAL_FLUSH       MZ_PARTIAL_FLUSH
+  #define Z_SYNC_FLUSH          MZ_SYNC_FLUSH
+  #define Z_FULL_FLUSH          MZ_FULL_FLUSH
+  #define Z_FINISH              MZ_FINISH
+  #define Z_BLOCK               MZ_BLOCK
+  #define Z_OK                  MZ_OK
+  #define Z_STREAM_END          MZ_STREAM_END
+  #define Z_NEED_DICT           MZ_NEED_DICT
+  #define Z_ERRNO               MZ_ERRNO
+  #define Z_STREAM_ERROR        MZ_STREAM_ERROR
+  #define Z_DATA_ERROR          MZ_DATA_ERROR
+  #define Z_MEM_ERROR           MZ_MEM_ERROR
+  #define Z_BUF_ERROR           MZ_BUF_ERROR
+  #define Z_VERSION_ERROR       MZ_VERSION_ERROR
+  #define Z_PARAM_ERROR         MZ_PARAM_ERROR
+  #define Z_NO_COMPRESSION      MZ_NO_COMPRESSION
+  #define Z_BEST_SPEED          MZ_BEST_SPEED
+  #define Z_BEST_COMPRESSION    MZ_BEST_COMPRESSION
+  #define Z_DEFAULT_COMPRESSION MZ_DEFAULT_COMPRESSION
+  #define Z_DEFAULT_STRATEGY    MZ_DEFAULT_STRATEGY
+  #define Z_FILTERED            MZ_FILTERED
+  #define Z_HUFFMAN_ONLY        MZ_HUFFMAN_ONLY
+  #define Z_RLE                 MZ_RLE
+  #define Z_FIXED               MZ_FIXED
+  #define Z_DEFLATED            MZ_DEFLATED
+  #define Z_DEFAULT_WINDOW_BITS MZ_DEFAULT_WINDOW_BITS
+  #define alloc_func            mz_alloc_func
+  #define free_func             mz_free_func
+  #define internal_state        mz_internal_state
+  #define z_stream              mz_stream
+  #define deflateInit           mz_deflateInit
+  #define deflateInit2          mz_deflateInit2
+  #define deflateReset          mz_deflateReset
+  #define deflate               mz_deflate
+  #define deflateEnd            mz_deflateEnd
+  #define deflateBound          mz_deflateBound
+  #define compress              mz_compress
+  #define compress2             mz_compress2
+  #define compressBound         mz_compressBound
+  #define inflateInit           mz_inflateInit
+  #define inflateInit2          mz_inflateInit2
+  #define inflate               mz_inflate
+  #define inflateEnd            mz_inflateEnd
+  #define uncompress            mz_uncompress
+  #define crc32                 mz_crc32
+  #define adler32               mz_adler32
+  #define MAX_WBITS             15
+  #define MAX_MEM_LEVEL         9
+  #define zError                mz_error
+  #define ZLIB_VERSION          MZ_VERSION
+  #define ZLIB_VERNUM           MZ_VERNUM
+  #define ZLIB_VER_MAJOR        MZ_VER_MAJOR
+  #define ZLIB_VER_MINOR        MZ_VER_MINOR
+  #define ZLIB_VER_REVISION     MZ_VER_REVISION
+  #define ZLIB_VER_SUBREVISION  MZ_VER_SUBREVISION
+  #define zlibVersion           mz_version
+  #define zlib_version          mz_version()
+#endif // #ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+
+#endif // MINIZ_NO_ZLIB_APIS
+
+// ------------------- Types and macros
+
+typedef unsigned char mz_uint8;
+typedef signed short mz_int16;
+typedef unsigned short mz_uint16;
+typedef unsigned int mz_uint32;
+typedef unsigned int mz_uint;
+typedef long long mz_int64;
+typedef unsigned long long mz_uint64;
+typedef int mz_bool;
+
+#define MZ_FALSE (0)
+#define MZ_TRUE (1)
+
+// An attempt to work around MSVC's spammy "warning C4127: conditional expression is constant" message.
+#ifdef _MSC_VER
+   #define MZ_MACRO_END while (0, 0)
+#else
+   #define MZ_MACRO_END while (0)
+#endif
+
+// ------------------- Low-level Decompression API Definitions
+
+// Decompression flags used by tinfl_decompress().
+// TINFL_FLAG_PARSE_ZLIB_HEADER: If set, the input has a valid zlib header and ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the input is a raw deflate stream.
+// TINFL_FLAG_HAS_MORE_INPUT: If set, there are more input bytes available beyond the end of the supplied input buffer. If clear, the input buffer contains all remaining input.
+// TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF: If set, the output buffer is large enough to hold the entire decompressed stream. If clear, the output buffer is at least the size of the dictionary (typically 32KB).
+// TINFL_FLAG_COMPUTE_ADLER32: Force adler-32 checksum computation of the decompressed bytes.
+enum
+{
+  TINFL_FLAG_PARSE_ZLIB_HEADER = 1,
+  TINFL_FLAG_HAS_MORE_INPUT = 2,
+  TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF = 4,
+  TINFL_FLAG_COMPUTE_ADLER32 = 8
+};
+
+// High level decompression functions:
+// tinfl_decompress_mem_to_heap() decompresses a block in memory to a heap block allocated via malloc().
+// On entry:
+//  pSrc_buf, src_buf_len: Pointer and size of the Deflate or zlib source data to decompress.
+// On return:
+//  Function returns a pointer to the decompressed data, or NULL on failure.
+//  *pOut_len will be set to the decompressed data's size, which could be larger than src_buf_len on uncompressible data.
+//  The caller must call mz_free() on the returned block when it's no longer needed.
+void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags);
+
+// tinfl_decompress_mem_to_mem() decompresses a block in memory to another block in memory.
+// Returns TINFL_DECOMPRESS_MEM_TO_MEM_FAILED on failure, or the number of bytes written on success.
+#define TINFL_DECOMPRESS_MEM_TO_MEM_FAILED ((size_t)(-1))
+size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags);
+
+// tinfl_decompress_mem_to_callback() decompresses a block in memory to an internal 32KB buffer, and a user provided callback function will be called to flush the buffer.
+// Returns 1 on success or 0 on failure.
+typedef int (*tinfl_put_buf_func_ptr)(const void* pBuf, int len, void *pUser);
+int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size, tinfl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
+
+struct tinfl_decompressor_tag; typedef struct tinfl_decompressor_tag tinfl_decompressor;
+
+// Max size of LZ dictionary.
+#define TINFL_LZ_DICT_SIZE 32768
+
+// Return status.
+typedef enum
+{
+  TINFL_STATUS_BAD_PARAM = -3,
+  TINFL_STATUS_ADLER32_MISMATCH = -2,
+  TINFL_STATUS_FAILED = -1,
+  TINFL_STATUS_DONE = 0,
+  TINFL_STATUS_NEEDS_MORE_INPUT = 1,
+  TINFL_STATUS_HAS_MORE_OUTPUT = 2
+} tinfl_status;
+
+// Initializes the decompressor to its initial state.
+#define tinfl_init(r) do { (r)->m_state = 0; } MZ_MACRO_END
+#define tinfl_get_adler32(r) (r)->m_check_adler32
+
+// Main low-level decompressor coroutine function. This is the only function actually needed for decompression. All the other functions are just high-level helpers for improved usability.
+// This is a universal API, i.e. it can be used as a building block to build any desired higher level decompression API. In the limit case, it can be called once per every byte input or output.
+tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn_buf_next, size_t *pIn_buf_size, mz_uint8 *pOut_buf_start, mz_uint8 *pOut_buf_next, size_t *pOut_buf_size, const mz_uint32 decomp_flags);
+
+// Internal/private bits follow.
+enum
+{
+  TINFL_MAX_HUFF_TABLES = 3, TINFL_MAX_HUFF_SYMBOLS_0 = 288, TINFL_MAX_HUFF_SYMBOLS_1 = 32, TINFL_MAX_HUFF_SYMBOLS_2 = 19,
+  TINFL_FAST_LOOKUP_BITS = 10, TINFL_FAST_LOOKUP_SIZE = 1 << TINFL_FAST_LOOKUP_BITS
+};
+
+typedef struct
+{
+  mz_uint8 m_code_size[TINFL_MAX_HUFF_SYMBOLS_0];
+  mz_int16 m_look_up[TINFL_FAST_LOOKUP_SIZE], m_tree[TINFL_MAX_HUFF_SYMBOLS_0 * 2];
+} tinfl_huff_table;
+
+#if MINIZ_HAS_64BIT_REGISTERS
+  #define TINFL_USE_64BIT_BITBUF 1
+#endif
+
+#if TINFL_USE_64BIT_BITBUF
+  typedef mz_uint64 tinfl_bit_buf_t;
+  #define TINFL_BITBUF_SIZE (64)
+#else
+  typedef mz_uint32 tinfl_bit_buf_t;
+  #define TINFL_BITBUF_SIZE (32)
+#endif
+
+struct tinfl_decompressor_tag
+{
+  mz_uint32 m_state, m_num_bits, m_zhdr0, m_zhdr1, m_z_adler32, m_final, m_type, m_check_adler32, m_dist, m_counter, m_num_extra, m_table_sizes[TINFL_MAX_HUFF_TABLES];
+  tinfl_bit_buf_t m_bit_buf;
+  size_t m_dist_from_out_buf_start;
+  tinfl_huff_table m_tables[TINFL_MAX_HUFF_TABLES];
+  mz_uint8 m_raw_header[4], m_len_codes[TINFL_MAX_HUFF_SYMBOLS_0 + TINFL_MAX_HUFF_SYMBOLS_1 + 137];
+};
+
+// ------------------- Low-level Compression API Definitions
+
+// Set TDEFL_LESS_MEMORY to 1 to use less memory (compression will be slightly slower, and raw/dynamic blocks will be output more frequently).
+#define TDEFL_LESS_MEMORY 0
+
+// tdefl_init() compression flags logically OR'd together (low 12 bits contain the max. number of probes per dictionary search):
+// TDEFL_DEFAULT_MAX_PROBES: The compressor defaults to 128 dictionary probes per dictionary search. 0=Huffman only, 1=Huffman+LZ (fastest/crap compression), 4095=Huffman+LZ (slowest/best compression).
+enum
+{
+  TDEFL_HUFFMAN_ONLY = 0, TDEFL_DEFAULT_MAX_PROBES = 128, TDEFL_MAX_PROBES_MASK = 0xFFF
+};
+
+// TDEFL_WRITE_ZLIB_HEADER: If set, the compressor outputs a zlib header before the deflate data, and the Adler-32 of the source data at the end. Otherwise, you'll get raw deflate data.
+// TDEFL_COMPUTE_ADLER32: Always compute the adler-32 of the input data (even when not writing zlib headers).
+// TDEFL_GREEDY_PARSING_FLAG: Set to use faster greedy parsing, instead of more efficient lazy parsing.
+// TDEFL_NONDETERMINISTIC_PARSING_FLAG: Enable to decrease the compressor's initialization time to the minimum, but the output may vary from run to run given the same input (depending on the contents of memory).
+// TDEFL_RLE_MATCHES: Only look for RLE matches (matches with a distance of 1)
+// TDEFL_FILTER_MATCHES: Discards matches <= 5 chars if enabled.
+// TDEFL_FORCE_ALL_STATIC_BLOCKS: Disable usage of optimized Huffman tables.
+// TDEFL_FORCE_ALL_RAW_BLOCKS: Only use raw (uncompressed) deflate blocks.
+// The low 12 bits are reserved to control the max # of hash probes per dictionary lookup (see TDEFL_MAX_PROBES_MASK).
+enum
+{
+  TDEFL_WRITE_ZLIB_HEADER             = 0x01000,
+  TDEFL_COMPUTE_ADLER32               = 0x02000,
+  TDEFL_GREEDY_PARSING_FLAG           = 0x04000,
+  TDEFL_NONDETERMINISTIC_PARSING_FLAG = 0x08000,
+  TDEFL_RLE_MATCHES                   = 0x10000,
+  TDEFL_FILTER_MATCHES                = 0x20000,
+  TDEFL_FORCE_ALL_STATIC_BLOCKS       = 0x40000,
+  TDEFL_FORCE_ALL_RAW_BLOCKS          = 0x80000
+};
+
+// High level compression functions:
+// tdefl_compress_mem_to_heap() compresses a block in memory to a heap block allocated via malloc().
+// On entry:
+//  pSrc_buf, src_buf_len: Pointer and size of source block to compress.
+//  flags: The max match finder probes (default is 128) logically OR'd against the above flags. Higher probes are slower but improve compression.
+// On return:
+//  Function returns a pointer to the compressed data, or NULL on failure.
+//  *pOut_len will be set to the compressed data's size, which could be larger than src_buf_len on uncompressible data.
+//  The caller must free() the returned block when it's no longer needed.
+void *tdefl_compress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags);
+
+// tdefl_compress_mem_to_mem() compresses a block in memory to another block in memory.
+// Returns 0 on failure.
+size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags);
+
+// Compresses an image to a compressed PNG file in memory.
+// On entry:
+//  pImage, w, h, and num_chans describe the image to compress. num_chans may be 1, 2, 3, or 4. 
+//  The image pitch in bytes per scanline will be w*num_chans. The leftmost pixel on the top scanline is stored first in memory.
+//  level may range from [0,10], use MZ_NO_COMPRESSION, MZ_BEST_SPEED, MZ_BEST_COMPRESSION, etc. or a decent default is MZ_DEFAULT_LEVEL
+//  If flip is true, the image will be flipped on the Y axis (useful for OpenGL apps).
+// On return:
+//  Function returns a pointer to the compressed data, or NULL on failure.
+//  *pLen_out will be set to the size of the PNG image file.
+//  The caller must mz_free() the returned heap block (which will typically be larger than *pLen_out) when it's no longer needed.
+void *tdefl_write_image_to_png_file_in_memory_ex(const void *pImage, int w, int h, int num_chans, size_t *pLen_out, mz_uint level, mz_bool flip);
+void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h, int num_chans, size_t *pLen_out);
+
+// Output stream interface. The compressor uses this interface to write compressed data. It'll typically be called with TDEFL_OUT_BUF_SIZE bytes at a time.
+typedef mz_bool (*tdefl_put_buf_func_ptr)(const void* pBuf, int len, void *pUser);
+
+// tdefl_compress_mem_to_output() compresses a block to an output stream. The above helpers use this function internally.
+mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
+
+// Fixed limits used by the compressor: 3 Huffman tables with 288/32/19 symbols, a 32 KiB LZ dictionary (with its index mask), and match lengths from 3 to 258 bytes.
+enum { TDEFL_MAX_HUFF_TABLES = 3, TDEFL_MAX_HUFF_SYMBOLS_0 = 288, TDEFL_MAX_HUFF_SYMBOLS_1 = 32, TDEFL_MAX_HUFF_SYMBOLS_2 = 19, TDEFL_LZ_DICT_SIZE = 32768, TDEFL_LZ_DICT_SIZE_MASK = TDEFL_LZ_DICT_SIZE - 1, TDEFL_MIN_MATCH_LEN = 3, TDEFL_MAX_MATCH_LEN = 258 };
+
+// TDEFL_OUT_BUF_SIZE MUST be large enough to hold a single entire compressed output block (using static/fixed Huffman codes).
+// Both configurations size the output buffer at 13/10 of the LZ code buffer. TDEFL_LESS_MEMORY selects a 24 KiB code buffer and 12 hash bits
+// instead of a 64 KiB code buffer and 15 hash bits; the remaining constants are the same in both branches.
+#if TDEFL_LESS_MEMORY
+enum { TDEFL_LZ_CODE_BUF_SIZE = 24 * 1024, TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13 ) / 10, TDEFL_MAX_HUFF_SYMBOLS = 288, TDEFL_LZ_HASH_BITS = 12, TDEFL_LEVEL1_HASH_SIZE_MASK = 4095, TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3, TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS };
+#else
+enum { TDEFL_LZ_CODE_BUF_SIZE = 64 * 1024, TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13 ) / 10, TDEFL_MAX_HUFF_SYMBOLS = 288, TDEFL_LZ_HASH_BITS = 15, TDEFL_LEVEL1_HASH_SIZE_MASK = 4095, TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3, TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS };
+#endif
+
+// The low-level tdefl functions below may be used directly if the above helper functions aren't flexible enough. The low-level functions don't make any heap allocations, unlike the above helper functions.
+// Status codes returned by the tdefl functions. Negative values are failures: mz_deflate() maps any status < 0 to MZ_STREAM_ERROR,
+// and reports TDEFL_STATUS_DONE as MZ_STREAM_END.
+typedef enum
+{
+  TDEFL_STATUS_BAD_PARAM = -2,
+  TDEFL_STATUS_PUT_BUF_FAILED = -1,
+  TDEFL_STATUS_OKAY = 0,
+  TDEFL_STATUS_DONE = 1,
+} tdefl_status;
+
+// Must map to MZ_NO_FLUSH, MZ_SYNC_FLUSH, etc. enums
+// mz_deflate() casts its zlib-style flush argument directly to tdefl_flush, so these numeric values have to stay in sync with the MZ_* ones.
+typedef enum
+{
+  TDEFL_NO_FLUSH = 0,
+  TDEFL_SYNC_FLUSH = 2,
+  TDEFL_FULL_FLUSH = 3,
+  TDEFL_FINISH = 4
+} tdefl_flush;
+
+// tdefl's compression state structure.
+// All working buffers are embedded as fixed-size arrays (sized by the TDEFL_* enums), so one allocation of sizeof(tdefl_compressor)
+// holds the whole state; mz_deflateInit2() allocates exactly that.
+typedef struct
+{
+  // Output callback and its user pointer (same types as the corresponding tdefl_init() arguments).
+  tdefl_put_buf_func_ptr m_pPut_buf_func;
+  void *m_pPut_buf_user;
+  // m_flags is read back by mz_deflateReset() to re-initialize the compressor with the same settings.
+  mz_uint m_flags, m_max_probes[2];
+  int m_greedy_parsing;
+  mz_uint m_adler32, m_lookahead_pos, m_lookahead_size, m_dict_size;
+  mz_uint8 *m_pLZ_code_buf, *m_pLZ_flags, *m_pOutput_buf, *m_pOutput_buf_end;
+  mz_uint m_num_flags_left, m_total_lz_bytes, m_lz_code_buf_dict_pos, m_bits_in, m_bit_buffer;
+  mz_uint m_saved_match_dist, m_saved_match_len, m_saved_lit, m_output_flush_ofs, m_output_flush_remaining, m_finished, m_block_index, m_wants_to_finish;
+  // mz_deflate() checks this for TDEFL_STATUS_DONE to detect a stream that has already been finished.
+  tdefl_status m_prev_return_status;
+  // NOTE(review): the members below mirror the tdefl_compress() parameters and are presumably the per-call buffer bookkeeping - confirm in the implementation.
+  const void *m_pIn_buf;
+  void *m_pOut_buf;
+  size_t *m_pIn_buf_size, *m_pOut_buf_size;
+  tdefl_flush m_flush;
+  const mz_uint8 *m_pSrc;
+  size_t m_src_buf_left, m_out_buf_ofs;
+  // Embedded work areas.
+  mz_uint8 m_dict[TDEFL_LZ_DICT_SIZE + TDEFL_MAX_MATCH_LEN - 1];
+  mz_uint16 m_huff_count[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+  mz_uint16 m_huff_codes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+  mz_uint8 m_huff_code_sizes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+  mz_uint8 m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE];
+  mz_uint16 m_next[TDEFL_LZ_DICT_SIZE];
+  mz_uint16 m_hash[TDEFL_LZ_HASH_SIZE];
+  mz_uint8 m_output_buf[TDEFL_OUT_BUF_SIZE];
+} tdefl_compressor;
+
+// Initializes the compressor.
+// There is no corresponding deinit() function because the tdefl API's do not dynamically allocate memory.
+// pPut_buf_func: If non-NULL, output data will be supplied to the specified callback. In this case, the user should call the tdefl_compress_buffer() API for compression.
+// If pPut_buf_func is NULL the user should always call the tdefl_compress() API.
+// pPut_buf_user: user pointer accompanying the callback.
+// flags: See the above enums (TDEFL_HUFFMAN_ONLY, TDEFL_WRITE_ZLIB_HEADER, etc.)
+tdefl_status tdefl_init(tdefl_compressor *d, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags);
+
+// Compresses a block of data, consuming as much of the specified input buffer as possible, and writing as much compressed data to the specified output buffer as possible.
+// As used by mz_deflate(): on entry *pIn_buf_size and *pOut_buf_size hold the bytes available in each buffer; on return they hold the bytes consumed and written.
+tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf, size_t *pIn_buf_size, void *pOut_buf, size_t *pOut_buf_size, tdefl_flush flush);
+
+// tdefl_compress_buffer() is only usable when the tdefl_init() is called with a non-NULL tdefl_put_buf_func_ptr.
+// tdefl_compress_buffer() always consumes the entire input buffer.
+tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf, size_t in_buf_size, tdefl_flush flush);
+
+// NOTE(review): presumably returns the status of the most recent compress call (m_prev_return_status) - confirm in the implementation.
+tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d);
+// Returns the compressor's running Adler-32; mz_deflate() copies it into mz_stream::adler after every tdefl_compress() call.
+mz_uint32 tdefl_get_adler32(tdefl_compressor *d);
+
+// tdefl_create_comp_flags_from_zip_params is only available when MINIZ_NO_ZLIB_APIS is not defined, because it uses some of the zlib-style macros.
+#ifndef MINIZ_NO_ZLIB_APIS
+// Create tdefl_compress() flags given zlib-style compression parameters.
+// level may range from [0,10] (where 10 is absolute max compression, but may be much slower on some files)
+// window_bits may be -15 (raw deflate) or 15 (zlib)
+// strategy may be either MZ_DEFAULT_STRATEGY, MZ_FILTERED, MZ_HUFFMAN_ONLY, MZ_RLE, or MZ_FIXED
+mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits, int strategy);
+#endif // #ifndef MINIZ_NO_ZLIB_APIS
+
+} // namespace buminiz
+
+#endif // MINIZ_HEADER_INCLUDED
+
+// ------------------- End of Header: Implementation follows. (If you only want the header, define MINIZ_HEADER_FILE_ONLY.)
+
+#ifndef MINIZ_HEADER_FILE_ONLY
+
+#include <string.h>
+#include <assert.h>
+
+namespace buminiz {
+
+// Compile-time size checks: each typedef declares an array of size -1 (a compile error) if the corresponding integer type does not have the expected width.
+typedef unsigned char mz_validate_uint16[sizeof(mz_uint16)==2 ? 1 : -1];
+typedef unsigned char mz_validate_uint32[sizeof(mz_uint32)==4 ? 1 : -1];
+typedef unsigned char mz_validate_uint64[sizeof(mz_uint64)==8 ? 1 : -1];
+
+// Internal assertion; forwards to the C assert().
+#define MZ_ASSERT(x) assert(x)
+
+// Heap wrappers. With MINIZ_NO_MALLOC defined, allocation and reallocation always yield NULL and MZ_FREE only evaluates its argument.
+#ifdef MINIZ_NO_MALLOC
+  #define MZ_MALLOC(x) NULL
+  #define MZ_FREE(x) (void)x, ((void)0)
+  #define MZ_REALLOC(p, x) NULL
+#else
+  #define MZ_MALLOC(x) malloc(x)
+  #define MZ_FREE(x) free(x)
+  #define MZ_REALLOC(p, x) realloc(p, x)
+#endif
+
+// NOTE: MZ_MAX/MZ_MIN evaluate the selected argument twice - do not pass expressions with side effects.
+#define MZ_MAX(a,b) (((a)>(b))?(a):(b))
+#define MZ_MIN(a,b) (((a)<(b))?(a):(b))
+// Zero-fills an object in place; obj must be an lvalue whose sizeof is the full object size (i.e. not a pointer to it).
+#define MZ_CLEAR_OBJ(obj) memset(&(obj), 0, sizeof(obj))
+
+// Little-endian 16/32-bit loads: a direct (possibly unaligned) load when the target is little endian and allows it, otherwise assembled byte by byte.
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+  #define MZ_READ_LE16(p) *((const mz_uint16 *)(p))
+  #define MZ_READ_LE32(p) *((const mz_uint32 *)(p))
+#else
+  #define MZ_READ_LE16(p) ((mz_uint32)(((const mz_uint8 *)(p))[0]) | ((mz_uint32)(((const mz_uint8 *)(p))[1]) << 8U))
+  #define MZ_READ_LE32(p) ((mz_uint32)(((const mz_uint8 *)(p))[0]) | ((mz_uint32)(((const mz_uint8 *)(p))[1]) << 8U) | ((mz_uint32)(((const mz_uint8 *)(p))[2]) << 16U) | ((mz_uint32)(((const mz_uint8 *)(p))[3]) << 24U))
+#endif
+
+// Compiler-specific spelling of "always inline"; falls back to plain inline.
+#ifdef _MSC_VER
+  #define MZ_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+  #define MZ_FORCEINLINE inline __attribute__((__always_inline__))
+#else
+  #define MZ_FORCEINLINE inline
+#endif
+
+// ------------------- zlib-style API's
+
+// Updates a running Adler-32 checksum with buf_len bytes starting at ptr and returns the new checksum.
+// adler: previous checksum (low 16 bits are the byte sum, the bits above are the sum of sums).
+// ptr:   data to fold in; a NULL ptr returns MZ_ADLER32_INIT.
+// Both sums are reduced modulo 65521 after every chunk of at most 5552 bytes; the first chunk is (buf_len % 5552) bytes long.
+mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len)
+{
+  mz_uint32 lo = (mz_uint32)(adler & 0xffff);
+  mz_uint32 hi = (mz_uint32)(adler >> 16);
+  size_t chunk_len = buf_len % 5552;
+  mz_uint32 pos;
+
+  if (!ptr)
+    return MZ_ADLER32_INIT;
+
+  while (buf_len)
+  {
+    // Bulk part of the chunk: eight bytes per outer iteration.
+    for (pos = 0; pos + 7 < chunk_len; pos += 8, ptr += 8)
+    {
+      mz_uint32 k;
+      for (k = 0; k < 8; ++k)
+      {
+        lo += ptr[k];
+        hi += lo;
+      }
+    }
+    // Remaining bytes of the chunk, one at a time.
+    for ( ; pos < chunk_len; ++pos)
+    {
+      lo += *ptr++;
+      hi += lo;
+    }
+    lo %= 65521U;
+    hi %= 65521U;
+    buf_len -= chunk_len;
+    chunk_len = 5552;
+  }
+  return (hi << 16) + lo;
+}
+
+// Karl Malbrain's compact CRC-32. See "A compact CCITT crc16 and crc32 C implementation that balances processor cache usage against speed": http://www.geocities.com/malbrain/
+// Updates the running CRC-32 in crc with buf_len bytes from ptr, processing each byte as two 4-bit halves through a 16-entry table.
+// A NULL ptr returns MZ_CRC32_INIT.
+mz_ulong mz_crc32(mz_ulong crc, const mz_uint8 *ptr, size_t buf_len)
+{
+  static const mz_uint32 s_crc32[16] = { 0, 0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4, 0x4db26158, 0x5005713c,
+    0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c, 0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c };
+  mz_uint32 acc;
+
+  if (!ptr)
+    return MZ_CRC32_INIT;
+
+  acc = ~(mz_uint32)crc;
+  for ( ; buf_len; --buf_len)
+  {
+    const mz_uint8 cur = *ptr++;
+    acc = (acc >> 4) ^ s_crc32[(acc & 0xF) ^ (cur & 0xF)]; // low nibble
+    acc = (acc >> 4) ^ s_crc32[(acc & 0xF) ^ (cur >> 4)];  // high nibble
+  }
+  return ~acc;
+}
+
+// Releases a heap block through MZ_FREE (free(), unless MINIZ_NO_MALLOC is defined).
+void mz_free(void *p)
+{
+  MZ_FREE(p);
+}
+
+#ifndef MINIZ_NO_ZLIB_APIS
+
+// Default allocator, installed by the init functions when the caller leaves mz_stream::zalloc NULL.
+// Allocates items * size bytes through MZ_MALLOC; opaque is ignored.
+// Returns NULL when items * size would overflow size_t, so a wrapped-around (too small) block is never handed back.
+static void *def_alloc_func(void *opaque, size_t items, size_t size)
+{
+  (void)opaque, (void)items, (void)size;
+  if ((size != 0) && (items > ((size_t)-1) / size))
+    return NULL;
+  return MZ_MALLOC(items * size);
+}
+// Default deallocator, installed by the init functions when the caller leaves mz_stream::zfree NULL; opaque is ignored.
+static void def_free_func(void *opaque, void *address) { (void)opaque, (void)address; MZ_FREE(address); }
+//static void *def_realloc_func(void *opaque, void *address, size_t items, size_t size) { (void)opaque, (void)address, (void)items, (void)size; return MZ_REALLOC(address, items * size); }
+
+// Returns the library version string (the MZ_VERSION macro).
+const char *mz_version(void)
+{
+  return MZ_VERSION;
+}
+
+// Convenience initializer: forwards to mz_deflateInit2() with MZ_DEFLATED, MZ_DEFAULT_WINDOW_BITS, mem_level 9 and MZ_DEFAULT_STRATEGY.
+int mz_deflateInit(mz_streamp pStream, int level)
+{
+  return mz_deflateInit2(pStream, level, MZ_DEFLATED, MZ_DEFAULT_WINDOW_BITS, 9, MZ_DEFAULT_STRATEGY);
+}
+
+// zlib-style deflate initialization with explicit parameters.
+// Returns MZ_STREAM_ERROR for a NULL stream; MZ_PARAM_ERROR when method is not MZ_DEFLATED, mem_level is outside [1,9],
+// window_bits is not +/-MZ_DEFAULT_WINDOW_BITS, or tdefl_init() rejects the flags; MZ_MEM_ERROR when the state allocation fails; MZ_OK otherwise.
+// The stream's zalloc/zfree are defaulted when NULL, and the compressor state is allocated through zalloc.
+int mz_deflateInit2(mz_streamp pStream, int level, int method, int window_bits, int mem_level, int strategy)
+{
+  const mz_uint tdefl_flags = TDEFL_COMPUTE_ADLER32 | tdefl_create_comp_flags_from_zip_params(level, window_bits, strategy);
+  tdefl_compressor *pState;
+
+  if (!pStream)
+    return MZ_STREAM_ERROR;
+  if (method != MZ_DEFLATED)
+    return MZ_PARAM_ERROR;
+  if ((mem_level < 1) || (mem_level > 9))
+    return MZ_PARAM_ERROR;
+  if ((window_bits != MZ_DEFAULT_WINDOW_BITS) && (-window_bits != MZ_DEFAULT_WINDOW_BITS))
+    return MZ_PARAM_ERROR;
+
+  // Reset the caller-visible bookkeeping.
+  pStream->total_in = 0;
+  pStream->total_out = 0;
+  pStream->adler = MZ_ADLER32_INIT;
+  pStream->data_type = 0;
+  pStream->reserved = 0;
+  pStream->msg = NULL;
+
+  // Fall back to the built-in allocator pair where the caller supplied none.
+  if (!pStream->zalloc)
+    pStream->zalloc = def_alloc_func;
+  if (!pStream->zfree)
+    pStream->zfree = def_free_func;
+
+  pState = (tdefl_compressor *)pStream->zalloc(pStream->opaque, 1, sizeof(tdefl_compressor));
+  if (!pState)
+    return MZ_MEM_ERROR;
+  pStream->state = (struct mz_internal_state *)pState;
+
+  if (tdefl_init(pState, NULL, NULL, tdefl_flags) == TDEFL_STATUS_OKAY)
+    return MZ_OK;
+
+  // Initialization failed: release the state again.
+  mz_deflateEnd(pStream);
+  return MZ_PARAM_ERROR;
+}
+
+// Resets an initialized deflate stream so it can compress a new stream with the flags it was created with.
+// Returns MZ_STREAM_ERROR when the stream, its state, or either allocator callback is missing; MZ_OK otherwise.
+int mz_deflateReset(mz_streamp pStream)
+{
+  tdefl_compressor *pComp;
+
+  if (!pStream)
+    return MZ_STREAM_ERROR;
+  if ((!pStream->state) || (!pStream->zalloc) || (!pStream->zfree))
+    return MZ_STREAM_ERROR;
+
+  pStream->total_out = 0;
+  pStream->total_in = 0;
+
+  pComp = (tdefl_compressor*)pStream->state;
+  tdefl_init(pComp, NULL, NULL, pComp->m_flags);
+  return MZ_OK;
+}
+
+// zlib-style streaming deflate: repeatedly calls tdefl_compress() on the stream's current input/output windows.
+// Returns MZ_STREAM_ERROR for an invalid stream/flush value or a failing compressor, MZ_BUF_ERROR when there is no output space,
+// when the stream was already finished and flush is not MZ_FINISH, or when no progress is possible without more input,
+// MZ_STREAM_END once the compressor reports completion, and MZ_OK otherwise.
+int mz_deflate(mz_streamp pStream, int flush)
+{
+  tdefl_compressor *pComp;
+  mz_ulong total_in_at_entry, total_out_at_entry;
+
+  if ((!pStream) || (!pStream->state) || (flush < 0) || (flush > MZ_FINISH) || (!pStream->next_out))
+    return MZ_STREAM_ERROR;
+  if (!pStream->avail_out)
+    return MZ_BUF_ERROR;
+
+  // Partial flushes are handled as sync flushes.
+  if (flush == MZ_PARTIAL_FLUSH)
+    flush = MZ_SYNC_FLUSH;
+
+  pComp = (tdefl_compressor*)pStream->state;
+  if (pComp->m_prev_return_status == TDEFL_STATUS_DONE)
+    return (flush == MZ_FINISH) ? MZ_STREAM_END : MZ_BUF_ERROR;
+
+  total_in_at_entry = pStream->total_in;
+  total_out_at_entry = pStream->total_out;
+
+  for ( ; ; )
+  {
+    size_t consumed = pStream->avail_in;
+    size_t produced = pStream->avail_out;
+    const tdefl_status rc = tdefl_compress(pComp, pStream->next_in, &consumed, pStream->next_out, &produced, (tdefl_flush)flush);
+
+    // Advance the input side and refresh the checksum.
+    pStream->next_in += (mz_uint)consumed;
+    pStream->avail_in -= (mz_uint)consumed;
+    pStream->total_in += (mz_uint)consumed;
+    pStream->adler = tdefl_get_adler32(pComp);
+
+    // Advance the output side.
+    pStream->next_out += (mz_uint)produced;
+    pStream->avail_out -= (mz_uint)produced;
+    pStream->total_out += (mz_uint)produced;
+
+    if (rc < 0)
+      return MZ_STREAM_ERROR;
+    if (rc == TDEFL_STATUS_DONE)
+      return MZ_STREAM_END;
+    if (!pStream->avail_out)
+      return MZ_OK;
+    if ((pStream->avail_in) || (flush == MZ_FINISH))
+      continue;
+
+    // Input is exhausted and the caller is not finishing.
+    if ((flush) || (pStream->total_in != total_in_at_entry) || (pStream->total_out != total_out_at_entry))
+      return MZ_OK;
+    return MZ_BUF_ERROR; // Can't make forward progress without some input.
+  }
+}
+
+// Releases the deflate state through the stream's zfree callback and clears the state pointer.
+// Returns MZ_STREAM_ERROR for a NULL stream, MZ_OK otherwise (including when there is no state to free).
+int mz_deflateEnd(mz_streamp pStream)
+{
+  if (!pStream)
+    return MZ_STREAM_ERROR;
+  if (!pStream->state)
+    return MZ_OK;
+
+  pStream->zfree(pStream->opaque, pStream->state);
+  pStream->state = NULL;
+  return MZ_OK;
+}
+
+// Returns a conservative upper bound on the compressed size of source_len input bytes; pStream is unused.
+// The bound is the larger of (128 + 110% of source_len) and (128 + source_len + 5 bytes per started 31 KiB),
+// saturated to the largest mz_ulong value when it does not fit.
+mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len)
+{
+  (void)pStream;
+  // This is really over conservative. (And lame, but it's actually pretty tricky to compute a true upper bound given the way tdefl's blocking works.)
+  const mz_uint64 scaled_bound = 128ULL + (source_len * 110ULL) / 100ULL;
+  const mz_uint64 per_block_bound = 128ULL + (mz_uint64)source_len + ((source_len / (31 * 1024)) + 1ULL) * 5ULL;
+  const mz_uint64 bound = (scaled_bound > per_block_bound) ? scaled_bound : per_block_bound;
+
+  // Saturate when the 64-bit bound does not survive the narrowing to mz_ulong.
+  if (((mz_ulong)bound) != bound)
+    return (mz_ulong)(-1);
+
+  return (mz_ulong)bound;
+}
+
+// One-shot compression of pSource[0..source_len) into pDest at the given level.
+// On entry *pDest_len is the capacity of pDest; on success it receives the number of bytes written.
+// Returns MZ_PARAM_ERROR when either length exceeds 32 bits, any error from mz_deflateInit(),
+// MZ_BUF_ERROR when the output did not fit, or the result of mz_deflateEnd() on success.
+int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len, int level)
+{
+  mz_stream strm;
+  int rc;
+
+  memset(&strm, 0, sizeof(strm));
+
+  // In case mz_ulong is 64-bits (argh I hate longs).
+  if ((source_len | *pDest_len) > 0xFFFFFFFFU)
+    return MZ_PARAM_ERROR;
+
+  strm.next_out = pDest;
+  strm.avail_out = (mz_uint32)*pDest_len;
+  strm.next_in = pSource;
+  strm.avail_in = (mz_uint32)source_len;
+
+  rc = mz_deflateInit(&strm, level);
+  if (rc != MZ_OK)
+    return rc;
+
+  rc = mz_deflate(&strm, MZ_FINISH);
+  if (rc == MZ_STREAM_END)
+  {
+    *pDest_len = strm.total_out;
+    return mz_deflateEnd(&strm);
+  }
+
+  // Anything short of MZ_STREAM_END means the whole input could not be finished in one call.
+  mz_deflateEnd(&strm);
+  return (rc == MZ_OK) ? MZ_BUF_ERROR : rc;
+}
+
+// One-shot compression at MZ_DEFAULT_COMPRESSION; see mz_compress2() for the parameter and return conventions.
+int mz_compress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len)
+{
+  return mz_compress2(pDest, pDest_len, pSource, source_len, MZ_DEFAULT_COMPRESSION);
+}
+
+// Upper bound on the compressed size of source_len bytes; forwards to mz_deflateBound() with a NULL stream.
+mz_ulong mz_compressBound(mz_ulong source_len)
+{
+  return mz_deflateBound(NULL, source_len);
+}
+
+// Per-stream state behind the zlib-style inflate API (allocated by mz_inflateInit2(), stored in mz_stream::state).
+typedef struct
+{
+  tinfl_decompressor m_decomp;      // low-level decompressor
+  // m_dict_ofs/m_dict_avail: position and count of decompressed bytes in m_dict not yet copied to the caller.
+  // m_first_call: set by mz_inflateInit2(), cleared by the first mz_inflate() call.
+  // m_has_flushed: becomes non-zero once mz_inflate() has been called with MZ_FINISH.
+  // m_window_bits: value passed to mz_inflateInit2(); > 0 makes mz_inflate() parse a zlib header.
+  mz_uint m_dict_ofs, m_dict_avail, m_first_call, m_has_flushed; int m_window_bits;
+  mz_uint8 m_dict[TINFL_LZ_DICT_SIZE]; // output window used by the streaming path of mz_inflate()
+  tinfl_status m_last_status;          // status of the most recent tinfl_decompress() call
+} inflate_state;
+
+// zlib-style inflate initialization.
+// window_bits must be MZ_DEFAULT_WINDOW_BITS or its negation, otherwise MZ_PARAM_ERROR is returned.
+// Returns MZ_STREAM_ERROR for a NULL stream, MZ_MEM_ERROR when the state allocation fails, MZ_OK otherwise.
+// The stream's zalloc/zfree are defaulted when NULL, and the inflate state is allocated through zalloc.
+int mz_inflateInit2(mz_streamp pStream, int window_bits)
+{
+  inflate_state *pInfl;
+
+  if (!pStream)
+    return MZ_STREAM_ERROR;
+  if ((window_bits != MZ_DEFAULT_WINDOW_BITS) && (-window_bits != MZ_DEFAULT_WINDOW_BITS))
+    return MZ_PARAM_ERROR;
+
+  // Reset the caller-visible bookkeeping.
+  pStream->total_in = 0;
+  pStream->total_out = 0;
+  pStream->adler = 0;
+  pStream->data_type = 0;
+  pStream->reserved = 0;
+  pStream->msg = NULL;
+
+  // Fall back to the built-in allocator pair where the caller supplied none.
+  if (!pStream->zalloc)
+    pStream->zalloc = def_alloc_func;
+  if (!pStream->zfree)
+    pStream->zfree = def_free_func;
+
+  pInfl = (inflate_state*)pStream->zalloc(pStream->opaque, 1, sizeof(inflate_state));
+  if (!pInfl)
+    return MZ_MEM_ERROR;
+  pStream->state = (struct mz_internal_state *)pInfl;
+
+  tinfl_init(&pInfl->m_decomp);
+  pInfl->m_window_bits = window_bits;
+  pInfl->m_first_call = 1;
+  pInfl->m_has_flushed = 0;
+  pInfl->m_last_status = TINFL_STATUS_NEEDS_MORE_INPUT;
+  pInfl->m_dict_avail = 0;
+  pInfl->m_dict_ofs = 0;
+
+  return MZ_OK;
+}
+
+// Convenience initializer: forwards to mz_inflateInit2() with MZ_DEFAULT_WINDOW_BITS.
+int mz_inflateInit(mz_streamp pStream)
+{
+   return mz_inflateInit2(pStream, MZ_DEFAULT_WINDOW_BITS);
+}
+
+// zlib-style streaming inflate.
+// flush must be zero, MZ_SYNC_FLUSH, MZ_PARTIAL_FLUSH (treated as MZ_SYNC_FLUSH) or MZ_FINISH.
+// Returns:
+//   MZ_STREAM_ERROR - NULL stream/state, unsupported flush value, or a non-MZ_FINISH call after an earlier MZ_FINISH call.
+//   MZ_DATA_ERROR   - the decompressor reported a failure (now or on a previous call).
+//   MZ_BUF_ERROR    - no forward progress is possible with the buffers supplied.
+//   MZ_STREAM_END   - the stream is complete and all output has been delivered.
+//   MZ_OK           - progress was made and more calls are needed.
+// Two paths exist: MZ_FINISH on the very first call decompresses straight into the caller's output buffer;
+// every other case decompresses into the internal m_dict window and copies from there to the caller.
+int mz_inflate(mz_streamp pStream, int flush)
+{
+  inflate_state* pState;
+  mz_uint n, first_call, decomp_flags = TINFL_FLAG_COMPUTE_ADLER32;
+  size_t in_bytes, out_bytes, orig_avail_in;
+  tinfl_status status;
+
+  if ((!pStream) || (!pStream->state)) return MZ_STREAM_ERROR;
+  if (flush == MZ_PARTIAL_FLUSH) flush = MZ_SYNC_FLUSH;
+  if ((flush) && (flush != MZ_SYNC_FLUSH) && (flush != MZ_FINISH)) return MZ_STREAM_ERROR;
+
+  pState = (inflate_state*)pStream->state;
+  // A positive window_bits means the input carries a zlib header; a negative one means raw deflate.
+  if (pState->m_window_bits > 0) decomp_flags |= TINFL_FLAG_PARSE_ZLIB_HEADER;
+  orig_avail_in = pStream->avail_in;
+
+  first_call = pState->m_first_call; pState->m_first_call = 0;
+  if (pState->m_last_status < 0) return MZ_DATA_ERROR;
+
+  // Once MZ_FINISH has been used, every later call must use it too.
+  if (pState->m_has_flushed && (flush != MZ_FINISH)) return MZ_STREAM_ERROR;
+  pState->m_has_flushed |= (flush == MZ_FINISH);
+
+  if ((flush == MZ_FINISH) && (first_call))
+  {
+    // MZ_FINISH on the first call implies that the input and output buffers are large enough to hold the entire compressed/decompressed file.
+    decomp_flags |= TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF;
+    in_bytes = pStream->avail_in; out_bytes = pStream->avail_out;
+    status = tinfl_decompress(&pState->m_decomp, pStream->next_in, &in_bytes, pStream->next_out, pStream->next_out, &out_bytes, decomp_flags);
+    pState->m_last_status = status;
+    pStream->next_in += (mz_uint)in_bytes; pStream->avail_in -= (mz_uint)in_bytes; pStream->total_in += (mz_uint)in_bytes;
+    pStream->adler = tinfl_get_adler32(&pState->m_decomp);
+    pStream->next_out += (mz_uint)out_bytes; pStream->avail_out -= (mz_uint)out_bytes; pStream->total_out += (mz_uint)out_bytes;
+
+    if (status < 0)
+      return MZ_DATA_ERROR;
+    else if (status != TINFL_STATUS_DONE)
+    {
+      // The single-shot attempt did not complete; mark the state failed so later calls return MZ_DATA_ERROR.
+      pState->m_last_status = TINFL_STATUS_FAILED;
+      return MZ_BUF_ERROR;
+    }
+    return MZ_STREAM_END;
+  }
+  // If flush != MZ_FINISH then we must assume there's more input.
+  if (flush != MZ_FINISH) decomp_flags |= TINFL_FLAG_HAS_MORE_INPUT;
+
+  // Deliver output left over in the window from a previous call before decompressing anything new.
+  if (pState->m_dict_avail)
+  {
+    n = MZ_MIN(pState->m_dict_avail, pStream->avail_out);
+    memcpy(pStream->next_out, pState->m_dict + pState->m_dict_ofs, n);
+    pStream->next_out += n; pStream->avail_out -= n; pStream->total_out += n;
+    pState->m_dict_avail -= n; pState->m_dict_ofs = (pState->m_dict_ofs + n) & (TINFL_LZ_DICT_SIZE - 1);
+    return ((pState->m_last_status == TINFL_STATUS_DONE) && (!pState->m_dict_avail)) ? MZ_STREAM_END : MZ_OK;
+  }
+
+  for ( ; ; )
+  {
+    in_bytes = pStream->avail_in;
+    // Decompress up to the end of the window, starting at the current window offset.
+    out_bytes = TINFL_LZ_DICT_SIZE - pState->m_dict_ofs;
+
+    status = tinfl_decompress(&pState->m_decomp, pStream->next_in, &in_bytes, pState->m_dict, pState->m_dict + pState->m_dict_ofs, &out_bytes, decomp_flags);
+    pState->m_last_status = status;
+
+    pStream->next_in += (mz_uint)in_bytes; pStream->avail_in -= (mz_uint)in_bytes;
+    pStream->total_in += (mz_uint)in_bytes; pStream->adler = tinfl_get_adler32(&pState->m_decomp);
+
+    pState->m_dict_avail = (mz_uint)out_bytes;
+
+    // Copy as much of the freshly decompressed data as fits into the caller's buffer.
+    n = MZ_MIN(pState->m_dict_avail, pStream->avail_out);
+    memcpy(pStream->next_out, pState->m_dict + pState->m_dict_ofs, n);
+    pStream->next_out += n; pStream->avail_out -= n; pStream->total_out += n;
+    pState->m_dict_avail -= n; pState->m_dict_ofs = (pState->m_dict_ofs + n) & (TINFL_LZ_DICT_SIZE - 1);
+
+    if (status < 0)
+       return MZ_DATA_ERROR; // Stream is corrupted (there could be some uncompressed data left in the output dictionary - oh well).
+    else if ((status == TINFL_STATUS_NEEDS_MORE_INPUT) && (!orig_avail_in))
+      return MZ_BUF_ERROR; // Signal caller that we can't make forward progress without supplying more input or by setting flush to MZ_FINISH.
+    else if (flush == MZ_FINISH)
+    {
+       // The output buffer MUST be large enough to hold the remaining uncompressed data when flush==MZ_FINISH.
+       if (status == TINFL_STATUS_DONE)
+          return pState->m_dict_avail ? MZ_BUF_ERROR : MZ_STREAM_END;
+       // status here must be TINFL_STATUS_HAS_MORE_OUTPUT, which means there's at least 1 more byte on the way. If there's no more room left in the output buffer then something is wrong.
+       else if (!pStream->avail_out)
+          return MZ_BUF_ERROR;
+    }
+    else if ((status == TINFL_STATUS_DONE) || (!pStream->avail_in) || (!pStream->avail_out) || (pState->m_dict_avail))
+      break;
+  }
+
+  return ((status == TINFL_STATUS_DONE) && (!pState->m_dict_avail)) ? MZ_STREAM_END : MZ_OK;
+}
+
+// Releases the inflate state through the stream's zfree callback and clears the state pointer.
+// Returns MZ_STREAM_ERROR for a NULL stream, MZ_OK otherwise (including when there is no state to free).
+int mz_inflateEnd(mz_streamp pStream)
+{
+  if (!pStream) return MZ_STREAM_ERROR;
+  if (!pStream->state) return MZ_OK;
+
+  pStream->zfree(pStream->opaque, pStream->state);
+  pStream->state = NULL;
+  return MZ_OK;
+}
+
+// One-shot decompression of pSource[0..source_len) into pDest.
+// On entry *pDest_len is the capacity of pDest; on success it receives the number of bytes written.
+// Returns MZ_PARAM_ERROR when either length exceeds 32 bits, any error from mz_inflateInit(),
+// MZ_DATA_ERROR when the input ran out before the stream ended, another mz_inflate() status on failure,
+// or the result of mz_inflateEnd() on success.
+int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len, const unsigned char *pSource, mz_ulong source_len)
+{
+  mz_stream strm;
+  int rc;
+
+  memset(&strm, 0, sizeof(strm));
+
+  // In case mz_ulong is 64-bits (argh I hate longs).
+  if ((source_len | *pDest_len) > 0xFFFFFFFFU)
+    return MZ_PARAM_ERROR;
+
+  strm.next_out = pDest;
+  strm.avail_out = (mz_uint32)*pDest_len;
+  strm.next_in = pSource;
+  strm.avail_in = (mz_uint32)source_len;
+
+  rc = mz_inflateInit(&strm);
+  if (rc != MZ_OK)
+    return rc;
+
+  rc = mz_inflate(&strm, MZ_FINISH);
+  if (rc == MZ_STREAM_END)
+  {
+    *pDest_len = strm.total_out;
+    return mz_inflateEnd(&strm);
+  }
+
+  mz_inflateEnd(&strm);
+  // A buffer error with all input consumed means the compressed data was truncated.
+  if ((rc == MZ_BUF_ERROR) && (!strm.avail_in))
+    return MZ_DATA_ERROR;
+  return rc;
+}
+
+// Maps an MZ_* status code to a short description; returns NULL for codes that are not in the table.
+const char *mz_error(int err)
+{
+  static struct { int m_err; const char *m_pDesc; } s_error_descs[] =
+  {
+    { MZ_OK, "" },
+    { MZ_STREAM_END, "stream end" },
+    { MZ_NEED_DICT, "need dictionary" },
+    { MZ_ERRNO, "file error" },
+    { MZ_STREAM_ERROR, "stream error" },
+    { MZ_DATA_ERROR, "data error" },
+    { MZ_MEM_ERROR, "out of memory" },
+    { MZ_BUF_ERROR, "buf error" },
+    { MZ_VERSION_ERROR, "version error" },
+    { MZ_PARAM_ERROR, "parameter error" }
+  };
+  const mz_uint num_descs = (mz_uint)(sizeof(s_error_descs) / sizeof(s_error_descs[0]));
+  mz_uint idx = 0;
+
+  while (idx < num_descs)
+  {
+    if (s_error_descs[idx].m_err == err)
+      return s_error_descs[idx].m_pDesc;
+    ++idx;
+  }
+  return NULL;
+}
+
+#endif //MINIZ_NO_ZLIB_APIS
+
+// ------------------- Low-level Decompression (completely independent from all compression API's)
+
+// Memory primitives used by the decompressor; defined as macros so they can be redirected in one place.
+#define TINFL_MEMCPY(d, s, l) memcpy(d, s, l)
+#define TINFL_MEMSET(p, c, l) memset(p, c, l)
+
+// Resumable-function machinery for tinfl_decompress().
+// TINFL_CR_BEGIN opens a switch on r->m_state. TINFL_CR_RETURN records the result in 'status', stores state_index in r->m_state,
+// jumps to the 'common_exit' label, and plants 'case state_index:' right after the jump, so the next call (which re-enters the switch)
+// continues from that point. Each state_index must therefore be unique within the function.
+// TINFL_CR_RETURN_FOREVER keeps returning the same result on every later call. TINFL_CR_FINISH closes the switch.
+// These macros expect 'r', 'status' and the 'common_exit' label to exist in the enclosing function.
+#define TINFL_CR_BEGIN switch(r->m_state) { case 0:
+#define TINFL_CR_RETURN(state_index, result) do { status = result; r->m_state = state_index; goto common_exit; case state_index:; } MZ_MACRO_END
+#define TINFL_CR_RETURN_FOREVER(state_index, result) do { for ( ; ; ) { TINFL_CR_RETURN(state_index, result); } } MZ_MACRO_END
+#define TINFL_CR_FINISH }
+
+// TODO: If the caller has indicated that there's no more input, and we attempt to read beyond the input buf, then something is wrong with the input because the inflator never
+// reads ahead more than it needs to. Currently TINFL_GET_BYTE() pads the end of the stream with 0's in this scenario.
+// TINFL_GET_BYTE reads the next input byte into c. When the input buffer is empty it either suspends with TINFL_STATUS_NEEDS_MORE_INPUT
+// (if TINFL_FLAG_HAS_MORE_INPUT is set), retrying after every resume until a byte is available, or yields 0.
+// Uses pIn_buf_cur, pIn_buf_end and decomp_flags from the enclosing function.
+#define TINFL_GET_BYTE(state_index, c) do { \
+  if (pIn_buf_cur >= pIn_buf_end) { \
+    for ( ; ; ) { \
+      if (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT) { \
+        TINFL_CR_RETURN(state_index, TINFL_STATUS_NEEDS_MORE_INPUT); \
+        if (pIn_buf_cur < pIn_buf_end) { \
+          c = *pIn_buf_cur++; \
+          break; \
+        } \
+      } else { \
+        c = 0; \
+        break; \
+      } \
+    } \
+  } else c = *pIn_buf_cur++; } MZ_MACRO_END
+
+// Bit-buffer helpers operating on bit_buf/num_bits of the enclosing function.
+// TINFL_NEED_BITS appends whole input bytes above the bits already held until at least n bits are available.
+// TINFL_SKIP_BITS discards the low n bits; TINFL_GET_BITS stores the low n bits in b and then discards them.
+#define TINFL_NEED_BITS(state_index, n) do { mz_uint c; TINFL_GET_BYTE(state_index, c); bit_buf |= (((tinfl_bit_buf_t)c) << num_bits); num_bits += 8; } while (num_bits < (mz_uint)(n))
+#define TINFL_SKIP_BITS(state_index, n) do { if (num_bits < (mz_uint)(n)) { TINFL_NEED_BITS(state_index, n); } bit_buf >>= (n); num_bits -= (n); } MZ_MACRO_END
+#define TINFL_GET_BITS(state_index, b, n) do { if (num_bits < (mz_uint)(n)) { TINFL_NEED_BITS(state_index, n); } b = bit_buf & ((1 << (n)) - 1); bit_buf >>= (n); num_bits -= (n); } MZ_MACRO_END
+
+// TINFL_HUFF_BITBUF_FILL() is only used rarely, when the number of bytes remaining in the input buffer falls below 2.
+// It reads just enough bytes from the input stream that are needed to decode the next Huffman code (and absolutely no more). It works by trying to fully decode a
+// Huffman code by using whatever bits are currently present in the bit buffer. If this fails, it reads another byte, and tries again until it succeeds or until the
+// bit buffer contains >=15 bits (deflate's max. Huffman code size).
+// It relies on the locals temp, code_len and c that TINFL_HUFF_DECODE() declares, so it is only usable from inside that macro.
+#define TINFL_HUFF_BITBUF_FILL(state_index, pHuff) \
+  do { \
+    temp = (pHuff)->m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]; \
+    if (temp >= 0) { \
+      code_len = temp >> 9; \
+      if ((code_len) && (num_bits >= code_len)) \
+      break; \
+    } else if (num_bits > TINFL_FAST_LOOKUP_BITS) { \
+       code_len = TINFL_FAST_LOOKUP_BITS; \
+       do { \
+          temp = (pHuff)->m_tree[~temp + ((bit_buf >> code_len++) & 1)]; \
+       } while ((temp < 0) && (num_bits >= (code_len + 1))); if (temp >= 0) break; \
+    } TINFL_GET_BYTE(state_index, c); bit_buf |= (((tinfl_bit_buf_t)c) << num_bits); num_bits += 8; \
+  } while (num_bits < 15);
+
+// TINFL_HUFF_DECODE() decodes the next Huffman coded symbol. It's more complex than you would initially expect because the zlib API expects the decompressor to never read
+// beyond the final byte of the deflate stream. (In other words, when this macro wants to read another byte from the input, it REALLY needs another byte in order to fully
+// decode the next Huffman code.) Handling this properly is particularly important on raw deflate (non-zlib) streams, which aren't followed by a byte aligned adler-32.
+// The slow path is only executed at the very end of the input buffer.
+// Lookup encoding as used below: a non-negative m_look_up entry holds the code length in the bits above bit 8 and the symbol in the low 9 bits;
+// a negative entry is the starting point for a bit-by-bit walk through m_tree that continues until a non-negative (symbol) entry is reached.
+#define TINFL_HUFF_DECODE(state_index, sym, pHuff) do { \
+  int temp; mz_uint code_len, c; \
+  if (num_bits < 15) { \
+    if ((pIn_buf_end - pIn_buf_cur) < 2) { \
+       TINFL_HUFF_BITBUF_FILL(state_index, pHuff); \
+    } else { \
+       bit_buf |= (((tinfl_bit_buf_t)pIn_buf_cur[0]) << num_bits) | (((tinfl_bit_buf_t)pIn_buf_cur[1]) << (num_bits + 8)); pIn_buf_cur += 2; num_bits += 16; \
+    } \
+  } \
+  if ((temp = (pHuff)->m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0) \
+    code_len = temp >> 9, temp &= 511; \
+  else { \
+    code_len = TINFL_FAST_LOOKUP_BITS; do { temp = (pHuff)->m_tree[~temp + ((bit_buf >> code_len++) & 1)]; } while (temp < 0); \
+  } sym = temp; bit_buf >>= code_len; num_bits -= code_len; } MZ_MACRO_END
+
+tinfl_status tinfl_decompress(tinfl_decompressor *r, const mz_uint8 *pIn_buf_next, size_t *pIn_buf_size, mz_uint8 *pOut_buf_start, mz_uint8 *pOut_buf_next, size_t *pOut_buf_size, const mz_uint32 decomp_flags)
+{
+  static const int s_length_base[31] = { 3,4,5,6,7,8,9,10,11,13, 15,17,19,23,27,31,35,43,51,59, 67,83,99,115,131,163,195,227,258,0,0 };
+  static const int s_length_extra[31]= { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+  static const int s_dist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, 257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
+  static const int s_dist_extra[32] = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+  static const mz_uint8 s_length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 };
+  static const int s_min_table_sizes[3] = { 257, 1, 4 };
+
+  tinfl_status status = TINFL_STATUS_FAILED; mz_uint32 num_bits, dist, counter, num_extra; tinfl_bit_buf_t bit_buf;
+  const mz_uint8 *pIn_buf_cur = pIn_buf_next, *const pIn_buf_end = pIn_buf_next + *pIn_buf_size;
+  mz_uint8 *pOut_buf_cur = pOut_buf_next, *const pOut_buf_end = pOut_buf_next + *pOut_buf_size;
+  size_t out_buf_size_mask = (decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF) ? (size_t)-1 : ((pOut_buf_next - pOut_buf_start) + *pOut_buf_size) - 1, dist_from_out_buf_start;
+
+  // Ensure the output buffer's size is a power of 2, unless the output buffer is large enough to hold the entire output file (in which case it doesn't matter).
+  if (((out_buf_size_mask + 1) & out_buf_size_mask) || (pOut_buf_next < pOut_buf_start)) { *pIn_buf_size = *pOut_buf_size = 0; return TINFL_STATUS_BAD_PARAM; }
+
+  num_bits = r->m_num_bits; bit_buf = r->m_bit_buf; dist = r->m_dist; counter = r->m_counter; num_extra = r->m_num_extra; dist_from_out_buf_start = r->m_dist_from_out_buf_start;
+  TINFL_CR_BEGIN
+
+  bit_buf = num_bits = dist = counter = num_extra = r->m_zhdr0 = r->m_zhdr1 = 0; r->m_z_adler32 = r->m_check_adler32 = 1;
+  if (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER)
+  {
+    TINFL_GET_BYTE(1, r->m_zhdr0); TINFL_GET_BYTE(2, r->m_zhdr1);
+    counter = (((r->m_zhdr0 * 256 + r->m_zhdr1) % 31 != 0) || (r->m_zhdr1 & 32) || ((r->m_zhdr0 & 15) != 8));
+    if (!(decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)) counter |= (((1U << (8U + (r->m_zhdr0 >> 4))) > 32768U) || ((out_buf_size_mask + 1) < (size_t)(1ULL << (8U + (r->m_zhdr0 >> 4)))));
+    if (counter) { TINFL_CR_RETURN_FOREVER(36, TINFL_STATUS_FAILED); }
+  }
+
+  do
+  {
+    TINFL_GET_BITS(3, r->m_final, 3); r->m_type = r->m_final >> 1;
+    if (r->m_type == 0)
+    {
+      TINFL_SKIP_BITS(5, num_bits & 7);
+      for (counter = 0; counter < 4; ++counter) { if (num_bits) TINFL_GET_BITS(6, r->m_raw_header[counter], 8); else TINFL_GET_BYTE(7, r->m_raw_header[counter]); }
+      if ((counter = (r->m_raw_header[0] | (r->m_raw_header[1] << 8))) != (mz_uint)(0xFFFF ^ (r->m_raw_header[2] | (r->m_raw_header[3] << 8)))) { TINFL_CR_RETURN_FOREVER(39, TINFL_STATUS_FAILED); }
+      while ((counter) && (num_bits))
+      {
+        TINFL_GET_BITS(51, dist, 8);
+        while (pOut_buf_cur >= pOut_buf_end) { TINFL_CR_RETURN(52, TINFL_STATUS_HAS_MORE_OUTPUT); }
+        *pOut_buf_cur++ = (mz_uint8)dist;
+        counter--;
+      }
+      while (counter)
+      {
+        size_t n; while (pOut_buf_cur >= pOut_buf_end) { TINFL_CR_RETURN(9, TINFL_STATUS_HAS_MORE_OUTPUT); }
+        while (pIn_buf_cur >= pIn_buf_end)
+        {
+          if (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT)
+          {
+            TINFL_CR_RETURN(38, TINFL_STATUS_NEEDS_MORE_INPUT);
+          }
+          else
+          {
+            TINFL_CR_RETURN_FOREVER(40, TINFL_STATUS_FAILED);
+          }
+        }
+        n = MZ_MIN(MZ_MIN((size_t)(pOut_buf_end - pOut_buf_cur), (size_t)(pIn_buf_end - pIn_buf_cur)), counter);
+        TINFL_MEMCPY(pOut_buf_cur, pIn_buf_cur, n); pIn_buf_cur += n; pOut_buf_cur += n; counter -= (mz_uint)n;
+      }
+    }
+    else if (r->m_type == 3)
+    {
+      TINFL_CR_RETURN_FOREVER(10, TINFL_STATUS_FAILED);
+    }
+    else
+    {
+      if (r->m_type == 1)
+      {
+        mz_uint8 *p = r->m_tables[0].m_code_size; mz_uint i;
+        r->m_table_sizes[0] = 288; r->m_table_sizes[1] = 32; TINFL_MEMSET(r->m_tables[1].m_code_size, 5, 32);
+        for ( i = 0; i <= 143; ++i) *p++ = 8; for ( ; i <= 255; ++i) *p++ = 9; for ( ; i <= 279; ++i) *p++ = 7; for ( ; i <= 287; ++i) *p++ = 8;
+      }
+      else
+      {
+        for (counter = 0; counter < 3; counter++) { TINFL_GET_BITS(11, r->m_table_sizes[counter], "\05\05\04"[counter]); r->m_table_sizes[counter] += s_min_table_sizes[counter]; }
+        MZ_CLEAR_OBJ(r->m_tables[2].m_code_size); for (counter = 0; counter < r->m_table_sizes[2]; counter++) { mz_uint s; TINFL_GET_BITS(14, s, 3); r->m_tables[2].m_code_size[s_length_dezigzag[counter]] = (mz_uint8)s; }
+        r->m_table_sizes[2] = 19;
+      }
+      for ( ; (int)r->m_type >= 0; r->m_type--)
+      {
+        int tree_next, tree_cur; tinfl_huff_table *pTable;
+        mz_uint i, j, used_syms, total, sym_index, next_code[17], total_syms[16]; pTable = &r->m_tables[r->m_type]; MZ_CLEAR_OBJ(total_syms); MZ_CLEAR_OBJ(pTable->m_look_up); MZ_CLEAR_OBJ(pTable->m_tree);
+        for (i = 0; i < r->m_table_sizes[r->m_type]; ++i) total_syms[pTable->m_code_size[i]]++;
+        used_syms = 0, total = 0; next_code[0] = next_code[1] = 0;
+        for (i = 1; i <= 15; ++i) { used_syms += total_syms[i]; next_code[i + 1] = (total = ((total + total_syms[i]) << 1)); }
+        if ((65536 != total) && (used_syms > 1))
+        {
+          TINFL_CR_RETURN_FOREVER(35, TINFL_STATUS_FAILED);
+        }
+        for (tree_next = -1, sym_index = 0; sym_index < r->m_table_sizes[r->m_type]; ++sym_index)
+        {
+          mz_uint rev_code = 0, l, cur_code, code_size = pTable->m_code_size[sym_index]; if (!code_size) continue;
+          cur_code = next_code[code_size]++; for (l = code_size; l > 0; l--, cur_code >>= 1) rev_code = (rev_code << 1) | (cur_code & 1);
+          if (code_size <= TINFL_FAST_LOOKUP_BITS) { mz_int16 k = (mz_int16)((code_size << 9) | sym_index); while (rev_code < TINFL_FAST_LOOKUP_SIZE) { pTable->m_look_up[rev_code] = k; rev_code += (1 << code_size); } continue; }
+          if (0 == (tree_cur = pTable->m_look_up[rev_code & (TINFL_FAST_LOOKUP_SIZE - 1)])) { pTable->m_look_up[rev_code & (TINFL_FAST_LOOKUP_SIZE - 1)] = (mz_int16)tree_next; tree_cur = tree_next; tree_next -= 2; }
+          rev_code >>= (TINFL_FAST_LOOKUP_BITS - 1);
+          for (j = code_size; j > (TINFL_FAST_LOOKUP_BITS + 1); j--)
+          {
+            tree_cur -= ((rev_code >>= 1) & 1);
+            if (!pTable->m_tree[-tree_cur - 1]) { pTable->m_tree[-tree_cur - 1] = (mz_int16)tree_next; tree_cur = tree_next; tree_next -= 2; } else tree_cur = pTable->m_tree[-tree_cur - 1];
+          }
+          tree_cur -= ((rev_code >>= 1) & 1); pTable->m_tree[-tree_cur - 1] = (mz_int16)sym_index;
+        }
+        if (r->m_type == 2)
+        {
+          for (counter = 0; counter < (r->m_table_sizes[0] + r->m_table_sizes[1]); )
+          {
+            mz_uint s; TINFL_HUFF_DECODE(16, dist, &r->m_tables[2]); if (dist < 16) { r->m_len_codes[counter++] = (mz_uint8)dist; continue; }
+            if ((dist == 16) && (!counter))
+            {
+              TINFL_CR_RETURN_FOREVER(17, TINFL_STATUS_FAILED);
+            }
+            num_extra = "\02\03\07"[dist - 16]; TINFL_GET_BITS(18, s, num_extra); s += "\03\03\013"[dist - 16];
+            TINFL_MEMSET(r->m_len_codes + counter, (dist == 16) ? r->m_len_codes[counter - 1] : 0, s); counter += s;
+          }
+          if ((r->m_table_sizes[0] + r->m_table_sizes[1]) != counter)
+          {
+            TINFL_CR_RETURN_FOREVER(21, TINFL_STATUS_FAILED);
+          }
+          TINFL_MEMCPY(r->m_tables[0].m_code_size, r->m_len_codes, r->m_table_sizes[0]); TINFL_MEMCPY(r->m_tables[1].m_code_size, r->m_len_codes + r->m_table_sizes[0], r->m_table_sizes[1]);
+        }
+      }
+      for ( ; ; )
+      {
+        mz_uint8 *pSrc;
+        for ( ; ; )
+        {
+          if (((pIn_buf_end - pIn_buf_cur) < 4) || ((pOut_buf_end - pOut_buf_cur) < 2))
+          {
+            TINFL_HUFF_DECODE(23, counter, &r->m_tables[0]);
+            if (counter >= 256)
+              break;
+            while (pOut_buf_cur >= pOut_buf_end) { TINFL_CR_RETURN(24, TINFL_STATUS_HAS_MORE_OUTPUT); }
+            *pOut_buf_cur++ = (mz_uint8)counter;
+          }
+          else
+          {
+            int sym2; mz_uint code_len;
+#if TINFL_USE_64BIT_BITBUF
+            if (num_bits < 30) { bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE32(pIn_buf_cur)) << num_bits); pIn_buf_cur += 4; num_bits += 32; }
+#else
+            if (num_bits < 15) { bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE16(pIn_buf_cur)) << num_bits); pIn_buf_cur += 2; num_bits += 16; }
+#endif
+            if ((sym2 = r->m_tables[0].m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0)
+              code_len = sym2 >> 9;
+            else
+            {
+              code_len = TINFL_FAST_LOOKUP_BITS; do { sym2 = r->m_tables[0].m_tree[~sym2 + ((bit_buf >> code_len++) & 1)]; } while (sym2 < 0);
+            }
+            counter = sym2; bit_buf >>= code_len; num_bits -= code_len;
+            if (counter & 256)
+              break;
+
+#if !TINFL_USE_64BIT_BITBUF
+            if (num_bits < 15) { bit_buf |= (((tinfl_bit_buf_t)MZ_READ_LE16(pIn_buf_cur)) << num_bits); pIn_buf_cur += 2; num_bits += 16; }
+#endif
+            if ((sym2 = r->m_tables[0].m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= 0)
+              code_len = sym2 >> 9;
+            else
+            {
+              code_len = TINFL_FAST_LOOKUP_BITS; do { sym2 = r->m_tables[0].m_tree[~sym2 + ((bit_buf >> code_len++) & 1)]; } while (sym2 < 0);
+            }
+            bit_buf >>= code_len; num_bits -= code_len;
+
+            pOut_buf_cur[0] = (mz_uint8)counter;
+            if (sym2 & 256)
+            {
+              pOut_buf_cur++;
+              counter = sym2;
+              break;
+            }
+            pOut_buf_cur[1] = (mz_uint8)sym2;
+            pOut_buf_cur += 2;
+          }
+        }
+        if ((counter &= 511) == 256) break;
+
+        num_extra = s_length_extra[counter - 257]; counter = s_length_base[counter - 257];
+        if (num_extra) { mz_uint extra_bits; TINFL_GET_BITS(25, extra_bits, num_extra); counter += extra_bits; }
+
+        TINFL_HUFF_DECODE(26, dist, &r->m_tables[1]);
+        num_extra = s_dist_extra[dist]; dist = s_dist_base[dist];
+        if (num_extra) { mz_uint extra_bits; TINFL_GET_BITS(27, extra_bits, num_extra); dist += extra_bits; }
+
+        dist_from_out_buf_start = pOut_buf_cur - pOut_buf_start;
+        if ((dist > dist_from_out_buf_start) && (decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF))
+        {
+          TINFL_CR_RETURN_FOREVER(37, TINFL_STATUS_FAILED);
+        }
+
+        pSrc = pOut_buf_start + ((dist_from_out_buf_start - dist) & out_buf_size_mask);
+
+        if ((MZ_MAX(pOut_buf_cur, pSrc) + counter) > pOut_buf_end)
+        {
+          while (counter--)
+          {
+            while (pOut_buf_cur >= pOut_buf_end) { TINFL_CR_RETURN(53, TINFL_STATUS_HAS_MORE_OUTPUT); }
+            *pOut_buf_cur++ = pOut_buf_start[(dist_from_out_buf_start++ - dist) & out_buf_size_mask];
+          }
+          continue;
+        }
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES
+        else if ((counter >= 9) && (counter <= dist))
+        {
+          const mz_uint8 *pSrc_end = pSrc + (counter & ~7);
+          do
+          {
+            ((mz_uint32 *)pOut_buf_cur)[0] = ((const mz_uint32 *)pSrc)[0];
+            ((mz_uint32 *)pOut_buf_cur)[1] = ((const mz_uint32 *)pSrc)[1];
+            pOut_buf_cur += 8;
+          } while ((pSrc += 8) < pSrc_end);
+          if ((counter &= 7) < 3)
+          {
+            if (counter)
+            {
+              pOut_buf_cur[0] = pSrc[0];
+              if (counter > 1)
+                pOut_buf_cur[1] = pSrc[1];
+              pOut_buf_cur += counter;
+            }
+            continue;
+          }
+        }
+#endif
+        do
+        {
+          pOut_buf_cur[0] = pSrc[0];
+          pOut_buf_cur[1] = pSrc[1];
+          pOut_buf_cur[2] = pSrc[2];
+          pOut_buf_cur += 3; pSrc += 3;
+        } while ((int)(counter -= 3) > 2);
+        if ((int)counter > 0)
+        {
+          pOut_buf_cur[0] = pSrc[0];
+          if ((int)counter > 1)
+            pOut_buf_cur[1] = pSrc[1];
+          pOut_buf_cur += counter;
+        }
+      }
+    }
+  } while (!(r->m_final & 1));
+  if (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER)
+  {
+    TINFL_SKIP_BITS(32, num_bits & 7); for (counter = 0; counter < 4; ++counter) { mz_uint s; if (num_bits) TINFL_GET_BITS(41, s, 8); else TINFL_GET_BYTE(42, s); r->m_z_adler32 = (r->m_z_adler32 << 8) | s; }
+  }
+  TINFL_CR_RETURN_FOREVER(34, TINFL_STATUS_DONE);
+  TINFL_CR_FINISH
+
+common_exit:
+  r->m_num_bits = num_bits; r->m_bit_buf = bit_buf; r->m_dist = dist; r->m_counter = counter; r->m_num_extra = num_extra; r->m_dist_from_out_buf_start = dist_from_out_buf_start;
+  *pIn_buf_size = pIn_buf_cur - pIn_buf_next; *pOut_buf_size = pOut_buf_cur - pOut_buf_next;
+  if ((decomp_flags & (TINFL_FLAG_PARSE_ZLIB_HEADER | TINFL_FLAG_COMPUTE_ADLER32)) && (status >= 0))
+  {
+    const mz_uint8 *ptr = pOut_buf_next; size_t buf_len = *pOut_buf_size;
+    mz_uint32 i, s1 = r->m_check_adler32 & 0xffff, s2 = r->m_check_adler32 >> 16; size_t block_len = buf_len % 5552;
+    while (buf_len)
+    {
+      for (i = 0; i + 7 < block_len; i += 8, ptr += 8)
+      {
+        s1 += ptr[0], s2 += s1; s1 += ptr[1], s2 += s1; s1 += ptr[2], s2 += s1; s1 += ptr[3], s2 += s1;
+        s1 += ptr[4], s2 += s1; s1 += ptr[5], s2 += s1; s1 += ptr[6], s2 += s1; s1 += ptr[7], s2 += s1;
+      }
+      for ( ; i < block_len; ++i) s1 += *ptr++, s2 += s1;
+      s1 %= 65521U, s2 %= 65521U; buf_len -= block_len; block_len = 5552;
+    }
+    r->m_check_adler32 = (s2 << 16) + s1; if ((status == TINFL_STATUS_DONE) && (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER) && (r->m_check_adler32 != r->m_z_adler32)) status = TINFL_STATUS_ADLER32_MISMATCH;
+  }
+  return status;
+}
+
+// Higher level helper functions.
+void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags)
+{
+  // Inflates the whole of pSrc_buf into a heap block that is enlarged on demand.
+  // Success: returns the block and stores the decompressed byte count in *pOut_len.
+  // Failure (negative status, TINFL_STATUS_NEEDS_MORE_INPUT, or MZ_REALLOC failure): frees the block, zeroes *pOut_len, returns NULL.
+  const int one_shot_flags = (flags & ~TINFL_FLAG_HAS_MORE_INPUT) | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF;
+  tinfl_decompressor inflator; void *pHeap = NULL; size_t consumed = 0, capacity = 0;
+  *pOut_len = 0;
+  tinfl_init(&inflator);
+  for ( ; ; )
+  {
+    size_t in_avail = src_buf_len - consumed, out_avail = capacity - *pOut_len, grown_capacity; void *pGrown;
+    const mz_uint8 *pIn = (const mz_uint8*)pSrc_buf + consumed;
+    mz_uint8 *pOut = pHeap ? (mz_uint8*)pHeap + *pOut_len : NULL;
+    tinfl_status status = tinfl_decompress(&inflator, pIn, &in_avail, (mz_uint8*)pHeap, pOut, &out_avail, one_shot_flags);
+    if ((status < 0) || (status == TINFL_STATUS_NEEDS_MORE_INPUT)) break;
+    consumed += in_avail;
+    *pOut_len += out_avail;
+    if (status == TINFL_STATUS_DONE) return pHeap;
+    // Not finished yet: double the capacity (never below 128 bytes) and call the decompressor again.
+    grown_capacity = (capacity * 2 < 128) ? 128 : capacity * 2;
+    pGrown = MZ_REALLOC(pHeap, grown_capacity);
+    if (!pGrown) break;
+    pHeap = pGrown; capacity = grown_capacity;
+  }
+  MZ_FREE(pHeap); *pOut_len = 0; return NULL;
+}
+
+size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags)
+{ // Single-call inflate of pSrc_buf into the caller's pOut_buf: returns the number of bytes written, or TINFL_DECOMPRESS_MEM_TO_MEM_FAILED unless the call ends with TINFL_STATUS_DONE.
+  const int one_shot_flags = (flags & ~TINFL_FLAG_HAS_MORE_INPUT) | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF; tinfl_decompressor inflator; tinfl_init(&inflator);
+  if (tinfl_decompress(&inflator, (const mz_uint8*)pSrc_buf, &src_buf_len, (mz_uint8*)pOut_buf, (mz_uint8*)pOut_buf, &out_buf_len, one_shot_flags) == TINFL_STATUS_DONE) return out_buf_len;
+  return TINFL_DECOMPRESS_MEM_TO_MEM_FAILED;
+}
+
+int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size, tinfl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags)
+{ // Inflates pIn_buf through a heap-allocated TINFL_LZ_DICT_SIZE-byte wrapping window, passing each decoded run to pPut_buf_func. Returns 1 only when the stream ends with TINFL_STATUS_DONE, otherwise 0.
+  int result = 0;
+  tinfl_decompressor decomp;
+  mz_uint8 *pDict = (mz_uint8*)MZ_MALLOC(TINFL_LZ_DICT_SIZE); size_t in_buf_ofs = 0, dict_ofs = 0;
+  if (!pDict)
+    return 0; // allocation failure uses the same 0 "failed" result as every other failure path, not a tinfl_status code; *pIn_buf_size is left untouched here
+  tinfl_init(&decomp);
+  for ( ; ; )
+  {
+    size_t in_buf_size = *pIn_buf_size - in_buf_ofs, dst_buf_size = TINFL_LZ_DICT_SIZE - dict_ofs; // remaining input / room left before the window wraps
+    tinfl_status status = tinfl_decompress(&decomp, (const mz_uint8*)pIn_buf + in_buf_ofs, &in_buf_size, pDict, pDict + dict_ofs, &dst_buf_size,
+      (flags & ~(TINFL_FLAG_HAS_MORE_INPUT | TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)));
+    in_buf_ofs += in_buf_size;
+    if ((dst_buf_size) && (!(*pPut_buf_func)(pDict + dict_ofs, (int)dst_buf_size, pPut_buf_user))) // a zero return from the callback aborts; result stays 0
+      break;
+    if (status != TINFL_STATUS_HAS_MORE_OUTPUT)
+    {
+      result = (status == TINFL_STATUS_DONE);
+      break;
+    }
+    dict_ofs = (dict_ofs + dst_buf_size) & (TINFL_LZ_DICT_SIZE - 1); // advance the write position, wrapping inside the window
+  }
+  MZ_FREE(pDict);
+  *pIn_buf_size = in_buf_ofs; // report how many input bytes were consumed
+  return result;
+}
+
+// ------------------- Low-level Compression (independent from all decompression API's)
+
+// Purposely making these tables static for faster init and thread safety.
+static const mz_uint16 s_tdefl_len_sym[256] = { // indexed by the match-length byte stored in the LZ code buffer; gives the symbol (257..285) looked up in Huffman table 0
+  257,258,259,260,261,262,263,264,265,265,266,266,267,267,268,268,269,269,269,269,270,270,270,270,271,271,271,271,272,272,272,272,
+  273,273,273,273,273,273,273,273,274,274,274,274,274,274,274,274,275,275,275,275,275,275,275,275,276,276,276,276,276,276,276,276,
+  277,277,277,277,277,277,277,277,277,277,277,277,277,277,277,277,278,278,278,278,278,278,278,278,278,278,278,278,278,278,278,278,
+  279,279,279,279,279,279,279,279,279,279,279,279,279,279,279,279,280,280,280,280,280,280,280,280,280,280,280,280,280,280,280,280,
+  281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,281,
+  282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,282,
+  283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,283,
+  284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,284,285 };
+
+static const mz_uint8 s_tdefl_len_extra[256] = { // indexed by the stored match-length byte; number of extra bits written after the length symbol
+  0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+  4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,0 };
+
+static const mz_uint8 s_tdefl_small_dist_sym[512] = { // distance symbol for stored match distances below 512, indexed directly by the stored distance
+  0,1,2,3,4,4,5,5,6,6,6,6,7,7,7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,
+  11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,
+  13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,
+  14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,
+  14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,
+  15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,16,16,16,16,
+  16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
+  16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,
+  16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+  17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+  17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,
+  17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17 };
+
+static const mz_uint8 s_tdefl_small_dist_extra[512] = { // number of extra bits written after the distance symbol, for stored match distances below 512
+  0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,
+  5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+  6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+  7,7,7,7,7,7,7,7 };
+
+static const mz_uint8 s_tdefl_large_dist_sym[128] = { // distance symbol for stored match distances of 512 and above, indexed by (stored distance >> 8)
+  0,0,18,19,20,20,21,21,22,22,22,22,23,23,23,23,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,26,26,26,26,
+  26,26,26,26,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,
+  28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29 };
+
+static const mz_uint8 s_tdefl_large_dist_extra[128] = { // number of extra bits written after the distance symbol, indexed by (stored distance >> 8) for distances of 512 and above
+  0,0,8,8,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,
+  12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+  13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13 };
+
+// Radix sorts tdefl_sym_freq[] array by 16-bit key m_key. Returns ptr to sorted values.
+typedef struct { mz_uint16 m_key, m_sym_index; } tdefl_sym_freq;
+static tdefl_sym_freq* tdefl_radix_sort_syms(mz_uint num_syms, tdefl_sym_freq* pSyms0, tdefl_sym_freq* pSyms1)
+{ // Byte-wise counting sort on m_key, low byte first, bouncing between the two caller buffers; returns whichever buffer holds the final ordering.
+  mz_uint32 num_passes = 2, shift = 0, pass, n, counts[256 * 2]; tdefl_sym_freq *pSrc = pSyms0, *pDst = pSyms1; MZ_CLEAR_OBJ(counts);
+  for (n = 0; n < num_syms; n++) { const mz_uint key = pSyms0[n].m_key; counts[key & 0xFF]++; counts[256 + ((key >> 8) & 0xFF)]++; }
+  while ((num_passes > 1) && (counts[(num_passes - 1) * 256] == num_syms)) num_passes--; // the high-byte pass is dropped when every key has a zero high byte
+  for (pass = 0; pass < num_passes; pass++, shift += 8)
+  {
+    const mz_uint32* pCounts = &counts[pass << 8];
+    mz_uint start[256], running = 0; tdefl_sym_freq* pSwap;
+    for (n = 0; n < 256; n++) { start[n] = running; running += pCounts[n]; }
+    for (n = 0; n < num_syms; n++) { const mz_uint bucket = (pSrc[n].m_key >> shift) & 0xFF; pDst[start[bucket]++] = pSrc[n]; }
+    pSwap = pSrc; pSrc = pDst; pDst = pSwap;
+  }
+  return pSrc;
+}
+
+// tdefl_calculate_minimum_redundancy() originally written by: Alistair Moffat, alistair@cs.mu.oz.au, Jyrki Katajainen, jyrki@diku.dk, November 1996.
+static void tdefl_calculate_minimum_redundancy(tdefl_sym_freq *A, int n) // In place: replaces the n m_key values with per-entry depths (the caller in this file tallies them as code sizes). Presumably requires A sorted ascending by m_key; the caller radix-sorts first.
+{
+  int root, leaf, next, avbl, used, dpth;
+  if (n==0) return; else if (n==1) { A[0].m_key = 1; return; } // nothing to do for 0 entries; a lone entry gets key 1
+  A[0].m_key += A[1].m_key; root = 0; leaf = 2; // first internal node = sum of the two lowest keys
+  for (next=1; next < n-1; next++) // phase 1: build internal node A[next] from the two smallest remaining weights (next unused leaf or oldest unconsumed internal node)
+  {
+    if (leaf>=n || A[root].m_key<A[leaf].m_key) { A[next].m_key = A[root].m_key; A[root++].m_key = (mz_uint16)next; } else A[next].m_key = A[leaf++].m_key; // a consumed internal node has its key overwritten with the index of its parent
+    if (leaf>=n || (root<next && A[root].m_key<A[leaf].m_key)) { A[next].m_key = (mz_uint16)(A[next].m_key + A[root].m_key); A[root++].m_key = (mz_uint16)next; } else A[next].m_key = (mz_uint16)(A[next].m_key + A[leaf++].m_key);
+  }
+  A[n-2].m_key = 0; for (next=n-3; next>=0; next--) A[next].m_key = A[A[next].m_key].m_key+1; // phase 2: turn parent indices into depths (A[n-2] is depth 0, each node is its parent's depth + 1)
+  avbl = 1; used = dpth = 0; root = n-2; next = n-1;
+  while (avbl>0) // phase 3: walk depth by depth, writing the depth into A[next] from the end of the array backwards
+  {
+    while (root>=0 && (int)A[root].m_key==dpth) { used++; root--; } // count internal nodes sitting at this depth
+    while (avbl>used) { A[next--].m_key = (mz_uint16)(dpth); avbl--; } // slots not taken by internal nodes are assigned this depth
+    avbl = 2*used; dpth++; used = 0; // each internal node opens two slots one level deeper
+  }
+}
+
+// Limits canonical Huffman code table's max code size.
+enum { TDEFL_MAX_SUPPORTED_HUFF_CODESIZE = 32 };
+static void tdefl_huffman_enforce_max_code_size(int *pNum_codes, int code_list_len, int max_code_size)
+{ // pNum_codes[n] = number of codes of size n. Folds every size above max_code_size into max_code_size, then rebalances until the weighted total equals 1 << max_code_size.
+  int len; mz_uint32 weighted_total = 0;
+  if (code_list_len <= 1) return;
+  for (len = TDEFL_MAX_SUPPORTED_HUFF_CODESIZE; len > max_code_size; len--) pNum_codes[max_code_size] += pNum_codes[len];
+  for (len = 1; len <= max_code_size; len++) weighted_total += (((mz_uint32)pNum_codes[len]) << (max_code_size - len));
+  for ( ; weighted_total != (1UL << max_code_size); weighted_total--)
+  {
+    pNum_codes[max_code_size]--; // drop one longest code, then lengthen one code of the longest shorter size in use into two codes
+    for (len = max_code_size - 1; len > 0; len--) { if (!pNum_codes[len]) continue; pNum_codes[len]--; pNum_codes[len + 1] += 2; break; }
+  }
+}
+
+static void tdefl_optimize_huffman_table(tdefl_compressor *d, int table_num, int table_len, int code_size_limit, int static_table) // Fills d->m_huff_codes[table_num] for the first table_len symbols; unless static_table is set it first derives d->m_huff_code_sizes[table_num] from d->m_huff_count[table_num].
+{
+  int i, j, l, num_codes[1 + TDEFL_MAX_SUPPORTED_HUFF_CODESIZE]; mz_uint next_code[TDEFL_MAX_SUPPORTED_HUFF_CODESIZE + 1]; MZ_CLEAR_OBJ(num_codes); // num_codes[n] = how many symbols have code size n
+  if (static_table)
+  {
+    for (i = 0; i < table_len; i++) num_codes[d->m_huff_code_sizes[table_num][i]]++; // code sizes were preset by the caller: only tally them
+  }
+  else
+  {
+    tdefl_sym_freq syms0[TDEFL_MAX_HUFF_SYMBOLS], syms1[TDEFL_MAX_HUFF_SYMBOLS], *pSyms;
+    int num_used_syms = 0;
+    const mz_uint16 *pSym_count = &d->m_huff_count[table_num][0];
+    for (i = 0; i < table_len; i++) if (pSym_count[i]) { syms0[num_used_syms].m_key = (mz_uint16)pSym_count[i]; syms0[num_used_syms++].m_sym_index = (mz_uint16)i; } // gather only symbols with a non-zero count
+
+    pSyms = tdefl_radix_sort_syms(num_used_syms, syms0, syms1); tdefl_calculate_minimum_redundancy(pSyms, num_used_syms); // sort by count, then replace each count with a code size
+
+    for (i = 0; i < num_used_syms; i++) num_codes[pSyms[i].m_key]++;
+
+    tdefl_huffman_enforce_max_code_size(num_codes, num_used_syms, code_size_limit); // cap code sizes at code_size_limit
+
+    MZ_CLEAR_OBJ(d->m_huff_code_sizes[table_num]); MZ_CLEAR_OBJ(d->m_huff_codes[table_num]);
+    for (i = 1, j = num_used_syms; i <= code_size_limit; i++) // hand out code sizes from 1 upward while walking pSyms from its last entry to its first
+      for (l = num_codes[i]; l > 0; l--) d->m_huff_code_sizes[table_num][pSyms[--j].m_sym_index] = (mz_uint8)(i);
+  }
+
+  next_code[1] = 0; for (j = 0, i = 2; i <= code_size_limit; i++) next_code[i] = j = ((j + num_codes[i - 1]) << 1); // first code value for each code size
+
+  for (i = 0; i < table_len; i++)
+  {
+    mz_uint rev_code = 0, code, code_size; if ((code_size = d->m_huff_code_sizes[table_num][i]) == 0) continue; // symbols with code size 0 get no code
+    code = next_code[code_size]++; for (l = code_size; l > 0; l--, code >>= 1) rev_code = (rev_code << 1) | (code & 1); // take the next code of this size and reverse its bit order
+    d->m_huff_codes[table_num][i] = (mz_uint16)rev_code;
+  }
+}
+
+#define TDEFL_PUT_BITS(b, l) do { \
+  mz_uint bits = b; mz_uint len = l; MZ_ASSERT(bits <= ((1U << len) - 1U)); \
+  d->m_bit_buffer |= (bits << d->m_bits_in); d->m_bits_in += len; \
+  /* Drain whole bytes: a byte is stored only while output room remains, but the bit buffer is advanced either way. */ \
+  while (d->m_bits_in >= 8) { \
+    if (d->m_pOutput_buf < d->m_pOutput_buf_end) { *d->m_pOutput_buf++ = (mz_uint8)(d->m_bit_buffer); } \
+    d->m_bit_buffer >>= 8; \
+    d->m_bits_in -= 8; \
+  } \
+} MZ_MACRO_END
+
+#define TDEFL_RLE_PREV_CODE_SIZE() { if (rle_repeat_count) { /* flush a pending run of rle_repeat_count repeats of prev_code_size */ \
+  if (rle_repeat_count < 3) { /* short run: emit the code size itself once per repeat */ \
+    d->m_huff_count[2][prev_code_size] = (mz_uint16)(d->m_huff_count[2][prev_code_size] + rle_repeat_count); \
+    while (rle_repeat_count--) packed_code_sizes[num_packed_code_sizes++] = prev_code_size; \
+  } else { /* otherwise emit symbol 16 followed by (count - 3) */ \
+    d->m_huff_count[2][16] = (mz_uint16)(d->m_huff_count[2][16] + 1); packed_code_sizes[num_packed_code_sizes++] = 16; packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_repeat_count - 3); \
+} rle_repeat_count = 0; } }
+
+#define TDEFL_RLE_ZERO_CODE_SIZE() { if (rle_z_count) { /* flush a pending run of rle_z_count zero code sizes */ \
+  if (rle_z_count < 3) { /* short run: emit literal zeros */ \
+    d->m_huff_count[2][0] = (mz_uint16)(d->m_huff_count[2][0] + rle_z_count); while (rle_z_count--) packed_code_sizes[num_packed_code_sizes++] = 0; \
+  } else if (rle_z_count <= 10) { /* 3..10 zeros: symbol 17 followed by (count - 3) */ \
+    d->m_huff_count[2][17] = (mz_uint16)(d->m_huff_count[2][17] + 1); packed_code_sizes[num_packed_code_sizes++] = 17; packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_z_count - 3); \
+  } else { /* 11 or more zeros: symbol 18 followed by (count - 11) */ \
+    d->m_huff_count[2][18] = (mz_uint16)(d->m_huff_count[2][18] + 1); packed_code_sizes[num_packed_code_sizes++] = 18; packed_code_sizes[num_packed_code_sizes++] = (mz_uint8)(rle_z_count - 11); \
+} rle_z_count = 0; } }
+
+static const mz_uint8 s_tdefl_packed_code_size_syms_swizzle[] = { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; // read-only: order in which the table-2 code sizes are written into a dynamic block header
+
+static void tdefl_start_dynamic_block(tdefl_compressor *d) // Builds Huffman tables 0 and 1 from the collected symbol counts and writes the dynamic-block header, including the run-length-packed code sizes coded with table 2.
+{
+  int num_lit_codes, num_dist_codes, num_bit_lengths; mz_uint i, total_code_sizes_to_pack, num_packed_code_sizes, rle_z_count, rle_repeat_count, packed_code_sizes_index;
+  mz_uint8 code_sizes_to_pack[TDEFL_MAX_HUFF_SYMBOLS_0 + TDEFL_MAX_HUFF_SYMBOLS_1], packed_code_sizes[TDEFL_MAX_HUFF_SYMBOLS_0 + TDEFL_MAX_HUFF_SYMBOLS_1], prev_code_size = 0xFF;
+
+  d->m_huff_count[0][256] = 1; // symbol 256 must receive a code: tdefl_compress_lz_codes() always emits it after the LZ data
+
+  tdefl_optimize_huffman_table(d, 0, TDEFL_MAX_HUFF_SYMBOLS_0, 15, MZ_FALSE);
+  tdefl_optimize_huffman_table(d, 1, TDEFL_MAX_HUFF_SYMBOLS_1, 15, MZ_FALSE);
+
+  for (num_lit_codes = 286; num_lit_codes > 257; num_lit_codes--) if (d->m_huff_code_sizes[0][num_lit_codes - 1]) break; // drop trailing unused table-0 symbols, keeping at least 257
+  for (num_dist_codes = 30; num_dist_codes > 1; num_dist_codes--) if (d->m_huff_code_sizes[1][num_dist_codes - 1]) break; // drop trailing unused table-1 symbols, keeping at least 1
+
+  memcpy(code_sizes_to_pack, &d->m_huff_code_sizes[0][0], num_lit_codes); // both code-size lists are packed as one concatenated sequence
+  memcpy(code_sizes_to_pack + num_lit_codes, &d->m_huff_code_sizes[1][0], num_dist_codes);
+  total_code_sizes_to_pack = num_lit_codes + num_dist_codes; num_packed_code_sizes = 0; rle_z_count = 0; rle_repeat_count = 0;
+
+  memset(&d->m_huff_count[2][0], 0, sizeof(d->m_huff_count[2][0]) * TDEFL_MAX_HUFF_SYMBOLS_2); // table-2 counts are rebuilt while packing
+  for (i = 0; i < total_code_sizes_to_pack; i++)
+  {
+    mz_uint8 code_size = code_sizes_to_pack[i];
+    if (!code_size)
+    {
+      TDEFL_RLE_PREV_CODE_SIZE();
+      if (++rle_z_count == 138) { TDEFL_RLE_ZERO_CODE_SIZE(); } // a zero run is flushed once it reaches 138 entries
+    }
+    else
+    {
+      TDEFL_RLE_ZERO_CODE_SIZE();
+      if (code_size != prev_code_size)
+      {
+        TDEFL_RLE_PREV_CODE_SIZE();
+        d->m_huff_count[2][code_size] = (mz_uint16)(d->m_huff_count[2][code_size] + 1); packed_code_sizes[num_packed_code_sizes++] = code_size;
+      }
+      else if (++rle_repeat_count == 6) // a repeat run is flushed once it reaches 6 entries
+      {
+        TDEFL_RLE_PREV_CODE_SIZE();
+      }
+    }
+    prev_code_size = code_size;
+  }
+  if (rle_repeat_count) { TDEFL_RLE_PREV_CODE_SIZE(); } else { TDEFL_RLE_ZERO_CODE_SIZE(); } // flush whichever run is still pending
+
+  tdefl_optimize_huffman_table(d, 2, TDEFL_MAX_HUFF_SYMBOLS_2, 7, MZ_FALSE); // table 2 codes the packed code sizes; its own code sizes are limited to 7
+
+  TDEFL_PUT_BITS(2, 2); // 2-bit field with value 2
+
+  TDEFL_PUT_BITS(num_lit_codes - 257, 5);
+  TDEFL_PUT_BITS(num_dist_codes - 1, 5);
+
+  for (num_bit_lengths = 18; num_bit_lengths >= 0; num_bit_lengths--) if (d->m_huff_code_sizes[2][s_tdefl_packed_code_size_syms_swizzle[num_bit_lengths]]) break; // find the last used entry in swizzle order
+  num_bit_lengths = MZ_MAX(4, (num_bit_lengths + 1)); TDEFL_PUT_BITS(num_bit_lengths - 4, 4); // at least 4 table-2 code sizes are always written
+  for (i = 0; (int)i < num_bit_lengths; i++) TDEFL_PUT_BITS(d->m_huff_code_sizes[2][s_tdefl_packed_code_size_syms_swizzle[i]], 3); // 3 bits per table-2 code size
+
+  for (packed_code_sizes_index = 0; packed_code_sizes_index < num_packed_code_sizes; )
+  {
+    mz_uint code = packed_code_sizes[packed_code_sizes_index++]; MZ_ASSERT(code < TDEFL_MAX_HUFF_SYMBOLS_2);
+    TDEFL_PUT_BITS(d->m_huff_codes[2][code], d->m_huff_code_sizes[2][code]);
+    if (code >= 16) TDEFL_PUT_BITS(packed_code_sizes[packed_code_sizes_index++], "\02\03\07"[code - 16]); // symbols 16/17/18 are followed by their run operand in 2/3/7 bits
+  }
+}
+
+static void tdefl_start_static_block(tdefl_compressor *d)
+{ // Loads the preset code sizes for tables 0 and 1, derives their codes, and writes the 2-bit field with value 1.
+  mz_uint8 *pLit_sizes = &d->m_huff_code_sizes[0][0];
+
+  // Table 0 code sizes: symbols 0..143 -> 8, 144..255 -> 9, 256..279 -> 7, 280..287 -> 8.
+  memset(pLit_sizes, 8, 144);
+  memset(pLit_sizes + 144, 9, 112);
+  memset(pLit_sizes + 256, 7, 24);
+  memset(pLit_sizes + 280, 8, 8);
+
+  memset(d->m_huff_code_sizes[1], 5, 32);
+
+  tdefl_optimize_huffman_table(d, 0, 288, 15, MZ_TRUE);
+  tdefl_optimize_huffman_table(d, 1, 32, 15, MZ_TRUE);
+
+  TDEFL_PUT_BITS(1, 2);
+}
+
+static const mz_uint mz_bitmasks[17] = { 0x0000, 0x0001, 0x0003, 0x0007, 0x000F, 0x001F, 0x003F, 0x007F, 0x00FF, 0x01FF, 0x03FF, 0x07FF, 0x0FFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF }; // mz_bitmasks[n] has the low n bits set, for n = 0..16
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN && MINIZ_HAS_64BIT_REGISTERS
+static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d) // Fast variant: Huffman-codes the buffered LZ codes through a local 64-bit bit buffer, then emits symbol 256. Returns MZ_FALSE if the output buffer fills up.
+{
+  mz_uint flags;
+  mz_uint8 *pLZ_codes;
+  mz_uint8 *pOutput_buf = d->m_pOutput_buf;
+  mz_uint8 *pLZ_code_buf_end = d->m_pLZ_code_buf;
+  mz_uint64 bit_buffer = d->m_bit_buffer;
+  mz_uint bits_in = d->m_bits_in;
+
+#define TDEFL_PUT_BITS_FAST(b, l) { bit_buffer |= (((mz_uint64)(b)) << bits_in); bits_in += (l); }
+
+  flags = 1;
+  for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < pLZ_code_buf_end; flags >>= 1)
+  {
+    if (flags == 1)
+      flags = *pLZ_codes++ | 0x100; // load the next flag byte; bit 8 is a sentinel marking when all 8 flags are used up
+
+    if (flags & 1) // flag set: a 3-byte match record (length byte, then 16-bit distance, low byte first)
+    {
+      mz_uint s0, s1, n0, n1, sym, num_extra_bits;
+      mz_uint match_len = pLZ_codes[0], match_dist = (pLZ_codes[1] | (pLZ_codes[2] << 8)); pLZ_codes += 3; // assembled from bytes (same as the portable variant) rather than read through a casted mz_uint16 pointer
+
+      MZ_ASSERT(d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+      TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][s_tdefl_len_sym[match_len]], d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+      TDEFL_PUT_BITS_FAST(match_len & mz_bitmasks[s_tdefl_len_extra[match_len]], s_tdefl_len_extra[match_len]);
+
+      // This sequence coaxes MSVC into using cmov's vs. jmp's.
+      s0 = s_tdefl_small_dist_sym[match_dist & 511];
+      n0 = s_tdefl_small_dist_extra[match_dist & 511];
+      s1 = s_tdefl_large_dist_sym[match_dist >> 8];
+      n1 = s_tdefl_large_dist_extra[match_dist >> 8];
+      sym = (match_dist < 512) ? s0 : s1;
+      num_extra_bits = (match_dist < 512) ? n0 : n1;
+
+      MZ_ASSERT(d->m_huff_code_sizes[1][sym]);
+      TDEFL_PUT_BITS_FAST(d->m_huff_codes[1][sym], d->m_huff_code_sizes[1][sym]);
+      TDEFL_PUT_BITS_FAST(match_dist & mz_bitmasks[num_extra_bits], num_extra_bits);
+    }
+    else // flag clear: a single literal byte
+    {
+      mz_uint lit = *pLZ_codes++;
+      MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+      TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+
+      if (((flags & 2) == 0) && (pLZ_codes < pLZ_code_buf_end)) // next record is also a literal: code it in the same iteration
+      {
+        flags >>= 1;
+        lit = *pLZ_codes++;
+        MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+        TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+
+        if (((flags & 2) == 0) && (pLZ_codes < pLZ_code_buf_end)) // and possibly a third literal
+        {
+          flags >>= 1;
+          lit = *pLZ_codes++;
+          MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+          TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+        }
+      }
+    }
+
+    if (pOutput_buf >= d->m_pOutput_buf_end)
+      return MZ_FALSE;
+
+    memcpy(pOutput_buf, &bit_buffer, sizeof(bit_buffer)); // store all 8 bytes without type-punning the byte buffer; only the completed bytes are kept by the advance below
+    pOutput_buf += (bits_in >> 3);
+    bit_buffer >>= (bits_in & ~7);
+    bits_in &= 7;
+  }
+
+#undef TDEFL_PUT_BITS_FAST
+
+  d->m_pOutput_buf = pOutput_buf;
+  d->m_bits_in = 0;
+  d->m_bit_buffer = 0;
+
+  while (bits_in) // hand the leftover (fewer than 8) bits back to the compressor's own bit buffer
+  {
+    mz_uint32 n = MZ_MIN(bits_in, 16);
+    TDEFL_PUT_BITS((mz_uint)bit_buffer & mz_bitmasks[n], n);
+    bit_buffer >>= n;
+    bits_in -= n;
+  }
+
+  TDEFL_PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]); // symbol 256 terminates the block's data
+
+  return (d->m_pOutput_buf < d->m_pOutput_buf_end);
+}
+#else
+static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d) // Portable variant: Huffman-codes the buffered LZ codes with TDEFL_PUT_BITS, then emits symbol 256. Returns whether the output pointer is still below the buffer end.
+{
+  mz_uint flags;
+  mz_uint8 *pLZ_codes;
+
+  flags = 1;
+  for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < d->m_pLZ_code_buf; flags >>= 1)
+  {
+    if (flags == 1)
+      flags = *pLZ_codes++ | 0x100; // load the next flag byte; bit 8 is a sentinel marking when all 8 flags are used up
+    if (flags & 1) // flag set: a 3-byte match record (length byte, then 16-bit distance, low byte first)
+    {
+      mz_uint sym, num_extra_bits;
+      mz_uint match_len = pLZ_codes[0], match_dist = (pLZ_codes[1] | (pLZ_codes[2] << 8)); pLZ_codes += 3;
+
+      MZ_ASSERT(d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+      TDEFL_PUT_BITS(d->m_huff_codes[0][s_tdefl_len_sym[match_len]], d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+      TDEFL_PUT_BITS(match_len & mz_bitmasks[s_tdefl_len_extra[match_len]], s_tdefl_len_extra[match_len]);
+
+      if (match_dist < 512) // small distances index their tables directly
+      {
+        sym = s_tdefl_small_dist_sym[match_dist]; num_extra_bits = s_tdefl_small_dist_extra[match_dist];
+      }
+      else // larger distances index by their high byte
+      {
+        sym = s_tdefl_large_dist_sym[match_dist >> 8]; num_extra_bits = s_tdefl_large_dist_extra[match_dist >> 8];
+      }
+      MZ_ASSERT(d->m_huff_code_sizes[1][sym]);
+      TDEFL_PUT_BITS(d->m_huff_codes[1][sym], d->m_huff_code_sizes[1][sym]);
+      TDEFL_PUT_BITS(match_dist & mz_bitmasks[num_extra_bits], num_extra_bits);
+    }
+    else // flag clear: a single literal byte
+    {
+      mz_uint lit = *pLZ_codes++;
+      MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+      TDEFL_PUT_BITS(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+    }
+  }
+
+  TDEFL_PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]); // symbol 256 terminates the block's data
+
+  return (d->m_pOutput_buf < d->m_pOutput_buf_end);
+}
+#endif // MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN && MINIZ_HAS_64BIT_REGISTERS
+
+static mz_bool tdefl_compress_block(tdefl_compressor *d, mz_bool static_block)
+{ // Starts a static or dynamic block as requested, then codes the buffered LZ data; returns tdefl_compress_lz_codes()'s result.
+  if (!static_block)
+    tdefl_start_dynamic_block(d);
+  else
+    tdefl_start_static_block(d);
+  return tdefl_compress_lz_codes(d);
+}
+
+// Encodes the LZ codes accumulated since the previous flush as one deflate block and
+// delivers the bytes to the caller (put-buf callback or output buffer).
+//   flush - compared against TDEFL_FINISH to set the final-block bit and append the trailer;
+//           any other non-zero value appends an empty stored block instead.
+// Returns d->m_output_flush_remaining (bytes still waiting in d->m_output_buf), or
+// TDEFL_STATUS_PUT_BUF_FAILED when the callback reports failure (callers in this file
+// treat a negative return as failure).
+static int tdefl_flush_block(tdefl_compressor *d, int flush)
+{
+  mz_uint saved_bit_buf, saved_bits_in;
+  mz_uint8 *pSaved_output_buf;
+  mz_bool comp_block_succeeded = MZ_FALSE;
+  int n, use_raw_block = ((d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS) != 0) && (d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size;
+  // Write straight into the caller's buffer when there is no callback and at least
+  // TDEFL_OUT_BUF_SIZE bytes of room; otherwise stage the block in d->m_output_buf.
+  mz_uint8 *pOutput_buf_start = ((d->m_pPut_buf_func == NULL) && ((*d->m_pOut_buf_size - d->m_out_buf_ofs) >= TDEFL_OUT_BUF_SIZE)) ? ((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs) : d->m_output_buf;
+
+  d->m_pOutput_buf = pOutput_buf_start;
+  d->m_pOutput_buf_end = d->m_pOutput_buf + TDEFL_OUT_BUF_SIZE - 16;
+
+  MZ_ASSERT(!d->m_output_flush_remaining);
+  d->m_output_flush_ofs = 0;
+  d->m_output_flush_remaining = 0;
+
+  // Right-align the partially filled flags byte; if it holds no flags at all (8 left),
+  // give back the code buffer byte that was reserved for it.
+  *d->m_pLZ_flags = (mz_uint8)(*d->m_pLZ_flags >> d->m_num_flags_left);
+  d->m_pLZ_code_buf -= (d->m_num_flags_left == 8);
+
+  // The two-byte zlib header (0x78 0x01) precedes the very first block only.
+  if ((d->m_flags & TDEFL_WRITE_ZLIB_HEADER) && (!d->m_block_index))
+  {
+    TDEFL_PUT_BITS(0x78, 8); TDEFL_PUT_BITS(0x01, 8);
+  }
+
+  // Final-block bit.
+  TDEFL_PUT_BITS(flush == TDEFL_FINISH, 1);
+
+  // Remember the output position and bit buffer so the block can be re-emitted in another form.
+  pSaved_output_buf = d->m_pOutput_buf; saved_bit_buf = d->m_bit_buffer; saved_bits_in = d->m_bits_in;
+
+  // A static block is used when forced by flag or when fewer than 48 input bytes are covered.
+  if (!use_raw_block)
+    comp_block_succeeded = tdefl_compress_block(d, (d->m_flags & TDEFL_FORCE_ALL_STATIC_BLOCKS) || (d->m_total_lz_bytes < 48));
+
+  // If the block gets expanded, forget the current contents of the output buffer and send a raw block instead.
+  if ( ((use_raw_block) || ((d->m_total_lz_bytes) && ((d->m_pOutput_buf - pSaved_output_buf + 1U) >= d->m_total_lz_bytes))) &&
+       ((d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size) )
+  {
+    mz_uint i; d->m_pOutput_buf = pSaved_output_buf; d->m_bit_buffer = saved_bit_buf, d->m_bits_in = saved_bits_in;
+    // Block type 0, then pad to a byte boundary.
+    TDEFL_PUT_BITS(0, 2);
+    if (d->m_bits_in) { TDEFL_PUT_BITS(0, 8 - d->m_bits_in); }
+    // Writes the 16-bit length followed by its complement; XORing twice restores m_total_lz_bytes.
+    for (i = 2; i; --i, d->m_total_lz_bytes ^= 0xFFFF)
+    {
+      TDEFL_PUT_BITS(d->m_total_lz_bytes & 0xFFFF, 16);
+    }
+    // Copy the block's source bytes verbatim out of the dictionary.
+    for (i = 0; i < d->m_total_lz_bytes; ++i)
+    {
+      TDEFL_PUT_BITS(d->m_dict[(d->m_lz_code_buf_dict_pos + i) & TDEFL_LZ_DICT_SIZE_MASK], 8);
+    }
+  }
+  // Check for the extremely unlikely (if not impossible) case of the compressed block not fitting into the output buffer when using dynamic codes.
+  else if (!comp_block_succeeded)
+  {
+    d->m_pOutput_buf = pSaved_output_buf; d->m_bit_buffer = saved_bit_buf, d->m_bits_in = saved_bits_in;
+    tdefl_compress_block(d, MZ_TRUE);
+  }
+
+  if (flush)
+  {
+    if (flush == TDEFL_FINISH)
+    {
+      // Pad to a byte boundary, then (zlib streams only) append the Adler-32, most significant byte first.
+      if (d->m_bits_in) { TDEFL_PUT_BITS(0, 8 - d->m_bits_in); }
+      if (d->m_flags & TDEFL_WRITE_ZLIB_HEADER) { mz_uint i, a = d->m_adler32; for (i = 0; i < 4; i++) { TDEFL_PUT_BITS((a >> 24) & 0xFF, 8); a <<= 8; } }
+    }
+    else
+    {
+      // Non-final flush: three zero bits, byte alignment, then the 16-bit values 0x0000 and 0xFFFF
+      // (a zero-length stored block).
+      mz_uint i, z = 0; TDEFL_PUT_BITS(0, 3); if (d->m_bits_in) { TDEFL_PUT_BITS(0, 8 - d->m_bits_in); } for (i = 2; i; --i, z ^= 0xFFFF) { TDEFL_PUT_BITS(z & 0xFFFF, 16); }
+    }
+  }
+
+  MZ_ASSERT(d->m_pOutput_buf < d->m_pOutput_buf_end);
+
+  // Reset the symbol frequency tables for the next block.
+  memset(&d->m_huff_count[0][0], 0, sizeof(d->m_huff_count[0][0]) * TDEFL_MAX_HUFF_SYMBOLS_0);
+  memset(&d->m_huff_count[1][0], 0, sizeof(d->m_huff_count[1][0]) * TDEFL_MAX_HUFF_SYMBOLS_1);
+
+  // Rewind the LZ code buffer (byte 0 becomes the first flags byte) and advance the block bookkeeping.
+  d->m_pLZ_code_buf = d->m_lz_code_buf + 1; d->m_pLZ_flags = d->m_lz_code_buf; d->m_num_flags_left = 8; d->m_lz_code_buf_dict_pos += d->m_total_lz_bytes; d->m_total_lz_bytes = 0; d->m_block_index++;
+
+  // Deliver the n bytes produced above.
+  if ((n = (int)(d->m_pOutput_buf - pOutput_buf_start)) != 0)
+  {
+    if (d->m_pPut_buf_func)
+    {
+      *d->m_pIn_buf_size = d->m_pSrc - (const mz_uint8 *)d->m_pIn_buf;
+      if (!(*d->m_pPut_buf_func)(d->m_output_buf, n, d->m_pPut_buf_user))
+        return (d->m_prev_return_status = TDEFL_STATUS_PUT_BUF_FAILED);
+    }
+    else if (pOutput_buf_start == d->m_output_buf)
+    {
+      // Staged output: copy what fits into the caller's buffer and remember the rest
+      // for tdefl_flush_output_buffer().
+      int bytes_to_copy = (int)MZ_MIN((size_t)n, (size_t)(*d->m_pOut_buf_size - d->m_out_buf_ofs));
+      memcpy((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs, d->m_output_buf, bytes_to_copy);
+      d->m_out_buf_ofs += bytes_to_copy;
+      if ((n -= bytes_to_copy) != 0)
+      {
+        d->m_output_flush_ofs = bytes_to_copy;
+        d->m_output_flush_remaining = n;
+      }
+    }
+    else
+    {
+      // The block was written directly into the caller's buffer; only the offset moves.
+      d->m_out_buf_ofs += n;
+    }
+  }
+
+  return d->m_output_flush_remaining;
+}
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES
+#define TDEFL_READ_UNALIGNED_WORD(p) *(const mz_uint16*)(p)
+// Hash-chain match search (variant that reads the dictionary 16 bits at a time).
+// Follows d->m_next from lookahead_pos looking for a match longer than *pMatch_len.
+//   max_dist      - candidates farther back than this end the search.
+//   max_match_len - upper bound on the reported length (asserted <= TDEFL_MAX_MATCH_LEN).
+//   pMatch_dist / pMatch_len - in: current best; out: updated only when a longer match is found.
+// The number of probes is limited by d->m_max_probes[], indexed by whether the incoming length is >= 32.
+static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint lookahead_pos, mz_uint max_dist, mz_uint max_match_len, mz_uint *pMatch_dist, mz_uint *pMatch_len)
+{
+  mz_uint dist, pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK, match_len = *pMatch_len, probe_pos = pos, next_probe_pos, probe_len;
+  mz_uint num_probes_left = d->m_max_probes[match_len >= 32];
+  const mz_uint16 *s = (const mz_uint16*)(d->m_dict + pos), *p, *q;
+  // c01: the two bytes ending the current best match; s01: the first two bytes at the search position.
+  mz_uint16 c01 = TDEFL_READ_UNALIGNED_WORD(&d->m_dict[pos + match_len - 1]), s01 = TDEFL_READ_UNALIGNED_WORD(s);
+  MZ_ASSERT(max_match_len <= TDEFL_MAX_MATCH_LEN); if (max_match_len <= match_len) return;
+  for ( ; ; )
+  {
+    for ( ; ; )
+    {
+      if (--num_probes_left == 0) return;
+      // One chain step: returns when the chain ends or the candidate is too far back; breaks out
+      // of the inner loop when the candidate agrees with c01 at the end of the current best match.
+      #define TDEFL_PROBE \
+        next_probe_pos = d->m_next[probe_pos]; \
+        if ((!next_probe_pos) || ((dist = (mz_uint16)(lookahead_pos - next_probe_pos)) > max_dist)) return; \
+        probe_pos = next_probe_pos & TDEFL_LZ_DICT_SIZE_MASK; \
+        if (TDEFL_READ_UNALIGNED_WORD(&d->m_dict[probe_pos + match_len - 1]) == c01) break;
+      TDEFL_PROBE; TDEFL_PROBE; TDEFL_PROBE;
+    }
+    // Reject candidates whose first two bytes differ, then compare four words per iteration for up to 32 iterations.
+    if (!dist) break; q = (const mz_uint16*)(d->m_dict + probe_pos); if (TDEFL_READ_UNALIGNED_WORD(q) != s01) continue; p = s; probe_len = 32;
+    do { } while ( (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) &&
+                   (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (--probe_len > 0) );
+    if (!probe_len)
+    {
+      // Every compared word matched: report the longest length allowed and stop.
+      *pMatch_dist = dist; *pMatch_len = MZ_MIN(max_match_len, TDEFL_MAX_MATCH_LEN); break;
+    }
+    // Length in bytes = whole words matched * 2, plus one if the first byte of the differing word matches.
+    else if ((probe_len = ((mz_uint)(p - s) * 2) + (mz_uint)(*(const mz_uint8*)p == *(const mz_uint8*)q)) > match_len)
+    {
+      *pMatch_dist = dist; if ((*pMatch_len = match_len = MZ_MIN(max_match_len, probe_len)) == max_match_len) break;
+      // Longer match recorded; refresh the end-of-match bytes used to filter later candidates.
+      c01 = TDEFL_READ_UNALIGNED_WORD(&d->m_dict[pos + match_len - 1]);
+    }
+  }
+}
+#else
+// Hash-chain match search (byte-at-a-time variant, compiled when unaligned loads are not enabled).
+// Follows d->m_next from lookahead_pos looking for a match longer than *pMatch_len.
+//   max_dist      - candidates farther back than this end the search.
+//   max_match_len - upper bound on the reported length (asserted <= TDEFL_MAX_MATCH_LEN).
+//   pMatch_dist / pMatch_len - in: current best; out: updated only when a longer match is found.
+// The number of probes is limited by d->m_max_probes[], indexed by whether the incoming length is >= 32.
+static MZ_FORCEINLINE void tdefl_find_match(tdefl_compressor *d, mz_uint lookahead_pos, mz_uint max_dist, mz_uint max_match_len, mz_uint *pMatch_dist, mz_uint *pMatch_len)
+{
+  mz_uint dist, pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK, match_len = *pMatch_len, probe_pos = pos, next_probe_pos, probe_len;
+  mz_uint num_probes_left = d->m_max_probes[match_len >= 32];
+  const mz_uint8 *s = d->m_dict + pos, *p, *q;
+  // c0: the byte a longer match would have to reproduce; c1: the last byte of the current best match.
+  mz_uint8 c0 = d->m_dict[pos + match_len], c1 = d->m_dict[pos + match_len - 1];
+  MZ_ASSERT(max_match_len <= TDEFL_MAX_MATCH_LEN); if (max_match_len <= match_len) return;
+  for ( ; ; )
+  {
+    for ( ; ; )
+    {
+      if (--num_probes_left == 0) return;
+      // One chain step: returns when the chain ends or the candidate is too far back; breaks out
+      // of the inner loop when the candidate agrees with c0 and c1 at the end of the current best match.
+      #define TDEFL_PROBE \
+        next_probe_pos = d->m_next[probe_pos]; \
+        if ((!next_probe_pos) || ((dist = (mz_uint16)(lookahead_pos - next_probe_pos)) > max_dist)) return; \
+        probe_pos = next_probe_pos & TDEFL_LZ_DICT_SIZE_MASK; \
+        if ((d->m_dict[probe_pos + match_len] == c0) && (d->m_dict[probe_pos + match_len - 1] == c1)) break;
+      TDEFL_PROBE; TDEFL_PROBE; TDEFL_PROBE;
+    }
+    // Measure the candidate byte by byte, up to max_match_len.
+    if (!dist) break; p = s; q = d->m_dict + probe_pos; for (probe_len = 0; probe_len < max_match_len; probe_len++) if (*p++ != *q++) break;
+    if (probe_len > match_len)
+    {
+      *pMatch_dist = dist; if ((*pMatch_len = match_len = probe_len) == max_match_len) return;
+      // Longer match recorded; refresh the filter bytes for later candidates.
+      c0 = d->m_dict[pos + match_len]; c1 = d->m_dict[pos + match_len - 1];
+    }
+  }
+}
+#endif // #if MINIZ_USE_UNALIGNED_LOADS_AND_STORES
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+// Fast compression path. tdefl_compress() selects it when the probe count in the flags is 1,
+// greedy parsing is on, and none of the filter / raw-block / RLE flags are set.
+// Returns MZ_FALSE only when tdefl_flush_block() returns a negative value; returns MZ_TRUE
+// otherwise, including the early exit taken when a flush leaves output pending.
+static mz_bool tdefl_compress_fast(tdefl_compressor *d)
+{
+  // Faster, minimally featured LZRW1-style match+parse loop with better register utilization. Intended for applications where raw throughput is valued more highly than ratio.
+  // The compressor state is mirrored in locals; it is written back to *d before every flush and on exit.
+  mz_uint lookahead_pos = d->m_lookahead_pos, lookahead_size = d->m_lookahead_size, dict_size = d->m_dict_size, total_lz_bytes = d->m_total_lz_bytes, num_flags_left = d->m_num_flags_left;
+  mz_uint8 *pLZ_code_buf = d->m_pLZ_code_buf, *pLZ_flags = d->m_pLZ_flags;
+  mz_uint cur_pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK;
+
+  while ((d->m_src_buf_left) || ((d->m_flush) && (lookahead_size)))
+  {
+    const mz_uint TDEFL_COMP_FAST_LOOKAHEAD_SIZE = 4096;
+    mz_uint dst_pos = (lookahead_pos + lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK;
+    mz_uint num_bytes_to_process = (mz_uint)MZ_MIN(d->m_src_buf_left, TDEFL_COMP_FAST_LOOKAHEAD_SIZE - lookahead_size);
+    d->m_src_buf_left -= num_bytes_to_process;
+    lookahead_size += num_bytes_to_process;
+
+    // Copy input into the circular dictionary, splitting at the wrap point. Bytes that land in the
+    // first TDEFL_MAX_MATCH_LEN - 1 positions are also mirrored past the end of the dictionary.
+    while (num_bytes_to_process)
+    {
+      mz_uint32 n = MZ_MIN(TDEFL_LZ_DICT_SIZE - dst_pos, num_bytes_to_process);
+      memcpy(d->m_dict + dst_pos, d->m_pSrc, n);
+      if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1))
+        memcpy(d->m_dict + TDEFL_LZ_DICT_SIZE + dst_pos, d->m_pSrc, MZ_MIN(n, (TDEFL_MAX_MATCH_LEN - 1) - dst_pos));
+      d->m_pSrc += n;
+      dst_pos = (dst_pos + n) & TDEFL_LZ_DICT_SIZE_MASK;
+      num_bytes_to_process -= n;
+    }
+
+    dict_size = MZ_MIN(TDEFL_LZ_DICT_SIZE - lookahead_size, dict_size);
+    // Unless flushing, wait until a full lookahead window has been gathered.
+    if ((!d->m_flush) && (lookahead_size < TDEFL_COMP_FAST_LOOKAHEAD_SIZE)) break;
+
+    // Main parse loop: needs at least 4 bytes because the trigram is loaded with a 32-bit read.
+    while (lookahead_size >= 4)
+    {
+      mz_uint cur_match_dist, cur_match_len = 1;
+      mz_uint8 *pCur_dict = d->m_dict + cur_pos;
+      mz_uint first_trigram = (*(const mz_uint32 *)pCur_dict) & 0xFFFFFF;
+      mz_uint hash = (first_trigram ^ (first_trigram >> (24 - (TDEFL_LZ_HASH_BITS - 8)))) & TDEFL_LEVEL1_HASH_SIZE_MASK;
+      // Single probe: take the previous position stored for this hash and replace it with the current one.
+      mz_uint probe_pos = d->m_hash[hash];
+      d->m_hash[hash] = (mz_uint16)lookahead_pos;
+
+      if (((cur_match_dist = (mz_uint16)(lookahead_pos - probe_pos)) <= dict_size) && ((*(const mz_uint32 *)(d->m_dict + (probe_pos &= TDEFL_LZ_DICT_SIZE_MASK)) & 0xFFFFFF) == first_trigram))
+      {
+        const mz_uint16 *p = (const mz_uint16 *)pCur_dict;
+        const mz_uint16 *q = (const mz_uint16 *)(d->m_dict + probe_pos);
+        mz_uint32 probe_len = 32;
+        // Compare four 16-bit words per iteration, for up to 32 iterations.
+        do { } while ( (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) &&
+          (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) && (--probe_len > 0) );
+        cur_match_len = ((mz_uint)(p - (const mz_uint16 *)pCur_dict) * 2) + (mz_uint)(*(const mz_uint8 *)p == *(const mz_uint8 *)q);
+        // All iterations matched: treat it as a maximum-length match, unless the distance is 0.
+        if (!probe_len)
+          cur_match_len = cur_match_dist ? TDEFL_MAX_MATCH_LEN : 0;
+
+        // Too short, or a minimum-length match at distance >= 8K: emit a literal instead.
+        if ((cur_match_len < TDEFL_MIN_MATCH_LEN) || ((cur_match_len == TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 8U*1024U)))
+        {
+          cur_match_len = 1;
+          *pLZ_code_buf++ = (mz_uint8)first_trigram;
+          *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+          d->m_huff_count[0][(mz_uint8)first_trigram]++;
+        }
+        else
+        {
+          mz_uint32 s0, s1;
+          cur_match_len = MZ_MIN(cur_match_len, lookahead_size);
+
+          MZ_ASSERT((cur_match_len >= TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 1) && (cur_match_dist <= TDEFL_LZ_DICT_SIZE));
+
+          cur_match_dist--;
+
+          // Match record: one byte of (length - TDEFL_MIN_MATCH_LEN), then a 16-bit (distance - 1); flag bit set.
+          pLZ_code_buf[0] = (mz_uint8)(cur_match_len - TDEFL_MIN_MATCH_LEN);
+          *(mz_uint16 *)(&pLZ_code_buf[1]) = (mz_uint16)cur_match_dist;
+          pLZ_code_buf += 3;
+          *pLZ_flags = (mz_uint8)((*pLZ_flags >> 1) | 0x80);
+
+          s0 = s_tdefl_small_dist_sym[cur_match_dist & 511];
+          s1 = s_tdefl_large_dist_sym[cur_match_dist >> 8];
+          d->m_huff_count[1][(cur_match_dist < 512) ? s0 : s1]++;
+
+          d->m_huff_count[0][s_tdefl_len_sym[cur_match_len - TDEFL_MIN_MATCH_LEN]]++;
+        }
+      }
+      else
+      {
+        // No usable candidate: emit the first byte as a literal (flag bit clear).
+        *pLZ_code_buf++ = (mz_uint8)first_trigram;
+        *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+        d->m_huff_count[0][(mz_uint8)first_trigram]++;
+      }
+
+      // After 8 flags, reserve the next code buffer byte as the new flags byte.
+      if (--num_flags_left == 0) { num_flags_left = 8; pLZ_flags = pLZ_code_buf++; }
+
+      total_lz_bytes += cur_match_len;
+      lookahead_pos += cur_match_len;
+      dict_size = MZ_MIN(dict_size + cur_match_len, TDEFL_LZ_DICT_SIZE);
+      cur_pos = (cur_pos + cur_match_len) & TDEFL_LZ_DICT_SIZE_MASK;
+      MZ_ASSERT(lookahead_size >= cur_match_len);
+      lookahead_size -= cur_match_len;
+
+      // Code buffer nearly full: publish the local state, flush a block, then reload what the flush reset.
+      if (pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8])
+      {
+        int n;
+        d->m_lookahead_pos = lookahead_pos; d->m_lookahead_size = lookahead_size; d->m_dict_size = dict_size;
+        d->m_total_lz_bytes = total_lz_bytes; d->m_pLZ_code_buf = pLZ_code_buf; d->m_pLZ_flags = pLZ_flags; d->m_num_flags_left = num_flags_left;
+        if ((n = tdefl_flush_block(d, 0)) != 0)
+          return (n < 0) ? MZ_FALSE : MZ_TRUE;
+        total_lz_bytes = d->m_total_lz_bytes; pLZ_code_buf = d->m_pLZ_code_buf; pLZ_flags = d->m_pLZ_flags; num_flags_left = d->m_num_flags_left;
+      }
+    }
+
+    // Fewer than 4 bytes remain in the lookahead: emit them as literals.
+    while (lookahead_size)
+    {
+      mz_uint8 lit = d->m_dict[cur_pos];
+
+      total_lz_bytes++;
+      *pLZ_code_buf++ = lit;
+      *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+      if (--num_flags_left == 0) { num_flags_left = 8; pLZ_flags = pLZ_code_buf++; }
+
+      d->m_huff_count[0][lit]++;
+
+      lookahead_pos++;
+      dict_size = MZ_MIN(dict_size + 1, TDEFL_LZ_DICT_SIZE);
+      cur_pos = (cur_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK;
+      lookahead_size--;
+
+      // Same flush-when-nearly-full handling as in the match loop above.
+      if (pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8])
+      {
+        int n;
+        d->m_lookahead_pos = lookahead_pos; d->m_lookahead_size = lookahead_size; d->m_dict_size = dict_size;
+        d->m_total_lz_bytes = total_lz_bytes; d->m_pLZ_code_buf = pLZ_code_buf; d->m_pLZ_flags = pLZ_flags; d->m_num_flags_left = num_flags_left;
+        if ((n = tdefl_flush_block(d, 0)) != 0)
+          return (n < 0) ? MZ_FALSE : MZ_TRUE;
+        total_lz_bytes = d->m_total_lz_bytes; pLZ_code_buf = d->m_pLZ_code_buf; pLZ_flags = d->m_pLZ_flags; num_flags_left = d->m_num_flags_left;
+      }
+    }
+  }
+
+  // Publish the local state back to the compressor.
+  d->m_lookahead_pos = lookahead_pos; d->m_lookahead_size = lookahead_size; d->m_dict_size = dict_size;
+  d->m_total_lz_bytes = total_lz_bytes; d->m_pLZ_code_buf = pLZ_code_buf; d->m_pLZ_flags = pLZ_flags; d->m_num_flags_left = num_flags_left;
+  return MZ_TRUE;
+}
+#endif // MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+
+// Appends one literal byte to the LZ code buffer: counts it in the byte total and the
+// literal frequency table, stores the byte, and shifts a clear flag bit into the flags byte.
+static MZ_FORCEINLINE void tdefl_record_literal(tdefl_compressor *d, mz_uint8 lit)
+{
+  d->m_total_lz_bytes += 1;
+  d->m_huff_count[0][lit] += 1;
+
+  d->m_pLZ_code_buf[0] = lit;
+  d->m_pLZ_code_buf += 1;
+  *d->m_pLZ_flags = (mz_uint8)(*d->m_pLZ_flags >> 1);
+
+  d->m_num_flags_left -= 1;
+  if (d->m_num_flags_left == 0)
+  {
+    // The flags byte is full: the next code buffer byte becomes the new flags byte.
+    d->m_pLZ_flags = d->m_pLZ_code_buf;
+    d->m_pLZ_code_buf += 1;
+    d->m_num_flags_left = 8;
+  }
+}
+
+// Appends one (length, distance) match to the LZ code buffer as three bytes:
+// length - TDEFL_MIN_MATCH_LEN, then distance - 1 as low byte / high byte.
+// Shifts a set flag bit into the flags byte and updates the length and distance frequency tables.
+static MZ_FORCEINLINE void tdefl_record_match(tdefl_compressor *d, mz_uint match_len, mz_uint match_dist)
+{
+  mz_uint32 dist_sym;
+  mz_uint dist_minus_one;
+
+  MZ_ASSERT((match_len >= TDEFL_MIN_MATCH_LEN) && (match_dist >= 1) && (match_dist <= TDEFL_LZ_DICT_SIZE));
+
+  dist_minus_one = match_dist - 1;
+  d->m_total_lz_bytes += match_len;
+
+  d->m_pLZ_code_buf[0] = (mz_uint8)(match_len - TDEFL_MIN_MATCH_LEN);
+  d->m_pLZ_code_buf[1] = (mz_uint8)(dist_minus_one & 0xFF);
+  d->m_pLZ_code_buf[2] = (mz_uint8)(dist_minus_one >> 8);
+  d->m_pLZ_code_buf += 3;
+
+  *d->m_pLZ_flags = (mz_uint8)((*d->m_pLZ_flags >> 1) | 0x80);
+  if (--d->m_num_flags_left == 0)
+  {
+    // The flags byte is full: the next code buffer byte becomes the new flags byte.
+    d->m_num_flags_left = 8;
+    d->m_pLZ_flags = d->m_pLZ_code_buf++;
+  }
+
+  // Distances below 512 use the small table, everything else the large one.
+  if (dist_minus_one < 512)
+    dist_sym = s_tdefl_small_dist_sym[dist_minus_one & 511];
+  else
+    dist_sym = s_tdefl_large_dist_sym[(dist_minus_one >> 8) & 127];
+  d->m_huff_count[1][dist_sym]++;
+
+  if (match_len >= TDEFL_MIN_MATCH_LEN)
+  {
+    d->m_huff_count[0][s_tdefl_len_sym[match_len - TDEFL_MIN_MATCH_LEN]]++;
+  }
+}
+
+// General compression path: feeds input into the dictionary and hash chains, then runs a
+// lazy/greedy parser that records literals and matches into the LZ code buffer.
+// Returns MZ_FALSE only when tdefl_flush_block() returns a negative value; returns MZ_TRUE
+// otherwise, including the early exit taken when a flush leaves output pending.
+static mz_bool tdefl_compress_normal(tdefl_compressor *d)
+{
+  // Local copies of the input cursor; written back to *d before every flush and on exit.
+  const mz_uint8 *pSrc = d->m_pSrc; size_t src_buf_left = d->m_src_buf_left;
+  tdefl_flush flush = d->m_flush;
+
+  while ((src_buf_left) || ((flush) && (d->m_lookahead_size)))
+  {
+    mz_uint len_to_move, cur_match_dist, cur_match_len, cur_pos;
+    // Update dictionary and hash chains. Keeps the lookahead size equal to TDEFL_MAX_MATCH_LEN.
+    if ((d->m_lookahead_size + d->m_dict_size) >= (TDEFL_MIN_MATCH_LEN - 1))
+    {
+      // At least two bytes are already buffered, so the rolling hash can be seeded from them.
+      mz_uint dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK, ins_pos = d->m_lookahead_pos + d->m_lookahead_size - 2;
+      mz_uint hash = (d->m_dict[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] << TDEFL_LZ_HASH_SHIFT) ^ d->m_dict[(ins_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK];
+      mz_uint num_bytes_to_process = (mz_uint)MZ_MIN(src_buf_left, TDEFL_MAX_MATCH_LEN - d->m_lookahead_size);
+      const mz_uint8 *pSrc_end = pSrc + num_bytes_to_process;
+      src_buf_left -= num_bytes_to_process;
+      d->m_lookahead_size += num_bytes_to_process;
+      while (pSrc != pSrc_end)
+      {
+        // Store the byte (mirrored past the dictionary end when it lands near the start),
+        // roll the hash, and push ins_pos onto the front of that hash's chain.
+        mz_uint8 c = *pSrc++; d->m_dict[dst_pos] = c; if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1)) d->m_dict[TDEFL_LZ_DICT_SIZE + dst_pos] = c;
+        hash = ((hash << TDEFL_LZ_HASH_SHIFT) ^ c) & (TDEFL_LZ_HASH_SIZE - 1);
+        d->m_next[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] = d->m_hash[hash]; d->m_hash[hash] = (mz_uint16)(ins_pos);
+        dst_pos = (dst_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK; ins_pos++;
+      }
+    }
+    else
+    {
+      // Fewer than two bytes buffered: add bytes one at a time and only start hashing
+      // once TDEFL_MIN_MATCH_LEN bytes are available.
+      while ((src_buf_left) && (d->m_lookahead_size < TDEFL_MAX_MATCH_LEN))
+      {
+        mz_uint8 c = *pSrc++;
+        mz_uint dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK;
+        src_buf_left--;
+        d->m_dict[dst_pos] = c;
+        if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1))
+          d->m_dict[TDEFL_LZ_DICT_SIZE + dst_pos] = c;
+        if ((++d->m_lookahead_size + d->m_dict_size) >= TDEFL_MIN_MATCH_LEN)
+        {
+          mz_uint ins_pos = d->m_lookahead_pos + (d->m_lookahead_size - 1) - 2;
+          mz_uint hash = ((d->m_dict[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] << (TDEFL_LZ_HASH_SHIFT * 2)) ^ (d->m_dict[(ins_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK] << TDEFL_LZ_HASH_SHIFT) ^ c) & (TDEFL_LZ_HASH_SIZE - 1);
+          d->m_next[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] = d->m_hash[hash]; d->m_hash[hash] = (mz_uint16)(ins_pos);
+        }
+      }
+    }
+    d->m_dict_size = MZ_MIN(TDEFL_LZ_DICT_SIZE - d->m_lookahead_size, d->m_dict_size);
+    // Unless flushing, wait until the lookahead is full before parsing.
+    if ((!flush) && (d->m_lookahead_size < TDEFL_MAX_MATCH_LEN))
+      break;
+
+    // Simple lazy/greedy parsing state machine.
+    len_to_move = 1; cur_match_dist = 0; cur_match_len = d->m_saved_match_len ? d->m_saved_match_len : (TDEFL_MIN_MATCH_LEN - 1); cur_pos = d->m_lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK;
+    if (d->m_flags & (TDEFL_RLE_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS))
+    {
+      // RLE mode only looks for a run of the previous byte (distance 1); raw-block mode searches for nothing.
+      if ((d->m_dict_size) && (!(d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS)))
+      {
+        mz_uint8 c = d->m_dict[(cur_pos - 1) & TDEFL_LZ_DICT_SIZE_MASK];
+        cur_match_len = 0; while (cur_match_len < d->m_lookahead_size) { if (d->m_dict[cur_pos + cur_match_len] != c) break; cur_match_len++; }
+        if (cur_match_len < TDEFL_MIN_MATCH_LEN) cur_match_len = 0; else cur_match_dist = 1;
+      }
+    }
+    else
+    {
+      tdefl_find_match(d, d->m_lookahead_pos, d->m_dict_size, d->m_lookahead_size, &cur_match_dist, &cur_match_len);
+    }
+    // Discard the match when it is minimum-length at distance >= 8K, when cur_pos equals the distance,
+    // or when match filtering is on and the match is 5 bytes or shorter.
+    if (((cur_match_len == TDEFL_MIN_MATCH_LEN) && (cur_match_dist >= 8U*1024U)) || (cur_pos == cur_match_dist) || ((d->m_flags & TDEFL_FILTER_MATCHES) && (cur_match_len <= 5)))
+    {
+      cur_match_dist = cur_match_len = 0;
+    }
+    if (d->m_saved_match_len)
+    {
+      // A match was deferred on the previous iteration; decide between it and the new one.
+      if (cur_match_len > d->m_saved_match_len)
+      {
+        // The new match is longer: the deferred position becomes a literal.
+        tdefl_record_literal(d, (mz_uint8)d->m_saved_lit);
+        if (cur_match_len >= 128)
+        {
+          tdefl_record_match(d, cur_match_len, cur_match_dist);
+          d->m_saved_match_len = 0; len_to_move = cur_match_len;
+        }
+        else
+        {
+          d->m_saved_lit = d->m_dict[cur_pos]; d->m_saved_match_dist = cur_match_dist; d->m_saved_match_len = cur_match_len;
+        }
+      }
+      else
+      {
+        // The deferred match wins; one of its bytes was already stepped over when it was saved.
+        tdefl_record_match(d, d->m_saved_match_len, d->m_saved_match_dist);
+        len_to_move = d->m_saved_match_len - 1; d->m_saved_match_len = 0;
+      }
+    }
+    else if (!cur_match_dist)
+      tdefl_record_literal(d, d->m_dict[MZ_MIN(cur_pos, sizeof(d->m_dict) - 1)]);
+    else if ((d->m_greedy_parsing) || (d->m_flags & TDEFL_RLE_MATCHES) || (cur_match_len >= 128))
+    {
+      // Greedy parsing, RLE mode, or a match of 128+ bytes: take the match immediately.
+      tdefl_record_match(d, cur_match_len, cur_match_dist);
+      len_to_move = cur_match_len;
+    }
+    else
+    {
+      // Lazy parsing: defer the match and advance one byte to see whether a longer one follows.
+      d->m_saved_lit = d->m_dict[MZ_MIN(cur_pos, sizeof(d->m_dict) - 1)]; d->m_saved_match_dist = cur_match_dist; d->m_saved_match_len = cur_match_len;
+    }
+    // Move the lookahead forward by len_to_move bytes.
+    d->m_lookahead_pos += len_to_move;
+    MZ_ASSERT(d->m_lookahead_size >= len_to_move);
+    d->m_lookahead_size -= len_to_move;
+    d->m_dict_size = MZ_MIN(d->m_dict_size + len_to_move, TDEFL_LZ_DICT_SIZE);
+    // Check if it's time to flush the current LZ codes to the internal output buffer.
+    if ( (d->m_pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) ||
+         ( (d->m_total_lz_bytes > 31*1024) && (((((mz_uint)(d->m_pLZ_code_buf - d->m_lz_code_buf) * 115) >> 7) >= d->m_total_lz_bytes) || (d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS))) )
+    {
+      int n;
+      d->m_pSrc = pSrc; d->m_src_buf_left = src_buf_left;
+      if ((n = tdefl_flush_block(d, 0)) != 0)
+        return (n < 0) ? MZ_FALSE : MZ_TRUE;
+    }
+  }
+
+  d->m_pSrc = pSrc; d->m_src_buf_left = src_buf_left;
+  return MZ_TRUE;
+}
+
+// Reports progress back to the caller of tdefl_compress():
+//  - stores the number of input bytes consumed through m_pIn_buf_size (when set);
+//  - copies as much pending staged output as fits into the caller's buffer and stores the
+//    total bytes written through m_pOut_buf_size (when set).
+// Returns TDEFL_STATUS_DONE once the stream is finished and nothing is pending,
+// otherwise TDEFL_STATUS_OKAY.
+static tdefl_status tdefl_flush_output_buffer(tdefl_compressor *d)
+{
+  if (d->m_pIn_buf_size != NULL)
+    *d->m_pIn_buf_size = d->m_pSrc - (const mz_uint8 *)d->m_pIn_buf;
+
+  if (d->m_pOut_buf_size != NULL)
+  {
+    const size_t room = *d->m_pOut_buf_size - d->m_out_buf_ofs;
+    const size_t pending = d->m_output_flush_remaining;
+    const size_t count = MZ_MIN(room, pending);
+
+    memcpy((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs, d->m_output_buf + d->m_output_flush_ofs, count);
+
+    d->m_out_buf_ofs += count;
+    d->m_output_flush_ofs += (mz_uint)count;
+    d->m_output_flush_remaining -= (mz_uint)count;
+
+    *d->m_pOut_buf_size = d->m_out_buf_ofs;
+  }
+
+  if (d->m_finished && (d->m_output_flush_remaining == 0))
+    return TDEFL_STATUS_DONE;
+  return TDEFL_STATUS_OKAY;
+}
+
+// Streaming compression entry point. Consumes input from pIn_buf and produces output either
+// through the callback given to tdefl_init() or into pOut_buf.
+//   pIn_buf_size  - in: bytes available at pIn_buf (may be NULL, treated as 0);
+//                   out: bytes consumed, or 0 on TDEFL_STATUS_BAD_PARAM.
+//   pOut_buf_size - in: capacity of pOut_buf; out: bytes written, or 0 on TDEFL_STATUS_BAD_PARAM.
+//   flush         - flush mode; once TDEFL_FINISH has been passed, every later call must pass it too.
+// Returns TDEFL_STATUS_BAD_PARAM on invalid arguments or state, the stored previous status when a
+// compression routine fails, and otherwise the status from tdefl_flush_output_buffer().
+tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf, size_t *pIn_buf_size, void *pOut_buf, size_t *pOut_buf_size, tdefl_flush flush)
+{
+  if (!d)
+  {
+    if (pIn_buf_size) *pIn_buf_size = 0;
+    if (pOut_buf_size) *pOut_buf_size = 0;
+    return TDEFL_STATUS_BAD_PARAM;
+  }
+
+  // Record this call's buffers in the compressor so the helper routines can reach them.
+  d->m_pIn_buf = pIn_buf; d->m_pIn_buf_size = pIn_buf_size;
+  d->m_pOut_buf = pOut_buf; d->m_pOut_buf_size = pOut_buf_size;
+  d->m_pSrc = (const mz_uint8 *)(pIn_buf); d->m_src_buf_left = pIn_buf_size ? *pIn_buf_size : 0;
+  d->m_out_buf_ofs = 0;
+  d->m_flush = flush;
+
+  // Reject: a callback together with an output buffer (or neither of them), a previous
+  // non-OKAY status, a non-FINISH call after FINISH was requested, or a non-zero size with a NULL buffer.
+  if ( ((d->m_pPut_buf_func != NULL) == ((pOut_buf != NULL) || (pOut_buf_size != NULL))) || (d->m_prev_return_status != TDEFL_STATUS_OKAY) ||
+        (d->m_wants_to_finish && (flush != TDEFL_FINISH)) || (pIn_buf_size && *pIn_buf_size && !pIn_buf) || (pOut_buf_size && *pOut_buf_size && !pOut_buf) )
+  {
+    if (pIn_buf_size) *pIn_buf_size = 0;
+    if (pOut_buf_size) *pOut_buf_size = 0;
+    return (d->m_prev_return_status = TDEFL_STATUS_BAD_PARAM);
+  }
+  d->m_wants_to_finish |= (flush == TDEFL_FINISH);
+
+  // Output staged by an earlier call must be drained before any new input is consumed.
+  if ((d->m_output_flush_remaining) || (d->m_finished))
+    return (d->m_prev_return_status = tdefl_flush_output_buffer(d));
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+  // The fast path is taken only for one probe, greedy parsing, and no filter / raw-block / RLE flags.
+  if (((d->m_flags & TDEFL_MAX_PROBES_MASK) == 1) &&
+      ((d->m_flags & TDEFL_GREEDY_PARSING_FLAG) != 0) &&
+      ((d->m_flags & (TDEFL_FILTER_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS | TDEFL_RLE_MATCHES)) == 0))
+  {
+    if (!tdefl_compress_fast(d))
+      return d->m_prev_return_status;
+  }
+  else
+#endif // #if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+  {
+    if (!tdefl_compress_normal(d))
+      return d->m_prev_return_status;
+  }
+
+  // Fold the input consumed by this call into the running Adler-32.
+  if ((d->m_flags & (TDEFL_WRITE_ZLIB_HEADER | TDEFL_COMPUTE_ADLER32)) && (pIn_buf))
+    d->m_adler32 = (mz_uint32)mz_adler32(d->m_adler32, (const mz_uint8 *)pIn_buf, d->m_pSrc - (const mz_uint8 *)pIn_buf);
+
+  // A flush was requested and all input has been parsed with nothing pending: emit the closing block.
+  if ((flush) && (!d->m_lookahead_size) && (!d->m_src_buf_left) && (!d->m_output_flush_remaining))
+  {
+    if (tdefl_flush_block(d, flush) < 0)
+      return d->m_prev_return_status;
+    d->m_finished = (flush == TDEFL_FINISH);
+    // A full flush also clears the hash tables and the dictionary size.
+    if (flush == TDEFL_FULL_FLUSH) { MZ_CLEAR_OBJ(d->m_hash); MZ_CLEAR_OBJ(d->m_next); d->m_dict_size = 0; }
+  }
+
+  return (d->m_prev_return_status = tdefl_flush_output_buffer(d));
+}
+
+// Convenience wrapper around tdefl_compress() for compressors that deliver output through
+// the put-buf callback (asserted): compresses in_buf_size bytes with no output buffer.
+tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf, size_t in_buf_size, tdefl_flush flush)
+{
+  size_t in_bytes = in_buf_size;
+
+  MZ_ASSERT(d->m_pPut_buf_func);
+
+  return tdefl_compress(d, pIn_buf, &in_bytes, NULL, NULL, flush);
+}
+
+// Puts a compressor into its initial state.
+//   pPut_buf_func / pPut_buf_user - optional output callback and its user pointer.
+//   flags - the low 12 bits give the probe count; the remaining bits are TDEFL_* option flags.
+// The hash table is cleared unless TDEFL_NONDETERMINISTIC_PARSING_FLAG is set.
+// Always returns TDEFL_STATUS_OKAY.
+tdefl_status tdefl_init(tdefl_compressor *d, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags)
+{
+  const int probe_bits = flags & 0xFFF;
+
+  // Output callback and user-selected options.
+  d->m_pPut_buf_func = pPut_buf_func;
+  d->m_pPut_buf_user = pPut_buf_user;
+  d->m_flags = (mz_uint)(flags);
+  d->m_greedy_parsing = (flags & TDEFL_GREEDY_PARSING_FLAG) != 0;
+  d->m_max_probes[0] = 1 + (probe_bits + 2) / 3;
+  d->m_max_probes[1] = 1 + ((probe_bits >> 2) + 2) / 3;
+
+  if (!(flags & TDEFL_NONDETERMINISTIC_PARSING_FLAG))
+  {
+    MZ_CLEAR_OBJ(d->m_hash);
+  }
+
+  // Dictionary / lookahead bookkeeping and the bit buffer.
+  d->m_lookahead_pos = 0;
+  d->m_lookahead_size = 0;
+  d->m_dict_size = 0;
+  d->m_total_lz_bytes = 0;
+  d->m_lz_code_buf_dict_pos = 0;
+  d->m_bits_in = 0;
+  d->m_bit_buffer = 0;
+
+  // Output staging and stream status.
+  d->m_output_flush_ofs = 0;
+  d->m_output_flush_remaining = 0;
+  d->m_finished = 0;
+  d->m_block_index = 0;
+  d->m_wants_to_finish = 0;
+  d->m_pOutput_buf = d->m_output_buf;
+  d->m_pOutput_buf_end = d->m_output_buf;
+  d->m_prev_return_status = TDEFL_STATUS_OKAY;
+
+  // LZ code buffer: byte 0 is the first flags byte, codes start at byte 1.
+  d->m_pLZ_flags = d->m_lz_code_buf;
+  d->m_pLZ_code_buf = d->m_lz_code_buf + 1;
+  d->m_num_flags_left = 8;
+
+  // Lazy-parsing state and checksum seed.
+  d->m_saved_match_dist = 0;
+  d->m_saved_match_len = 0;
+  d->m_saved_lit = 0;
+  d->m_adler32 = 1;
+
+  // Per-call buffers; set again on every tdefl_compress() call.
+  d->m_pIn_buf = NULL;
+  d->m_pOut_buf = NULL;
+  d->m_pIn_buf_size = NULL;
+  d->m_pOut_buf_size = NULL;
+  d->m_flush = TDEFL_NO_FLUSH;
+  d->m_pSrc = NULL;
+  d->m_src_buf_left = 0;
+  d->m_out_buf_ofs = 0;
+
+  memset(&d->m_huff_count[0][0], 0, sizeof(d->m_huff_count[0][0]) * TDEFL_MAX_HUFF_SYMBOLS_0);
+  memset(&d->m_huff_count[1][0], 0, sizeof(d->m_huff_count[1][0]) * TDEFL_MAX_HUFF_SYMBOLS_1);
+
+  return TDEFL_STATUS_OKAY;
+}
+
+// Returns the status stored by the most recent compressor call.
+tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d)
+{
+  const tdefl_status last_status = d->m_prev_return_status;
+  return last_status;
+}
+
+// Returns the running Adler-32 value held by the compressor.
+mz_uint32 tdefl_get_adler32(tdefl_compressor *d)
+{
+  const mz_uint32 checksum = d->m_adler32;
+  return checksum;
+}
+
+// Compresses a whole memory block in one call, delivering output through pPut_buf_func.
+// A temporary compressor is heap-allocated and released before returning.
+// Returns MZ_FALSE when the callback is missing, when buf_len is non-zero with a NULL pBuf,
+// on allocation failure, or when compression does not end with TDEFL_STATUS_DONE.
+mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len, tdefl_put_buf_func_ptr pPut_buf_func, void *pPut_buf_user, int flags)
+{
+  tdefl_compressor *pComp;
+  mz_bool succeeded = MZ_FALSE;
+
+  if (!pPut_buf_func) return MZ_FALSE;
+  if ((buf_len) && (!pBuf)) return MZ_FALSE;
+
+  pComp = (tdefl_compressor*)MZ_MALLOC(sizeof(tdefl_compressor));
+  if (!pComp) return MZ_FALSE;
+
+  if (tdefl_init(pComp, pPut_buf_func, pPut_buf_user, flags) == TDEFL_STATUS_OKAY)
+  {
+    succeeded = (tdefl_compress_buffer(pComp, pBuf, buf_len, TDEFL_FINISH) == TDEFL_STATUS_DONE);
+  }
+
+  MZ_FREE(pComp);
+  return succeeded;
+}
+
+// Output sink used with tdefl_output_buffer_putter().
+typedef struct
+{
+  // m_size: bytes written so far; m_capacity: bytes available at m_pBuf.
+  size_t m_size, m_capacity;
+  // Destination storage; the putter replaces it via MZ_REALLOC when it grows the buffer.
+  mz_uint8 *m_pBuf;
+  // When false, the putter fails instead of growing the buffer.
+  mz_bool m_expandable;
+} tdefl_output_buffer;
+
+// Put-buf callback that appends len bytes from pBuf to the tdefl_output_buffer passed as pUser.
+// When the data does not fit, an expandable buffer is grown by doubling (128 bytes minimum);
+// a fixed buffer, or a failed reallocation, makes the call return MZ_FALSE with the buffer untouched.
+static mz_bool tdefl_output_buffer_putter(const void *pBuf, int len, void *pUser)
+{
+  tdefl_output_buffer *pOut = (tdefl_output_buffer *)pUser;
+  const size_t required = pOut->m_size + len;
+
+  if (required > pOut->m_capacity)
+  {
+    size_t grown = pOut->m_capacity;
+    mz_uint8 *pGrown_buf;
+
+    if (!pOut->m_expandable) return MZ_FALSE;
+
+    do
+    {
+      grown = MZ_MAX(128U, grown << 1U);
+    } while (required > grown);
+
+    pGrown_buf = (mz_uint8*)MZ_REALLOC(pOut->m_pBuf, grown);
+    if (!pGrown_buf) return MZ_FALSE;
+
+    pOut->m_pBuf = pGrown_buf;
+    pOut->m_capacity = grown;
+  }
+
+  memcpy((mz_uint8*)pOut->m_pBuf + pOut->m_size, pBuf, len);
+  pOut->m_size = required;
+  return MZ_TRUE;
+}
+
+// Compresses a memory block into a newly allocated heap buffer.
+//   pSrc_buf / src_buf_len - input data.
+//   pOut_len - required; receives the compressed size, or 0 on failure.
+//   flags    - tdefl compression flags, passed through to tdefl_compress_mem_to_output().
+// Returns the buffer (grown through MZ_REALLOC; the caller releases it), or NULL on failure.
+void *tdefl_compress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len, size_t *pOut_len, int flags)
+{
+  tdefl_output_buffer out_buf; MZ_CLEAR_OBJ(out_buf);
+  if (!pOut_len) return NULL;
+  *pOut_len = 0;
+  out_buf.m_expandable = MZ_TRUE;
+  if (!tdefl_compress_mem_to_output(pSrc_buf, src_buf_len, tdefl_output_buffer_putter, &out_buf, flags))
+  {
+    // The putter may already have allocated storage before the failure; release it so it is not leaked.
+    MZ_FREE(out_buf.m_pBuf);
+    return NULL;
+  }
+  *pOut_len = out_buf.m_size; return out_buf.m_pBuf;
+}
+
+// Compresses a memory block into a caller-supplied, fixed-size buffer.
+// Returns the number of bytes written, or 0 when pOut_buf is NULL or compression fails
+// (the output buffer is not expandable, so running out of room is a failure).
+size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len, const void *pSrc_buf, size_t src_buf_len, int flags)
+{
+  tdefl_output_buffer fixed_buf;
+
+  if (!pOut_buf) return 0;
+
+  MZ_CLEAR_OBJ(fixed_buf);
+  fixed_buf.m_pBuf = (mz_uint8*)pOut_buf;
+  fixed_buf.m_capacity = out_buf_len;
+
+  return tdefl_compress_mem_to_output(pSrc_buf, src_buf_len, tdefl_output_buffer_putter, &fixed_buf, flags) ? fixed_buf.m_size : 0;
+}
+
+#ifndef MINIZ_NO_ZLIB_APIS
+static const mz_uint s_tdefl_num_probes[11] = { 0, 1, 6, 32,  16, 32, 128, 256,  512, 768, 1500 };
+
+// Translates zlib-style parameters into tdefl compression flags.
+//   level       - may range over [0,10]; 10 is a "hidden" maximum that trades throughput for a bit
+//                 more compression. Negative values use the MZ_DEFAULT_LEVEL probe count.
+//   window_bits - a positive value requests a zlib header.
+//   strategy    - MZ_FILTERED, MZ_HUFFMAN_ONLY, MZ_FIXED or MZ_RLE adjust the flags; ignored when level is 0.
+mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits, int strategy)
+{
+  const int probe_index = (level < 0) ? MZ_DEFAULT_LEVEL : MZ_MIN(10, level);
+  mz_uint comp_flags = s_tdefl_num_probes[probe_index];
+
+  if (level <= 3) comp_flags |= TDEFL_GREEDY_PARSING_FLAG;
+  if (window_bits > 0) comp_flags |= TDEFL_WRITE_ZLIB_HEADER;
+
+  // Level 0 forces raw blocks and takes precedence over any strategy.
+  if (level == 0)
+  {
+    comp_flags |= TDEFL_FORCE_ALL_RAW_BLOCKS;
+    return comp_flags;
+  }
+
+  if (strategy == MZ_FILTERED)
+  {
+    comp_flags |= TDEFL_FILTER_MATCHES;
+  }
+  else if (strategy == MZ_HUFFMAN_ONLY)
+  {
+    comp_flags &= ~TDEFL_MAX_PROBES_MASK;
+  }
+  else if (strategy == MZ_FIXED)
+  {
+    comp_flags |= TDEFL_FORCE_ALL_STATIC_BLOCKS;
+  }
+  else if (strategy == MZ_RLE)
+  {
+    comp_flags |= TDEFL_RLE_MATCHES;
+  }
+
+  return comp_flags;
+}
+#endif //MINIZ_NO_ZLIB_APIS
+
+#ifdef _MSC_VER
+#pragma warning (push)
+#pragma warning (disable:4204) // nonstandard extension used : non-constant aggregate initializer (also supported by GNU C and C99, so no big deal)
+#endif
+
+// Simple PNG writer function by Alex Evans, 2011. Released into the public domain: https://gist.github.com/908299, more context at
+// http://altdevblogaday.org/2011/04/06/a-smaller-jpg-encoder/.
+// This is actually a modification of Alex's original code so PNG files generated by this function pass pngcheck.
+// Encodes an 8-bit-per-channel image as a complete PNG file held in a heap block.
+//   pImage    - h rows of w pixels, num_chans bytes per pixel, rows stored back to back.
+//   w, h      - image dimensions; must be positive (PNG does not allow zero).
+//   num_chans - 1..4; selects the PNG colour type through the chans[] table.
+//   pLen_out  - required; receives the file size in bytes, or 0 on failure.
+//   level     - compression level, clamped to 10.
+//   flip      - when set, rows are written in reverse order.
+// Returns the file image (allocated through MZ_MALLOC/MZ_REALLOC; the caller releases it),
+// or NULL on invalid arguments, allocation failure or compression failure.
+void *tdefl_write_image_to_png_file_in_memory_ex(const void *pImage, int w, int h, int num_chans, size_t *pLen_out, mz_uint level, mz_bool flip)
+{
+  // Using a local copy of this array here in case MINIZ_NO_ZLIB_APIS was defined.
+  static const mz_uint s_tdefl_png_num_probes[11] = { 0, 1, 6, 32,  16, 32, 128, 256,  512, 768, 1500 };
+  static const mz_uint8 s_filter_none = 0;
+  tdefl_compressor *pComp; tdefl_output_buffer out_buf; int i, y, z; size_t bpl; mz_uint32 c;
+  if (!pLen_out) return NULL;
+  *pLen_out = 0;
+  // Reject arguments that would read outside chans[] or produce a negative row size.
+  if ((!pImage) || (w <= 0) || (h <= 0) || (num_chans < 1) || (num_chans > 4)) return NULL;
+  // Row size and buffer estimate are computed in size_t so large images cannot overflow int.
+  bpl = (size_t)w * (size_t)num_chans;
+  pComp = (tdefl_compressor *)MZ_MALLOC(sizeof(tdefl_compressor));
+  if (!pComp) return NULL;
+  MZ_CLEAR_OBJ(out_buf); out_buf.m_expandable = MZ_TRUE; out_buf.m_capacity = 57 + MZ_MAX((size_t)64, (1 + bpl) * (size_t)h);
+  if (NULL == (out_buf.m_pBuf = (mz_uint8*)MZ_MALLOC(out_buf.m_capacity))) { MZ_FREE(pComp); return NULL; }
+  // write dummy header (41 placeholder bytes, overwritten with the real header below)
+  for (z = 41; z; --z) tdefl_output_buffer_putter(&z, 1, &out_buf);
+  // compress image data: each scanline is preceded by a filter-type byte of 0
+  tdefl_init(pComp, tdefl_output_buffer_putter, &out_buf, s_tdefl_png_num_probes[MZ_MIN(10, level)] | TDEFL_WRITE_ZLIB_HEADER);
+  for (y = 0; y < h; ++y)
+  {
+    const size_t row = (size_t)(flip ? (h - 1 - y) : y);
+    tdefl_compress_buffer(pComp, &s_filter_none, 1, TDEFL_NO_FLUSH);
+    tdefl_compress_buffer(pComp, (const mz_uint8*)pImage + row * bpl, bpl, TDEFL_NO_FLUSH);
+  }
+  // A failure in any of the calls above is sticky, so it surfaces here as a status other than DONE.
+  if (tdefl_compress_buffer(pComp, NULL, 0, TDEFL_FINISH) != TDEFL_STATUS_DONE) { MZ_FREE(pComp); MZ_FREE(out_buf.m_pBuf); return NULL; }
+  // write real header
+  *pLen_out = out_buf.m_size-41;
+  {
+    static const mz_uint8 chans[] = {0x00, 0x00, 0x04, 0x02, 0x06};
+    // Signature, IHDR chunk (width and height as full 32-bit big-endian values), then the IDAT length and tag.
+    mz_uint8 pnghdr[41]={0x89,0x50,0x4e,0x47,0x0d,0x0a,0x1a,0x0a,0x00,0x00,0x00,0x0d,0x49,0x48,0x44,0x52,
+      (mz_uint8)(w>>24),(mz_uint8)(w>>16),(mz_uint8)(w>>8),(mz_uint8)w,(mz_uint8)(h>>24),(mz_uint8)(h>>16),(mz_uint8)(h>>8),(mz_uint8)h,8,chans[num_chans],0,0,0,0,0,0,0,
+      (mz_uint8)(*pLen_out>>24),(mz_uint8)(*pLen_out>>16),(mz_uint8)(*pLen_out>>8),(mz_uint8)*pLen_out,0x49,0x44,0x41,0x54};
+    // IHDR CRC covers the chunk type and data (17 bytes) and is stored big-endian at offset 29.
+    c=(mz_uint32)mz_crc32(MZ_CRC32_INIT,pnghdr+12,17); for (i=0; i<4; ++i, c<<=8) ((mz_uint8*)(pnghdr+29))[i]=(mz_uint8)(c>>24);
+    memcpy(out_buf.m_pBuf, pnghdr, 41);
+  }
+  // write footer (IDAT CRC-32, followed by IEND chunk)
+  if (!tdefl_output_buffer_putter("\0\0\0\0\0\0\0\0\x49\x45\x4e\x44\xae\x42\x60\x82", 16, &out_buf)) { *pLen_out = 0; MZ_FREE(pComp); MZ_FREE(out_buf.m_pBuf); return NULL; }
+  c = (mz_uint32)mz_crc32(MZ_CRC32_INIT,out_buf.m_pBuf+41-4, *pLen_out+4); for (i=0; i<4; ++i, c<<=8) (out_buf.m_pBuf+out_buf.m_size-16)[i] = (mz_uint8)(c >> 24);
+  // compute final size of file, grab compressed data buffer and return
+  *pLen_out += 57; MZ_FREE(pComp); return out_buf.m_pBuf;
+}
+// Writes a PNG to memory at the default compression level, without flipping the rows.
+void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h, int num_chans, size_t *pLen_out)
+{
+  // Level 6 matches TDEFL_DEFAULT_MAX_PROBES / MZ_DEFAULT_LEVEL; the literal is used because
+  // MZ_DEFAULT_LEVEL may be unavailable when the zlib APIs are #defined out.
+  const mz_uint default_level = 6;
+
+  return tdefl_write_image_to_png_file_in_memory_ex(pImage, w, h, num_chans, pLen_out, default_level, MZ_FALSE);
+}
+
+#ifdef _MSC_VER
+#pragma warning (pop)
+#endif
+
+} // namespace buminiz
+
+#endif // MINIZ_HEADER_FILE_ONLY
+
diff --git a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
new file mode 100644
index 0000000000..596fc197e6
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.cpp
@@ -0,0 +1,564 @@
+// basisu_pvrtc1_4.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_pvrtc1_4.h"
+
+namespace basisu
+{
+#if 0
+	static const uint8_t g_pvrtc_5[32] = { 0,8,16,24,33,41,49,57,66,74,82,90,99,107,115,123,132,140,148,156,165,173,181,189,198,206,214,222,231,239,247,255 };
+	static const uint8_t g_pvrtc_4[16] = { 0,16,33,49,66,82,99,115,140,156,173,189,206,222,239,255 };
+	static const uint8_t g_pvrtc_3[8] = { 0,33,74,107,148,181,222,255 };
+	static const uint8_t g_pvrtc_alpha[9] = { 0,34,68,102,136,170,204,238,255 };
+#endif
+
+	static const uint8_t g_pvrtc_5_nearest[256] = { 0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,16,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,20,20,20,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23,24,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31,31,31,31 };
+	static const uint8_t g_pvrtc_4_nearest[256] = { 0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15 };
+#if 0
+	static const uint8_t g_pvrtc_3_nearest[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 };
+	static const uint8_t g_pvrtc_alpha_nearest[256] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8 };
+#endif
+
+#if 0
+	static const uint8_t g_pvrtc_5_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,
+		3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,
+		7,7,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,
+		11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,
+		15,15,15,15,16,16,16,16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,
+		19,19,19,19,19,20,20,20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,
+		23,23,23,23,23,23,24,24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,
+		27,27,27,27,27,27,27,28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31
+	};
+
+	static const uint8_t g_pvrtc_5_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,
+		4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,
+		8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,
+		12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,16,16,16,16,
+		16,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,20,20,20,
+		20,20,20,20,20,20,21,21,21,21,21,21,21,21,22,22,22,22,22,22,22,22,23,23,23,23,23,23,23,23,24,24,
+		24,24,24,24,24,24,24,25,25,25,25,25,25,25,25,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,28,
+		28,28,28,28,28,28,28,28,29,29,29,29,29,29,29,29,30,30,30,30,30,30,30,30,31,31,31,31,31,31,31,31
+	};
+
+	static const uint8_t g_pvrtc_4_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,
+		9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,
+		11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,
+		13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15
+	};
+
+	static const uint8_t g_pvrtc_4_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,
+		8,8,8,8,8,8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,10,10,10,
+		10,10,10,10,10,10,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,12,12,
+		12,12,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,14,
+		14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15
+	};
+
+	static const uint8_t g_pvrtc_3_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7
+	};
+
+	static const uint8_t g_pvrtc_3_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+	};
+
+	static const uint8_t g_pvrtc_alpha_floor[256] =
+	{
+		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+		0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8
+	};
+
+	static const uint8_t g_pvrtc_alpha_ceil[256] =
+	{
+		0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+		1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+		2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+		3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,
+		4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+		5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+		6,6,6,6,6,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+	};
+#endif
+
+	// Converts the 2D position (x, y) inside a width x height grid into a swizzled
+	// linear index. The low bits of y and x are interleaved (y in the even bit
+	// positions, x in the odd ones) for as many bits as the smaller dimension has;
+	// the remaining high bits of the coordinate along the larger dimension are
+	// placed above the interleaved bits.
+	// Both dimensions must be powers of two and (x, y) must lie inside the grid.
+	uint32_t pvrtc4_swizzle_uv(uint32_t width, uint32_t height, uint32_t x, uint32_t y)
+	{
+		assert((x < width) && (y < height) && basisu::is_pow2(height) && basisu::is_pow2(width));
+
+		// The smaller dimension limits how many bits are interleaved; the coordinate
+		// of the other axis supplies the leftover high bits.
+		const bool wider_than_tall = (height < width);
+		const uint32_t smaller_dim = wider_than_tall ? height : width;
+		const uint32_t long_axis_coord = wider_than_tall ? x : y;
+
+		uint32_t result = 0;
+		uint32_t num_interleaved = 0;
+		uint32_t src_mask = 1;
+		uint32_t dst_mask = 1;
+
+		while (src_mask < smaller_dim)
+		{
+			if (y & src_mask)
+				result |= dst_mask;
+			if (x & src_mask)
+				result |= (dst_mask << 1);
+
+			src_mask <<= 1;
+			dst_mask <<= 2;
+			++num_interleaved;
+		}
+
+		// Append whatever is left of the long-axis coordinate above the interleaved bits.
+		const uint32_t high_bits = long_axis_coord >> num_interleaved;
+		result |= (high_bits << (2 * num_interleaved));
+
+		return result;
+	}
+
+	// Decodes one of the two 16-bit endpoints packed into m_endpoints.
+	// endpoint_index: 0 selects the low 16 bits, 1 the high 16 bits.
+	// unpack: when false the raw field values are returned (opaque endpoints report
+	// alpha 7); when true every component is expanded to 8 bits by bit replication
+	// (opaque endpoints report alpha 255).
+	// Bit 15 of the packed value selects the opaque (554 / 555) or translucent
+	// (4433 / 4443) layout. Endpoint 0 has one bit less of blue than endpoint 1
+	// because its lowest bit is not part of the color.
+	color_rgba pvrtc4_block::get_endpoint(uint32_t endpoint_index, bool unpack) const
+	{
+		assert(endpoint_index < 2);
+		const uint32_t packed = m_endpoints >> (endpoint_index * 16);
+		const bool is_first = (endpoint_index == 0);
+
+		uint32_t r, g, b, a;
+		if (packed & 0x8000)
+		{
+			// opaque 554 or 555: red and green sit in the same bits for both endpoints
+			r = (packed >> 10) & 31;
+			g = (packed >> 5) & 31;
+
+			if (is_first)
+			{
+				b = (packed >> 1) & 15;
+				if (unpack)
+					b = (b << 1) | (b >> 3); // 4 -> 5 bits
+			}
+			else
+			{
+				b = packed & 31;
+			}
+
+			a = unpack ? 255 : 7;
+		}
+		else
+		{
+			// translucent 4433 or 4443
+			a = (packed >> 12) & 7;
+			r = (packed >> 8) & 15;
+			g = (packed >> 4) & 15;
+			b = is_first ? ((packed >> 1) & 7) : (packed & 15);
+
+			if (unpack)
+			{
+				// 3-bit alpha -> 4 bits -> replicated into both nibbles
+				a <<= 1;
+				a |= (a << 4);
+
+				// bring every color component up to 5 bits
+				r = (r << 1) | (r >> 3);
+				g = (g << 1) | (g >> 3);
+				if (is_first)
+					b = (b << 2) | (b >> 1); // 3 -> 5 bits
+				else
+					b = (b << 1) | (b >> 3); // 4 -> 5 bits
+			}
+		}
+
+		if (unpack)
+		{
+			// 5 -> 8 bits
+			r = (r << 3) | (r >> 2);
+			g = (g << 3) | (g >> 2);
+			b = (b << 3) | (b >> 2);
+		}
+
+		assert((r < 256) && (g < 256) && (b < 256) && (a < 256));
+
+		return color_rgba(r, g, b, a);
+	}
+
+	// Decodes one of the two packed endpoints and normalizes it to 5 bits per
+	// color component and 4 bits of alpha, whatever precision it was stored with.
+	// endpoint_index: 0 selects the low 16 bits of m_endpoints, 1 the high 16 bits.
+	// Opaque endpoints (bit 15 set) always report alpha 15.
+	color_rgba pvrtc4_block::get_endpoint_5554(uint32_t endpoint_index) const
+	{
+		assert(endpoint_index < 2);
+		const uint32_t packed = m_endpoints >> (endpoint_index * 16);
+		const bool is_first = (endpoint_index == 0);
+
+		uint32_t r, g, b, a;
+		if (packed & 0x8000)
+		{
+			// opaque 554 or 555
+			r = (packed >> 10) & 31;
+			g = (packed >> 5) & 31;
+
+			if (is_first)
+			{
+				const uint32_t b4 = (packed >> 1) & 15;
+				b = (b4 << 1) | (b4 >> 3);
+			}
+			else
+			{
+				b = packed & 31;
+			}
+
+			a = 15;
+		}
+		else
+		{
+			// translucent 4433 or 4443
+			const uint32_t a3 = (packed >> 12) & 7;
+			const uint32_t r4 = (packed >> 8) & 15;
+			const uint32_t g4 = (packed >> 4) & 15;
+
+			a = a3 << 1;
+			r = (r4 << 1) | (r4 >> 3);
+			g = (g4 << 1) | (g4 >> 3);
+
+			if (is_first)
+			{
+				const uint32_t b3 = (packed >> 1) & 7;
+				b = (b3 << 2) | (b3 >> 1);
+			}
+			else
+			{
+				const uint32_t b4 = packed & 15;
+				b = (b4 << 1) | (b4 >> 3);
+			}
+		}
+
+		assert((r < 32) && (g < 32) && (b < 32) && (a < 16));
+
+		return color_rgba(r, g, b, a);
+	}
+
+	// Fills pColors[0..3] with the four colors selectable by the modulation value
+	// of pixel (x, y): pColors[0] and pColors[3] are the interpolated endpoint 0
+	// and endpoint 1 colors of the four surrounding blocks, pColors[1] and
+	// pColors[2] are blends of those two.
+	// Returns true when the block containing the pixel uses transparent
+	// modulation; in that mode both middle entries are the midpoint and
+	// pColors[2] has its alpha forced to 0.
+	bool pvrtc4_image::get_interpolated_colors(uint32_t x, uint32_t y, color_rgba* pColors) const
+	{
+		assert((x < m_width) && (y < m_height));
+
+		// Locate the 2x2 group of blocks whose endpoints are interpolated for this
+		// pixel, wrapping around the image edges.
+		const int raw_bx = (static_cast<int>(x) - 2) >> 2;
+		const int raw_by = (static_cast<int>(y) - 2) >> 2;
+
+		const int block_x0 = posmod(raw_bx, m_block_width);
+		const int block_x1 = posmod(raw_bx + 1, m_block_width);
+		const int block_y0 = posmod(raw_by, m_block_height);
+		const int block_y1 = posmod(raw_by + 1, m_block_height);
+
+		const pvrtc4_block& blk00 = m_blocks(block_x0, block_y0);
+		const pvrtc4_block& blk10 = m_blocks(block_x1, block_y0);
+		const pvrtc4_block& blk01 = m_blocks(block_x0, block_y1);
+		const pvrtc4_block& blk11 = m_blocks(block_x1, block_y1);
+
+		pColors[0] = interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0));
+		pColors[3] = interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1));
+
+		const bool transparent_mode = get_block_uses_transparent_modulation(x >> 2, y >> 2);
+
+		if (!transparent_mode)
+		{
+			// 5/8 : 3/8 and 3/8 : 5/8 blends
+			for (uint32_t comp = 0; comp < 4; comp++)
+			{
+				pColors[1][comp] = static_cast<uint8_t>((pColors[0][comp] * 5 + pColors[3][comp] * 3) / 8);
+				pColors[2][comp] = static_cast<uint8_t>((pColors[0][comp] * 3 + pColors[3][comp] * 5) / 8);
+			}
+			return false;
+		}
+
+		// Transparent modulation: both middle entries are the midpoint.
+		for (uint32_t comp = 0; comp < 4; comp++)
+		{
+			const uint32_t mid = (pColors[0][comp] + pColors[3][comp]) / 2;
+			pColors[1][comp] = static_cast<uint8_t>(mid);
+			pColors[2][comp] = static_cast<uint8_t>(mid);
+		}
+		pColors[2][3] = 0;
+		return true;
+	}
+		
+	// Returns the color of pixel (x, y) as it would decode with modulation value m.
+	// m == 0 yields the interpolated endpoint 0 color and m == 3 the interpolated
+	// endpoint 1 color. Any other m yields a blend of the two: in blocks with
+	// transparent modulation the midpoint (with alpha forced to 0 when m == 2),
+	// otherwise a 3/8 : 5/8 blend for m == 2 and a 5/8 : 3/8 blend for the rest.
+	color_rgba pvrtc4_image::get_pixel(uint32_t x, uint32_t y, uint32_t m) const
+	{
+		assert((x < m_width) && (y < m_height));
+
+		// Locate the 2x2 group of blocks whose endpoints are interpolated for this
+		// pixel, wrapping around the image edges.
+		const int raw_bx = (static_cast<int>(x) - 2) >> 2;
+		const int raw_by = (static_cast<int>(y) - 2) >> 2;
+
+		const int block_x0 = posmod(raw_bx, m_block_width);
+		const int block_x1 = posmod(raw_bx + 1, m_block_width);
+		const int block_y0 = posmod(raw_by, m_block_height);
+		const int block_y1 = posmod(raw_by + 1, m_block_height);
+
+		const bool transparent_mode = get_block_uses_transparent_modulation(x >> 2, y >> 2);
+
+		const pvrtc4_block& blk00 = m_blocks(block_x0, block_y0);
+		const pvrtc4_block& blk10 = m_blocks(block_x1, block_y0);
+		const pvrtc4_block& blk01 = m_blocks(block_x0, block_y1);
+		const pvrtc4_block& blk11 = m_blocks(block_x1, block_y1);
+
+		// The two extreme modulation values need only one interpolated endpoint.
+		if (m == 0)
+			return interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0));
+		if (m == 3)
+			return interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1));
+
+		color_rgba l(interpolate(x, y, blk00.get_endpoint_5554(0), blk10.get_endpoint_5554(0), blk01.get_endpoint_5554(0), blk11.get_endpoint_5554(0)));
+		color_rgba h(interpolate(x, y, blk00.get_endpoint_5554(1), blk10.get_endpoint_5554(1), blk01.get_endpoint_5554(1), blk11.get_endpoint_5554(1)));
+
+		if (transparent_mode)
+			return color_rgba((l[0] + h[0]) / 2, (l[1] + h[1]) / 2, (l[2] + h[2]) / 2, (m == 2) ? 0 : (l[3] + h[3]) / 2);
+
+		if (m == 2)
+			return color_rgba((l[0] * 3 + h[0] * 5) / 8, (l[1] * 3 + h[1] * 5) / 8, (l[2] * 3 + h[2] * 5) / 8, (l[3] * 3 + h[3] * 5) / 8);
+
+		return color_rgba((l[0] * 5 + h[0] * 3) / 8, (l[1] * 5 + h[1] * 3) / 8, (l[2] * 5 + h[2] * 3) / 8, (l[3] * 5 + h[3] * 3) / 8);
+	}
+
+	// Copies the 3x3 neighborhood of blocks centered on block (bx, by), with
+	// wrap-around at the image edges, into dst[x + 1][y + 1].
+	static void save_block_neighborhood(const pvrtc4_image& img, uint32_t bx, uint32_t by, pvrtc4_block dst[3][3])
+	{
+		for (int y = -1; y <= 1; y++)
+		{
+			for (int x = -1; x <= 1; x++)
+			{
+				const uint32_t block_x = img.wrap_block_x(bx + x);
+				const uint32_t block_y = img.wrap_block_y(by + y);
+				dst[x + 1][y + 1] = img.get_blocks()(block_x, block_y);
+			}
+		}
+	}
+
+	// Writes a neighborhood captured by save_block_neighborhood() back into the image.
+	static void restore_block_neighborhood(pvrtc4_image& img, uint32_t bx, uint32_t by, const pvrtc4_block src[3][3])
+	{
+		for (int y = -1; y <= 1; y++)
+		{
+			for (int x = -1; x <= 1; x++)
+			{
+				const uint32_t block_x = img.wrap_block_x(bx + x);
+				const uint32_t block_y = img.wrap_block_y(by + y);
+				img.get_blocks()(block_x, block_y) = src[x + 1][y + 1];
+			}
+		}
+	}
+
+	// Quantizes two 8-bit RGB colors to the opaque endpoint precisions and stores
+	// them in blk: lo becomes endpoint 0 (5/5/4 bits), hi becomes endpoint 1 (5/5/5 bits).
+	static void set_opaque_endpoints_nearest(pvrtc4_block& blk, const color_rgba& lo, const color_rgba& hi)
+	{
+		color_rgba l1(0), h1(0);
+
+		l1[0] = g_pvrtc_5_nearest[lo[0]];
+		h1[0] = g_pvrtc_5_nearest[hi[0]];
+
+		l1[1] = g_pvrtc_5_nearest[lo[1]];
+		h1[1] = g_pvrtc_5_nearest[hi[1]];
+
+		// Endpoint 0 stores only 4 bits of blue, endpoint 1 stores 5.
+		l1[2] = g_pvrtc_4_nearest[lo[2]];
+		h1[2] = g_pvrtc_5_nearest[hi[2]];
+
+		l1[3] = 0;
+		h1[3] = 0;
+
+		blk.set_endpoint_raw(0, l1, true);
+		blk.set_endpoint_raw(1, h1, true);
+	}
+
+	// Tries to lower the error around block (bx, by) by replacing its two
+	// endpoints with a pair of opaque colors derived from the original image.
+	//
+	// The 7x7 pixel region visited here is the same one that
+	// remap_pixels_influenced_by_endpoint() and evaluate_1x1_endpoint_error()
+	// visit. Its pixels are split into two clusters by a few weighted k-means
+	// style passes; the two cluster colors are then tried as (endpoint 0,
+	// endpoint 1) in both orders, re-selecting the modulation values of the
+	// affected pixels each time.
+	//
+	// On return the image holds whichever of the three states (original, trial 0,
+	// trial 1) has the lowest error, and that error is returned. The 3x3 block
+	// neighborhood is saved and restored because remapping changes modulation
+	// values in neighboring blocks as well.
+	//
+	// orig_img: the source image being encoded.
+	// perceptual: forwarded to the error metric.
+	uint64_t pvrtc4_image::local_endpoint_optimization_opaque(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual)
+	{
+		uint64_t initial_error = evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false);
+		if (!initial_error)
+			return initial_error;
+
+		// Unweighted average color of the 7x7 region; used to seed the two clusters.
+		vec3F c_avg_orig(0);
+
+		for (int y = 0; y < 7; y++)
+		{
+			const uint32_t py = wrap_y(by * 4 + y - 1);
+			for (uint32_t x = 0; x < 7; x++)
+			{
+				const uint32_t px = wrap_x(bx * 4 + x - 1);
+
+				const color_rgba& c = orig_img(px, py);
+
+				c_avg_orig[0] += c[0];
+				c_avg_orig[1] += c[1];
+				c_avg_orig[2] += c[2];
+			}
+		}
+
+		c_avg_orig *= 1.0f / 49.0f;
+
+		// Seed the clusters slightly below and above the average.
+		vec3F quant_colors[2];
+		quant_colors[0].set(c_avg_orig);
+		quant_colors[0] -= vec3F(.0125f);
+
+		quant_colors[1].set(c_avg_orig);
+		quant_colors[1] += vec3F(.0125f);
+
+		float total_weight[2];
+
+		bool success = true;
+
+		for (uint32_t pass = 0; pass < 4; pass++)
+		{
+			vec3F new_colors[2] = { vec3F(0), vec3F(0) };
+			memset(total_weight, 0, sizeof(total_weight));
+
+			// Per-pixel weights over the 7x7 region, largest at the center.
+			static const float s_weights[7][7] =
+			{
+				{ 1.000000f, 1.637089f, 2.080362f, 2.242640f, 2.080362f, 1.637089f, 1.000000f },
+				{ 1.637089f, 2.414213f, 3.006572f, 3.242640f, 3.006572f, 2.414213f, 1.637089f },
+				{ 2.080362f, 3.006572f, 3.828426f, 4.242640f, 3.828426f, 3.006572f, 2.080362f },
+				{ 2.242640f, 3.242640f, 4.242640f, 5.000000f, 4.242640f, 3.242640f, 2.242640f },
+				{ 2.080362f, 3.006572f, 3.828426f, 4.242640f, 3.828426f, 3.006572f, 2.080362f },
+				{ 1.637089f, 2.414213f, 3.006572f, 3.242640f, 3.006572f, 2.414213f, 1.637089f },
+				{ 1.000000f, 1.637089f, 2.080362f, 2.242640f, 2.080362f, 1.637089f, 1.000000f }
+			};
+
+			for (int y = 0; y < 7; y++)
+			{
+				const uint32_t py = wrap_y(by * 4 + y - 1);
+				for (uint32_t x = 0; x < 7; x++)
+				{
+					const uint32_t px = wrap_x(bx * 4 + x - 1);
+
+					const color_rgba& orig_c = orig_img(px, py);
+
+					vec3F color(orig_c[0], orig_c[1], orig_c[2]);
+
+					// Assign the pixel to the closer cluster (ties go to cluster 0).
+					uint32_t c = quant_colors[0].squared_distance(color) > quant_colors[1].squared_distance(color);
+
+					const float weight = s_weights[y][x];
+					new_colors[c] += color * weight;
+
+					total_weight[c] += weight;
+				}
+			}
+
+			if (!total_weight[0] || !total_weight[1])
+			{
+				// One cluster is empty. Stop here instead of dividing by zero; the
+				// cluster colors are replaced by the average below, so further
+				// passes could not change the outcome.
+				success = false;
+				break;
+			}
+
+			quant_colors[0] = new_colors[0] / (float)total_weight[0];
+			quant_colors[1] = new_colors[1] / (float)total_weight[1];
+		}
+
+		if (!success)
+		{
+			quant_colors[0] = c_avg_orig;
+			quant_colors[1] = c_avg_orig;
+		}
+
+		vec4F colors[2] = { quant_colors[0], quant_colors[1] };
+
+		// Round to the nearest integer color.
+		colors[0] += vec3F(.5f);
+		colors[1] += vec3F(.5f);
+		color_rgba color_0((int)colors[0][0], (int)colors[0][1], (int)colors[0][2], 0);
+		color_rgba color_1((int)colors[1][0], (int)colors[1][1], (int)colors[1][2], 0);
+
+		// Remember the current state so it can be restored if neither trial helps.
+		pvrtc4_block cur_blocks[3][3];
+		save_block_neighborhood(*this, bx, by, cur_blocks);
+
+		// Trial 0: color_0 as endpoint 0, color_1 as endpoint 1.
+		set_opaque_endpoints_nearest(m_blocks(bx, by), color_0, color_1);
+
+		uint64_t e03_err_0 = remap_pixels_influenced_by_endpoint(bx, by, orig_img, perceptual, false);
+
+		pvrtc4_block blocks0[3][3];
+		save_block_neighborhood(*this, bx, by, blocks0);
+
+		// Trial 1: the same two colors with the endpoints swapped.
+		set_opaque_endpoints_nearest(m_blocks(bx, by), color_1, color_0);
+
+		uint64_t e03_err_1 = remap_pixels_influenced_by_endpoint(bx, by, orig_img, perceptual, false);
+
+		if (initial_error < basisu::minimum(e03_err_0, e03_err_1))
+		{
+			// Neither trial beats the starting point.
+			restore_block_neighborhood(*this, bx, by, cur_blocks);
+			return initial_error;
+		}
+		else if (e03_err_0 < e03_err_1)
+		{
+			restore_block_neighborhood(*this, bx, by, blocks0);
+			assert(e03_err_0 == evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false));
+			return e03_err_0;
+		}
+
+		// Trial 1 is at least as good as trial 0 and is already in place.
+		assert(e03_err_1 == evaluate_1x1_endpoint_error(bx, by, orig_img, perceptual, false));
+		return e03_err_1;
+	}
+
+} // basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
new file mode 100644
index 0000000000..db6985a439
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_pvrtc1_4.h
@@ -0,0 +1,457 @@
+// basisu_pvrtc1_4.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_gpu_texture.h"
+
+namespace basisu
+{
+	// Minimum width/height constants for the two PVRTC variants.
+	// NOTE(review): presumably minimum texture dimensions in pixels for PVRTC 2bpp
+	// (PVRTC2_*) and PVRTC 4bpp (PVRTC4_*) - confirm against the code that uses them.
+	enum 
+	{ 
+		PVRTC2_MIN_WIDTH = 16, 
+		PVRTC2_MIN_HEIGHT = 8, 
+		PVRTC4_MIN_WIDTH = 8, 
+		PVRTC4_MIN_HEIGHT = 8 
+	};
+	
+	// One 64-bit PVRTC 4bpp block: a 32-bit word holding sixteen 2-bit modulation
+	// values (one per pixel of the 4x4 block) and a 32-bit word holding two
+	// 16-bit packed endpoint colors. Bit 0 of m_endpoints selects the block's
+	// transparent modulation mode.
+	struct pvrtc4_block
+	{
+		uint32_t m_modulation;
+		uint32_t m_endpoints;
+
+		pvrtc4_block() : m_modulation(0), m_endpoints(0) { }
+
+		inline bool operator== (const pvrtc4_block& rhs) const
+		{
+			return (m_modulation == rhs.m_modulation) && (m_endpoints == rhs.m_endpoints);
+		}
+
+		// Resets both words to zero.
+		inline void clear()
+		{
+			m_modulation = 0;
+			m_endpoints = 0;
+		}
+
+		// True when bit 0 of m_endpoints is set.
+		inline bool get_block_uses_transparent_modulation() const
+		{
+			return (m_endpoints & 1) != 0;
+		}
+
+		// True when the top bit of the selected 16-bit endpoint is set.
+		inline bool is_endpoint_opaque(uint32_t endpoint_index) const
+		{
+			static const uint32_t s_bitmasks[2] = { 0x8000U, 0x80000000U };
+			return (m_endpoints & s_bitmasks[open_range_check(endpoint_index, 2U)]) != 0;
+		}
+
+		// Returns raw endpoint or 8888
+		color_rgba get_endpoint(uint32_t endpoint_index, bool unpack) const;
+		
+		// Returns the endpoint normalized to 5 bits per color component and 4 bits of alpha.
+		color_rgba get_endpoint_5554(uint32_t endpoint_index) const;
+		
+		// Number of bits stored for component c (0..3 = R, G, B, A) of the given
+		// endpoint, for the opaque or transparent endpoint layout.
+		static uint32_t get_component_precision_in_bits(uint32_t c, uint32_t endpoint_index, bool opaque_endpoint)
+		{
+			static const uint32_t s_comp_prec[4][4] =
+			{
+				// R0 G0 B0 A0      R1 G1 B1 A1
+				{  4, 4, 3, 3 }, {  4, 4, 4, 3 }, // transparent endpoint
+
+				{  5, 5, 4, 0 }, {  5, 5, 5, 0 }  // opaque endpoint
+			};
+			return s_comp_prec[open_range_check(endpoint_index, 2U) + (opaque_endpoint * 2)][open_range_check(c, 4U)];
+		}
+
+		// Same information as get_component_precision_in_bits(), returned for all
+		// four components at once.
+		static color_rgba get_color_precision_in_bits(uint32_t endpoint_index, bool opaque_endpoint)
+		{
+			static const color_rgba s_color_prec[4] =
+			{
+			   color_rgba(4, 4, 3, 3), color_rgba(4, 4, 4, 3), // transparent endpoint
+			   color_rgba(5, 5, 4, 0), color_rgba(5, 5, 5, 0)  // opaque endpoint
+			};
+			return s_color_prec[open_range_check(endpoint_index, 2U) + (opaque_endpoint * 2)];
+		}
+		
+		// Returns the 2-bit modulation value of pixel (x, y) inside the block.
+		inline uint32_t get_modulation(uint32_t x, uint32_t y) const
+		{
+			assert((x < 4) && (y < 4));
+			return (m_modulation >> ((y * 4 + x) * 2)) & 3;
+		}
+
+		// Stores the 2-bit modulation value s for pixel (x, y) inside the block.
+		inline void set_modulation(uint32_t x, uint32_t y, uint32_t s)
+		{
+			assert((x < 4) && (y < 4) && (s < 4));
+			uint32_t n = (y * 4 + x) * 2;
+			// The mask is built from an unsigned constant: n reaches 30 for the last
+			// pixel, and a signed 3 << 30 would shift into the sign bit.
+			m_modulation = (m_modulation & (~(3U << n))) | (s << n);
+			assert(get_modulation(x, y) == s);
+		}
+
+		// Scaled by 8
+		// Returns the four blend weights selectable by a modulation value, for
+		// the normal (false) or transparent (true) modulation mode.
+		inline const uint32_t* get_scaled_modulation_values(bool block_uses_transparent_modulation) const
+		{
+			static const uint32_t s_block_scales[2][4] = { { 0, 3, 5, 8 }, { 0, 4, 4, 8 } };
+			return s_block_scales[block_uses_transparent_modulation];
+		}
+
+		// Scaled by 8
+		// Blend weight of pixel (x, y) under this block's own modulation mode.
+		inline uint32_t get_scaled_modulation(uint32_t x, uint32_t y) const
+		{
+			return get_scaled_modulation_values(get_block_uses_transparent_modulation())[get_modulation(x, y)];
+		}
+
+		// Byte-swaps both 32-bit words.
+		inline void byte_swap()
+		{
+			m_modulation = byteswap32(m_modulation);
+			m_endpoints = byteswap32(m_endpoints);
+		}
+
+		// Packs the already-quantized color c into the selected endpoint.
+		// opaque endpoints:	554, 555
+		// transparent endpoints: 3443, 3444
+		// Endpoint 0 keeps the current modulation-mode bit (bit 0) in place of its
+		// lowest blue bit. The other endpoint's 16 bits are left untouched.
+		inline void set_endpoint_raw(uint32_t endpoint_index, const color_rgba& c, bool opaque_endpoint)
+		{
+			assert(endpoint_index < 2);
+			const uint32_t m = m_endpoints & 1;
+			uint32_t r = c[0], g = c[1], b = c[2], a = c[3];
+						
+			uint32_t packed;
+
+			if (opaque_endpoint)
+			{
+				if (!endpoint_index)
+				{
+					// 554
+					// 1RRRRRGGGGGBBBBM
+					assert((r < 32) && (g < 32) && (b < 16));
+					packed = 0x8000 | (r << 10) | (g << 5) | (b << 1) | m;
+				}
+				else
+				{
+					// 555
+					// 1RRRRRGGGGGBBBBB
+					assert((r < 32) && (g < 32) && (b < 32));
+					packed = 0x8000 | (r << 10) | (g << 5) | b;
+				}
+			}
+			else
+			{
+				if (!endpoint_index)
+				{
+					// 3443
+					// 0AAA RRRR GGGG BBBM
+					assert((r < 16) && (g < 16) && (b < 8) && (a < 8));
+					packed = (a << 12) | (r << 8) | (g << 4) | (b << 1) | m;
+				}
+				else
+				{
+					// 3444
+					// 0AAA RRRR GGGG BBBB
+					assert((r < 16) && (g < 16) && (b < 16) && (a < 8));
+					packed = (a << 12) | (r << 8) | (g << 4) | b;
+				}
+			}
+
+			assert(packed <= 0xFFFF);
+
+			if (endpoint_index)
+				m_endpoints = (m_endpoints & 0xFFFFU) | (packed << 16);
+			else
+				m_endpoints = (m_endpoints & 0xFFFF0000U) | packed;
+		}
+	};
+
+	// 2D array of PVRTC 4bpp blocks.
+	typedef vector2D<pvrtc4_block> pvrtc4_block_vector2D;
+
+	// Returns the swizzled linear index of position (XPos, YPos) inside an
+	// XSize x YSize grid. The definition asserts that both sizes are powers of
+	// two and that the position lies inside the grid.
+	uint32_t pvrtc4_swizzle_uv(uint32_t XSize, uint32_t YSize, uint32_t XPos, uint32_t YPos);
+
+	// A PVRTC 4bpp image: a 2D array of pvrtc4_block plus the pixel dimensions it
+	// represents. Offers per-pixel decoding, modulation selection against a
+	// reference image, and block swizzling helpers. Coordinates that step outside
+	// the image are wrapped around (see wrap_x() and friends).
+	class pvrtc4_image
+	{
+	public:
+		// Creates an empty (0x0) image.
+		inline pvrtc4_image() :
+			m_width(0), m_height(0), m_block_width(0), m_block_height(0), m_uses_alpha(false)
+		{
+		}
+
+		// Creates an image of width x height pixels.
+		inline pvrtc4_image(uint32_t width, uint32_t height) :
+			m_width(0), m_height(0), m_block_width(0), m_block_height(0), m_uses_alpha(false)
+		{
+			resize(width, height);
+		}
+
+		// Returns the image to the empty state and releases the blocks.
+		inline void clear()
+		{
+			m_width = 0;
+			m_height = 0;
+			m_block_width = 0;
+			m_block_height = 0;
+			m_blocks.clear();
+			m_uses_alpha = false;
+		}
+
+		// Changes the pixel dimensions; does nothing when they are already in effect.
+		// The block grid is sized to cover the image, rounding up to whole 4x4 blocks.
+		inline void resize(uint32_t width, uint32_t height)
+		{
+			const bool same_size = (width == m_width) && (height == m_height);
+			if (same_size)
+				return;
+
+			m_width = width;
+			m_height = height;
+
+			m_block_width = (width + 3) >> 2;
+			m_block_height = (height + 3) >> 2;
+
+			m_blocks.resize(m_block_width, m_block_height);
+		}
+
+		inline uint32_t get_width() const { return m_width; }
+		inline uint32_t get_height() const { return m_height; }
+
+		inline uint32_t get_block_width() const { return m_block_width; }
+		inline uint32_t get_block_height() const { return m_block_height; }
+
+		inline const pvrtc4_block_vector2D &get_blocks() const { return m_blocks; }
+		inline		 pvrtc4_block_vector2D &get_blocks() { return m_blocks; }
+
+		inline uint32_t get_total_blocks() const { return m_block_width * m_block_height; }
+
+		inline bool get_uses_alpha() const { return m_uses_alpha; }
+		inline void set_uses_alpha(bool uses_alpha) { m_uses_alpha = uses_alpha; }
+
+		// Compares only the block arrays of the two images.
+		inline bool are_blocks_equal(const pvrtc4_image& rhs) const
+		{
+			return m_blocks == rhs.m_blocks;
+		}
+
+		// Zeroes every block.
+		inline void set_to_black()
+		{
+			memset(m_blocks.get_ptr(), 0, m_blocks.size_in_bytes());
+		}
+
+		inline bool get_block_uses_transparent_modulation(uint32_t bx, uint32_t by) const
+		{
+			return m_blocks(bx, by).get_block_uses_transparent_modulation();
+		}
+
+		inline bool is_endpoint_opaque(uint32_t bx, uint32_t by, uint32_t endpoint_index) const
+		{
+			return m_blocks(bx, by).is_endpoint_opaque(endpoint_index);
+		}
+
+		// Decodes an endpoint of block (bx, by); see pvrtc4_block::get_endpoint().
+		color_rgba get_endpoint(uint32_t bx, uint32_t by, uint32_t endpoint_index, bool unpack) const
+		{
+			assert((bx < m_block_width) && (by < m_block_height));
+			return m_blocks(bx, by).get_endpoint(endpoint_index, unpack);
+		}
+
+		// Returns the stored 2-bit modulation value of pixel (x, y).
+		inline uint32_t get_modulation(uint32_t x, uint32_t y) const
+		{
+			assert((x < m_width) && (y < m_height));
+			const pvrtc4_block& blk = m_blocks(x >> 2, y >> 2);
+			return blk.get_modulation(x & 3, y & 3);
+		}
+
+		// Returns true if the block uses transparent modulation.
+		bool get_interpolated_colors(uint32_t x, uint32_t y, color_rgba* pColors) const;
+
+		// Color of pixel (x, y) as decoded with the explicit modulation value m.
+		color_rgba get_pixel(uint32_t x, uint32_t y, uint32_t m) const;
+
+		// Color of pixel (x, y) as decoded with its stored modulation value.
+		inline color_rgba get_pixel(uint32_t x, uint32_t y) const
+		{
+			assert((x < m_width) && (y < m_height));
+			const uint32_t stored_mod = m_blocks(x >> 2, y >> 2).get_modulation(x & 3, y & 3);
+			return get_pixel(x, y, stored_mod);
+		}
+
+		// Rearranges the blocks from swizzled order into plain (x, y) order.
+		void deswizzle()
+		{
+			pvrtc4_block_vector2D temp(m_blocks);
+
+			for (uint32_t y = 0; y < m_block_height; y++)
+			{
+				for (uint32_t x = 0; x < m_block_width; x++)
+				{
+					const uint32_t swizzled_index = pvrtc4_swizzle_uv(m_block_width, m_block_height, x, y);
+					m_blocks(x, y) = temp[swizzled_index];
+				}
+			}
+		}
+
+		// Rearranges the blocks from plain (x, y) order into swizzled order.
+		void swizzle()
+		{
+			pvrtc4_block_vector2D temp(m_blocks);
+
+			for (uint32_t y = 0; y < m_block_height; y++)
+			{
+				for (uint32_t x = 0; x < m_block_width; x++)
+				{
+					const uint32_t swizzled_index = pvrtc4_swizzle_uv(m_block_width, m_block_height, x, y);
+					m_blocks[swizzled_index] = temp(x, y);
+				}
+			}
+		}
+
+		// Decodes the whole image into img, which is cropped to this image's size first.
+		void unpack_all_pixels(image& img) const
+		{
+			img.crop(m_width, m_height);
+
+			for (uint32_t y = 0; y < m_height; y++)
+			{
+				for (uint32_t x = 0; x < m_width; x++)
+				{
+					img(x, y) = get_pixel(x, y);
+				}
+			}
+		}
+
+		// Decodes the 4x4 pixels of block (block_x, block_y) into dst(0..3, 0..3).
+		void unpack_block(image &dst, uint32_t block_x, uint32_t block_y)
+		{
+			const uint32_t base_x = block_x * 4;
+			const uint32_t base_y = block_y * 4;
+
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					dst(x, y) = get_pixel(base_x + x, base_y + y);
+				}
+			}
+		}
+
+		// Wrap a (possibly negative or too large) pixel or block coordinate
+		// back into range using posmod().
+		inline int wrap_x(int x) const
+		{
+			return posmod(x, m_width);
+		}
+
+		inline int wrap_y(int y) const
+		{
+			return posmod(y, m_height);
+		}
+
+		inline int wrap_block_x(int bx) const
+		{
+			return posmod(bx, m_block_width);
+		}
+
+		inline int wrap_block_y(int by) const
+		{
+			return posmod(by, m_block_height);
+		}
+
+		inline vec2F get_interpolation_factors(uint32_t x, uint32_t y) const
+		{
+			// 0 1 2 3
+			// 2 3 0 1
+			// .5 .75 0 .25
+			static const float s_interp[4] = { 2, 3, 0, 1 };
+			const float u = s_interp[x & 3];
+			const float v = s_interp[y & 3];
+			return vec2F(u, v);
+		}
+
+		// Bilinearly blends the four colors p (top-left), q (top-right),
+		// r (bottom-left) and s (bottom-right) for pixel (x, y), using integer
+		// weights in quarters derived from the pixel's position within its block.
+		// The color channels and the alpha channel are rescaled differently
+		// before the result is stored in 8 bits.
+		inline color_rgba interpolate(int x, int y,
+			const color_rgba& p, const color_rgba& q,
+			const color_rgba& r, const color_rgba& s) const
+		{
+			static const int s_interp[4] = { 2, 3, 0, 1 };
+			const int u_weight = s_interp[x & 3];
+			const int v_weight = s_interp[y & 3];
+
+			color_rgba result;
+
+			for (uint32_t c = 0; c < 4; c++)
+			{
+				const int top = p[c] * 4 + u_weight * ((int)q[c] - (int)p[c]);
+				const int bottom = r[c] * 4 + u_weight * ((int)s[c] - (int)r[c]);
+				int blended = top * 4 + v_weight * (bottom - top);
+
+				if (c == 3)
+				{
+					blended += (blended >> 4);
+				}
+				else
+				{
+					blended >>= 1;
+					blended += (blended >> 5);
+				}
+
+				assert((blended >= 0) && (blended < 256));
+				result[c] = static_cast<uint8_t>(blended);
+			}
+
+			return result;
+		}
+
+		// Stores the 2-bit modulation value s for pixel (x, y).
+		inline void set_modulation(uint32_t x, uint32_t y, uint32_t s)
+		{
+			assert((x < m_width) && (y < m_height));
+			m_blocks(x >> 2, y >> 2).set_modulation(x & 3, y & 3, s);
+		}
+
+		// Finds the modulation value whose decoded color is closest to c for
+		// pixel (x, y); the lowest value wins ties. When record is true the
+		// value is stored. Returns the distance of the best candidate.
+		inline uint64_t map_pixel(uint32_t x, uint32_t y, const color_rgba& c, bool perceptual, bool alpha_is_significant, bool record = true)
+		{
+			color_rgba candidates[4];
+			get_interpolated_colors(x, y, candidates);
+
+			uint32_t best_sel = 0;
+			uint64_t best_err = color_distance(perceptual, c, candidates[0], alpha_is_significant);
+
+			for (uint32_t sel = 1; sel < 4; sel++)
+			{
+				const uint64_t err = color_distance(perceptual, c, candidates[sel], alpha_is_significant);
+				if (err < best_err)
+				{
+					best_sel = sel;
+					best_err = err;
+				}
+			}
+
+			if (record)
+				set_modulation(x, y, best_sel);
+
+			return best_err;
+		}
+
+		// Re-selects the modulation values of the 7x7 pixels centered on the
+		// middle of block (bx, by), wrapping at the image edges, and returns the
+		// summed error against orig_img.
+		inline uint64_t remap_pixels_influenced_by_endpoint(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual, bool alpha_is_significant)
+		{
+			uint64_t error_sum = 0;
+
+			for (int yd = -3; yd <= 3; yd++)
+			{
+				const int y = wrap_y((int)by * 4 + 2 + yd);
+
+				for (int xd = -3; xd <= 3; xd++)
+				{
+					const int x = wrap_x((int)bx * 4 + 2 + xd);
+
+					error_sum += map_pixel(x, y, orig_img(x, y), perceptual, alpha_is_significant);
+				}
+			}
+
+			return error_sum;
+		}
+
+		// Sums the error of the currently decoded colors against orig_img over the
+		// same 7x7 region as remap_pixels_influenced_by_endpoint(), without
+		// changing anything. A non-zero threshold_error makes the function return
+		// early as soon as the running total reaches it.
+		inline uint64_t evaluate_1x1_endpoint_error(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual, bool alpha_is_significant, uint64_t threshold_error = 0) const
+		{
+			uint64_t error_sum = 0;
+
+			for (int yd = -3; yd <= 3; yd++)
+			{
+				const int y = wrap_y((int)by * 4 + 2 + yd);
+
+				for (int xd = -3; xd <= 3; xd++)
+				{
+					const int x = wrap_x((int)bx * 4 + 2 + xd);
+
+					error_sum += color_distance(perceptual, get_pixel(x, y), orig_img(x, y), alpha_is_significant);
+
+					if (threshold_error && (error_sum >= threshold_error))
+						return error_sum;
+				}
+			}
+
+			return error_sum;
+		}
+
+		uint64_t local_endpoint_optimization_opaque(uint32_t bx, uint32_t by, const image& orig_img, bool perceptual);
+
+		// Selects the best modulation value for every pixel against img (which
+		// must have this image's dimensions) and returns the summed error.
+		inline uint64_t map_all_pixels(const image& img, bool perceptual, bool alpha_is_significant)
+		{
+			assert(m_width == img.get_width());
+			assert(m_height == img.get_height());
+
+			uint64_t error_sum = 0;
+			for (uint32_t y = 0; y < img.get_height(); y++)
+			{
+				for (uint32_t x = 0; x < img.get_width(); x++)
+				{
+					error_sum += map_pixel(x, y, img(x, y), perceptual, alpha_is_significant);
+				}
+			}
+
+			return error_sum;
+		}
+
+	public:
+		// Image size in pixels.
+		uint32_t m_width, m_height;
+		pvrtc4_block_vector2D m_blocks;
+		// Image size in 4x4 blocks.
+		uint32_t m_block_width, m_block_height;
+
+		bool m_uses_alpha;
+	};
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
new file mode 100644
index 0000000000..597cb3f618
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp
@@ -0,0 +1,340 @@
+// basisu_resample_filters.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_resampler_filters.h"
+
+#ifndef M_PI
+	#define M_PI 3.14159265358979323846
+#endif
+
+namespace basisu
+{
+#define BOX_FILTER_SUPPORT (0.5f)
+	// Box kernel: returns 1 for t in [-0.5, 0.5) and 0 otherwise.
+	// BOX_FILTER_SUPPORT is the support value listed for this kernel in g_resample_filters.
+	static float box_filter(float t) /* pulse/Fourier window */
+	{
+		// make_clist() calls the filter function with t inverted (pos = left, neg = right)
+		const bool inside = (t >= -0.5f) && (t < 0.5f);
+		return inside ? 1.0f : 0.0f;
+	}
+
+#define TENT_FILTER_SUPPORT (1.0f)
+	// Tent (triangle) kernel: 1 - |t| for |t| < 1, 0 otherwise.
+	static float tent_filter(float t) /* box (*) box, bilinear/triangle */
+	{
+		const float dist = (t < 0.0f) ? -t : t;
+
+		if (!(dist < 1.0f))
+			return 0.0f;
+
+		return 1.0f - dist;
+	}
+
+#define BELL_SUPPORT (1.5f)
+	// Bell kernel, piecewise quadratic in a = |t|:
+	//   a < 0.5 -> 0.75 - a^2;  0.5 <= a < 1.5 -> (a - 1.5)^2 / 2;  otherwise 0.
+	// BELL_SUPPORT is the support value listed for this kernel in g_resample_filters.
+	static float bell_filter(float t) /* box (*) box (*) box */
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (a < .5f)
+			return (.75f - (a * a));
+
+		if (!(a < 1.5f))
+			return (0.0f);
+
+		const float d = (a - 1.5f);
+		return (.5f * (d * d));
+	}
+
+#define B_SPLINE_SUPPORT (2.0f)
+	// Cubic B-spline kernel, piecewise cubic in a = |t|:
+	//   a < 1 -> a^3 / 2 - a^2 + 2/3;  1 <= a < 2 -> (2 - a)^3 / 6;  otherwise 0.
+	static float B_spline_filter(float t) /* box (*) box (*) box (*) box */
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (a < 1.0f)
+		{
+			const float a2 = a * a;
+			return ((.5f * a2 * a) - a2 + (2.0f / 3.0f));
+		}
+
+		if (a < 2.0f)
+		{
+			const float r = 2.0f - a;
+			return ((1.0f / 6.0f) * (r * r * r));
+		}
+
+		return (0.0f);
+	}
+
+	// Dodgson, N., "Quadratic Interpolation for Image Resampling"
+#define QUADRATIC_SUPPORT 1.5f
+	// Piecewise-quadratic kernel with shape parameter R; evaluates to 0 once |t| reaches QUADRATIC_SUPPORT.
+	static float quadratic(float t, const float R)
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (!(a < QUADRATIC_SUPPORT))
+			return 0.0f;
+
+		const float a2 = a * a;
+		// Central piece covers |t| <= 0.5; the outer piece covers the rest of the support.
+		if (a <= .5f)
+			return (-2.0f * R) * a2 + .5f * (R + 1.0f);
+
+		return (R * a2) + (-2.0f * R - .5f) * a + (3.0f / 4.0f) * (R + 1.0f);
+	}
+
+	static float quadratic_interp_filter(float t) // quadratic() with R = 1 (gives 1 at t = 0 and 0 at |t| = 1); listed as "quadratic_interp"
+	{
+		return quadratic(t, 1.0f);
+	}
+
+	static float quadratic_approx_filter(float t) // quadratic() with R = 0.5; listed as "quadratic_approx"
+	{
+		return quadratic(t, .5f);
+	}
+
+	static float quadratic_mix_filter(float t) // quadratic() with R = 0.8; listed as "quadratic_mix"
+	{
+		return quadratic(t, .8f);
+	}
+
+	// Mitchell, D. and A. Netravali, "Reconstruction Filters in Computer Graphics."
+	// Computer Graphics, Vol. 22, No. 4, pp. 221-228.
+	// (B, C)
+	// (1/3, 1/3)   - Defaults recommended by Mitchell and Netravali
+	// (1, 0)       - Equivalent to the Cubic B-Spline
+	// (0, 0.5)     - Equivalent to the Catmull-Rom Spline
+	// (0, C)       - The family of Cardinal Cubic Splines
+	// (B, 0)       - Duff's tensioned B-Splines.
+	// Evaluates the two-parameter (B, C) cubic kernel at t. The result depends only on |t| and is 0 for |t| >= 2.
+	static float mitchell(float t, const float B, const float C)
+	{
+		const float a = (t < 0.0f) ? -t : t;
+		const float a2 = t * t; // squared from the signed input; equals a * a
+
+		if (a < 1.0f)
+		{
+			// Inner piece, |t| < 1.
+			const float p = (((12.0f - 9.0f * B - 6.0f * C) * (a * a2)) + ((-18.0f + 12.0f * B + 6.0f * C) * a2) + (6.0f - 2.0f * B));
+
+			return (p / 6.0f);
+		}
+
+		if (a < 2.0f)
+		{
+			// Outer piece, 1 <= |t| < 2.
+			const float p = (((-1.0f * B - 6.0f * C) * (a * a2)) + ((6.0f * B + 30.0f * C) * a2) + ((-12.0f * B - 48.0f * C) * a) + (8.0f * B + 24.0f * C));
+
+			return (p / 6.0f);
+		}
+
+		return (0.0f);
+	}
+
+#define MITCHELL_SUPPORT (2.0f)
+	static float mitchell_filter(float t) // mitchell() with B = C = 1/3; listed as "mitchell"
+	{
+		return mitchell(t, 1.0f / 3.0f, 1.0f / 3.0f);
+	}
+
+#define CATMULL_ROM_SUPPORT (2.0f)
+	static float catmull_rom_filter(float t) // mitchell() with B = 0, C = 0.5; listed as "catmullrom"
+	{
+		return mitchell(t, 0.0f, .5f);
+	}
+
+	// Normalized sinc, sin(pi*x) / (pi*x). A short polynomial replaces the quotient when |pi*x| < 0.01.
+	static double sinc(double x)
+	{
+		const double px = (x * M_PI);
+		const bool near_zero = (px < 0.01f) && (px > -0.01f);
+		if (near_zero)
+			return 1.0f + px * px * (-1.0f / 6.0f + px * px * 1.0f / 120.0f);
+		return sin(px) / px;
+	}
+
+	// Narrows t to float; magnitudes below EPSILON become exactly 0.
+	static float clean(double t)
+	{
+		const float EPSILON = .0000125f;
+
+		return (fabs(t) < EPSILON) ? 0.0f : (float)t;
+	}
+
+	//static double blackman_window(double x)
+	//{
+	//	return .42f + .50f * cos(M_PI*x) + .08f * cos(2.0f*M_PI*x);
+	//}
+
+	static double blackman_exact_window(double x) // three-term cosine window in x; callers pass t / support
+	{
+		return 0.42659071f + 0.49656062f * cos(M_PI * x) + 0.07684867f * cos(2.0f * M_PI * x);
+	}
+
+#define BLACKMAN_SUPPORT (3.0f)
+	// sinc() shaped by blackman_exact_window() for |t| < 3; 0 otherwise.
+	static float blackman_filter(float t)
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (!(a < 3.0f))
+			return (0.0f);
+
+		//return clean(sinc(t) * blackman_window(t / 3.0f));
+		return clean(sinc(a) * blackman_exact_window(a / 3.0f));
+	}
+
+#define GAUSSIAN_SUPPORT (1.25f)
+	// exp(-2 t^2) * sqrt(2 / pi), tapered by blackman_exact_window() across GAUSSIAN_SUPPORT; 0 outside it.
+	static float gaussian_filter(float t) // with blackman window
+	{
+		const float a = (t < 0) ? -t : t;
+		if (!(a < GAUSSIAN_SUPPORT))
+			return 0.0f;
+
+		return clean(exp(-2.0f * a * a) * sqrt(2.0f / M_PI) * blackman_exact_window(a / GAUSSIAN_SUPPORT));
+	}
+
+	// Windowed sinc -- see "Jim Blinn's Corner: Dirty Pixels" pg. 26.
+#define LANCZOS3_SUPPORT (3.0f)
+	// sinc(t) * sinc(t / 3) for |t| < 3; 0 otherwise.
+	static float lanczos3_filter(float t)
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (!(a < 3.0f))
+			return (0.0f);
+
+		return clean(sinc(a) * sinc(a / 3.0f));
+	}
+
+#define LANCZOS4_SUPPORT (4.0f)
+	// sinc(t) * sinc(t / 4) for |t| < 4; 0 otherwise.
+	static float lanczos4_filter(float t)
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (!(a < 4.0f))
+			return (0.0f);
+
+		return clean(sinc(a) * sinc(a / 4.0f));
+	}
+
+#define LANCZOS6_SUPPORT (6.0f)
+	// sinc(t) * sinc(t / 6) for |t| < 6; 0 otherwise.
+	static float lanczos6_filter(float t)
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (!(a < 6.0f))
+			return (0.0f);
+
+		return clean(sinc(a) * sinc(a / 6.0f));
+	}
+
+#define LANCZOS12_SUPPORT (12.0f)
+	// sinc(t) * sinc(t / 12) for |t| < 12; 0 otherwise.
+	static float lanczos12_filter(float t)
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (!(a < 12.0f))
+			return (0.0f);
+
+		return clean(sinc(a) * sinc(a / 12.0f));
+	}
+
+	// Sums the series 1 + sum over k >= 1 of ((x/2)^k / k!)^2, the power series of the modified Bessel function I0(x).
+	// Terms are added until the newest one is no larger than EPSILON_RATIO times the running sum.
+	static double bessel0(double x)
+	{
+		const double EPSILON_RATIO = 1E-16;
+		const double half_x = 0.5 * x;
+
+		double series = 1.0; // running sum, starting with the k = 0 term
+		double root = 1.0;   // (x/2)^k / k!; its square is the k-th term
+		double term = 1.0;
+
+		// FIXME: Shouldn't this stop after X iterations for max. safety?
+		for (int k = 1; term > series * EPSILON_RATIO; k++)
+		{
+			root = root * (half_x / k);
+			term = root * root;
+			series = series + term;
+		}
+
+		return series;
+	}
+
+	//static const float KAISER_ALPHA = 4.0;
+	static double kaiser(double alpha, double half_width, double x) // bessel0(alpha * sqrt(1 - (x / half_width)^2)) / bessel0(alpha)
+	{
+		const double ratio = (x / half_width);
+		return bessel0(alpha * sqrt(1 - ratio * ratio)) / bessel0(alpha); // sqrt argument is negative if |x| > half_width; kaiser_filter() only calls with |t| < KAISER_SUPPORT
+	}
+
+#define KAISER_SUPPORT 3
+	// sinc() shaped by kaiser() for |t| < KAISER_SUPPORT; 0 otherwise.
+	static float kaiser_filter(float t)
+	{
+		const float a = (t < 0.0f) ? -t : t;
+
+		if (!(a < KAISER_SUPPORT))
+			return 0.0f;
+
+		// db atten
+		const float att = 40.0f;
+		// alpha depends on att alone, so it has the same value on every call.
+		const float alpha = (float)(exp(log((double)0.58417 * (att - 20.96)) * 0.4) + 0.07886 * (att - 20.96));
+		//const float alpha = KAISER_ALPHA;
+
+		return (float)clean(sinc(a) * kaiser(alpha, KAISER_SUPPORT, a));
+	}
+
+	const resample_filter g_resample_filters[] = // each entry: name, kernel function, support (the Resampler constructor looks entries up by name and reads .func / .support)
+	{
+		{ "box", box_filter, BOX_FILTER_SUPPORT }, 
+		{ "tent", tent_filter, TENT_FILTER_SUPPORT }, 
+		{ "bell", bell_filter, BELL_SUPPORT }, 
+		{ "b-spline", B_spline_filter, B_SPLINE_SUPPORT },
+		{ "mitchell", mitchell_filter, MITCHELL_SUPPORT }, 
+		{ "blackman", blackman_filter, BLACKMAN_SUPPORT }, 
+		{ "lanczos3", lanczos3_filter, LANCZOS3_SUPPORT },
+		{ "lanczos4", lanczos4_filter, LANCZOS4_SUPPORT },
+		{ "lanczos6", lanczos6_filter, LANCZOS6_SUPPORT }, 
+		{ "lanczos12", lanczos12_filter, LANCZOS12_SUPPORT }, 
+		{ "kaiser", kaiser_filter, KAISER_SUPPORT }, 
+		{ "gaussian", gaussian_filter, GAUSSIAN_SUPPORT },
+		{ "catmullrom", catmull_rom_filter, CATMULL_ROM_SUPPORT }, 
+		{ "quadratic_interp", quadratic_interp_filter, QUADRATIC_SUPPORT }, 
+		{ "quadratic_approx", quadratic_approx_filter, QUADRATIC_SUPPORT }, 
+		{ "quadratic_mix", quadratic_mix_filter, QUADRATIC_SUPPORT },
+	};
+	// Number of entries in g_resample_filters.
+	const int g_num_resample_filters = BASISU_ARRAY_SIZE(g_resample_filters);
+
+	int find_resample_filter(const char *pName) // returns the index of the g_resample_filters entry named pName, or -1 if there is none
+	{
+		for (int i = 0; pName && (i < g_num_resample_filters); i++) // a NULL pName matches nothing; strcmp() must never be given a null pointer
+			if (strcmp(pName, g_resample_filters[i].name) == 0)
+				return i;
+		return -1;
+	}
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_resampler.cpp b/thirdparty/basis_universal/encoder/basisu_resampler.cpp
new file mode 100644
index 0000000000..e193ce83ff
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_resampler.cpp
@@ -0,0 +1,852 @@
+// basisu_resampler.cpp
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_resampler.h"
+#include "basisu_resampler_filters.h"
+
+#ifndef max
+#define max(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+
+#define RESAMPLER_DEBUG 0
+
+namespace basisu
+{
+	static inline int resampler_range_check(int v, int h) // returns v unchanged; asserts 0 <= v < h when assertions are enabled
+	{
+		BASISU_NOTE_UNUSED(h);
+		assert((v >= 0) && (v < h));
+		return v;
+	}
+
+	// Float to int cast with truncation (a plain C-style conversion, so the fractional part is discarded).
+	static inline int cast_to_int(Resample_Real i)
+	{
+		return (int)i;
+	}
+
+	// Ensure that the contributing source sample is within bounds. If not, reflect, clamp, or wrap.
+	//   j           - candidate source sample index, possibly outside [0, src_x)
+	//   src_x       - number of source samples along the axis
+	//   boundary_op - policy for indices that fall outside the range
+	// An in-range j is returned unchanged. Otherwise:
+	//   BOUNDARY_REFLECT - folds the index back into the image (see below), then clamps to [0, src_x - 1]
+	//   BOUNDARY_WRAP    - returns posmod(j, src_x)
+	//   any other value  - returns 0 (j below the range) or src_x - 1 (j above it)
+	int Resampler::reflect(const int j, const int src_x, const Boundary_Op boundary_op)
+	{
+		// Fast path: nothing to fix.
+		if ((j >= 0) && (j < src_x))
+			return j;
+
+		// From here on j is either negative or >= src_x.
+		const bool below = (j < 0);
+
+		switch (boundary_op)
+		{
+		case BOUNDARY_REFLECT:
+		{
+			// Below the range: -1 maps to 1, -2 to 2, ...
+			if (below)
+				return (-j >= src_x) ? (src_x - 1) : -j;
+
+			// Above the range: src_x maps to src_x - 1, src_x + 1 to src_x - 2, ...
+			const int mirrored = (src_x - j) + (src_x - 1);
+			return (mirrored < 0) ? 0 : mirrored;
+		}
+
+		case BOUNDARY_WRAP:
+			return posmod(j, src_x);
+
+		default:
+			// BOUNDARY_CLAMP and any unrecognised value.
+			return below ? 0 : (src_x - 1);
+		}
+	}
+
+	// The make_clist() method generates, for all destination samples, the list of all source samples with non-zero weighted contributions.
+	// Returns a calloc()'d array of dst_x Contrib_List entries whose p pointers index into one shared calloc()'d Contrib pool, or NULL on failure.
+	Resampler::Contrib_List * Resampler::make_clist(
+		int src_x, int dst_x, Boundary_Op boundary_op, // source / destination sample counts; boundary_op is handed to reflect()
+		Resample_Real(*Pfilter)(Resample_Real), // filter kernel to sample
+		Resample_Real filter_support, // kernel half-width before scaling
+		Resample_Real filter_scale, // multiplies the half-width; kernel arguments are divided by it
+		Resample_Real src_ofs) // added to every destination sample's center
+	{
+		struct Contrib_Bounds
+		{
+			// The center of the range in DISCRETE coordinates (pixel center = 0.0f).
+			Resample_Real center;
+			int left, right;
+		};
+
+		int i, j, k, n, left, right;
+		Resample_Real total_weight;
+		Resample_Real xscale, center, half_width, weight;
+		Contrib_List* Pcontrib;
+		Contrib* Pcpool;
+		Contrib* Pcpool_next;
+		Contrib_Bounds* Pcontrib_bounds;
+
+		if ((Pcontrib = (Contrib_List*)calloc(dst_x, sizeof(Contrib_List))) == NULL)
+			return NULL;
+
+		Pcontrib_bounds = (Contrib_Bounds*)calloc(dst_x, sizeof(Contrib_Bounds));
+		if (!Pcontrib_bounds)
+		{
+			free(Pcontrib);
+			return (NULL);
+		}
+
+		const Resample_Real oo_filter_scale = 1.0f / filter_scale;
+		// NUDGE (half a sample) converts between discrete sample indices and continuous coordinates.
+		const Resample_Real NUDGE = 0.5f;
+		xscale = dst_x / (Resample_Real)src_x;
+
+		if (xscale < 1.0f)
+		{
+			int total;
+			(void)total;
+
+			// Handle case when there are fewer destination samples than source samples (downsampling/minification).
+
+			// stretched half width of filter
+			half_width = (filter_support / xscale) * filter_scale;
+
+			// Find the range of source sample(s) that will contribute to each destination sample.
+
+			for (i = 0, n = 0; i < dst_x; i++)
+			{
+				// Convert from discrete to continuous coordinates, scale, then convert back to discrete.
+				center = ((Resample_Real)i + NUDGE) / xscale;
+				center -= NUDGE;
+				center += src_ofs;
+
+				left = cast_to_int((Resample_Real)floor(center - half_width));
+				right = cast_to_int((Resample_Real)ceil(center + half_width));
+
+				Pcontrib_bounds[i].center = center;
+				Pcontrib_bounds[i].left = left;
+				Pcontrib_bounds[i].right = right;
+
+				n += (right - left + 1);
+			}
+
+			// Allocate memory for contributors. 
+
+			if ((n == 0) || ((Pcpool = (Contrib*)calloc(n, sizeof(Contrib))) == NULL))
+			{
+				free(Pcontrib);
+				free(Pcontrib_bounds);
+				return NULL;
+			}
+			total = n;
+
+			Pcpool_next = Pcpool;
+
+			// Create the list of source samples which contribute to each destination sample.
+
+			for (i = 0; i < dst_x; i++)
+			{
+				int max_k = -1;
+				Resample_Real max_w = -1e+20f;
+
+				center = Pcontrib_bounds[i].center;
+				left = Pcontrib_bounds[i].left;
+				right = Pcontrib_bounds[i].right;
+
+				Pcontrib[i].n = 0;
+				Pcontrib[i].p = Pcpool_next;
+				Pcpool_next += (right - left + 1);
+				assert((Pcpool_next - Pcpool) <= total);
+				// Pass 1: sum the raw filter weights over [left, right] to obtain the normalization factor.
+				total_weight = 0;
+
+				for (j = left; j <= right; j++)
+					total_weight += (*Pfilter)((center - (Resample_Real)j) * xscale * oo_filter_scale);
+				const Resample_Real norm = static_cast<Resample_Real>(1.0f / total_weight);
+
+				total_weight = 0;
+
+#if RESAMPLER_DEBUG
+				printf("%i: ", i);
+#endif
+				// Pass 2: store each non-zero normalized weight together with its boundary-adjusted source index.
+				for (j = left; j <= right; j++)
+				{
+					weight = (*Pfilter)((center - (Resample_Real)j) * xscale * oo_filter_scale) * norm;
+					if (weight == 0.0f)
+						continue;
+
+					n = reflect(j, src_x, boundary_op);
+
+#if RESAMPLER_DEBUG
+					printf("%i(%f), ", n, weight);
+#endif
+
+					// Increment the number of source samples which contribute to the current destination sample.
+
+					k = Pcontrib[i].n++;
+
+					Pcontrib[i].p[k].pixel = (unsigned short)n; /* store src sample number */
+					Pcontrib[i].p[k].weight = weight;           /* store src sample weight */
+
+					total_weight += weight; /* total weight of all contributors */
+
+					if (weight > max_w)
+					{
+						max_w = weight;
+						max_k = k;
+					}
+				}
+
+#if RESAMPLER_DEBUG
+				printf("\n\n");
+#endif
+
+				//assert(Pcontrib[i].n);
+				//assert(max_k != -1);
+				if ((max_k == -1) || (Pcontrib[i].n == 0))
+				{
+					free(Pcpool);
+					free(Pcontrib);
+					free(Pcontrib_bounds);
+					return NULL;
+				}
+				// Fold the rounding residue into the heaviest contributor so the stored weights sum to (nearly) 1.
+				if (total_weight != 1.0f)
+					Pcontrib[i].p[max_k].weight += 1.0f - total_weight;
+			}
+		}
+		else
+		{
+			// Handle case when there are more destination samples than source samples (upsampling).
+
+			half_width = filter_support * filter_scale;
+
+			// Find the source sample(s) that contribute to each destination sample.
+
+			for (i = 0, n = 0; i < dst_x; i++)
+			{
+				// Convert from discrete to continuous coordinates, scale, then convert back to discrete.
+				center = ((Resample_Real)i + NUDGE) / xscale;
+				center -= NUDGE;
+				center += src_ofs;
+
+				left = cast_to_int((Resample_Real)floor(center - half_width));
+				right = cast_to_int((Resample_Real)ceil(center + half_width));
+
+				Pcontrib_bounds[i].center = center;
+				Pcontrib_bounds[i].left = left;
+				Pcontrib_bounds[i].right = right;
+
+				n += (right - left + 1);
+			}
+
+			/* Allocate memory for contributors. */
+
+			int total = n;
+			if ((total == 0) || ((Pcpool = (Contrib*)calloc(total, sizeof(Contrib))) == NULL))
+			{
+				free(Pcontrib);
+				free(Pcontrib_bounds);
+				return NULL;
+			}
+
+			Pcpool_next = Pcpool;
+
+			// Create the list of source samples which contribute to each destination sample.
+
+			for (i = 0; i < dst_x; i++)
+			{
+				int max_k = -1;
+				Resample_Real max_w = -1e+20f;
+
+				center = Pcontrib_bounds[i].center;
+				left = Pcontrib_bounds[i].left;
+				right = Pcontrib_bounds[i].right;
+
+				Pcontrib[i].n = 0;
+				Pcontrib[i].p = Pcpool_next;
+				Pcpool_next += (right - left + 1);
+				assert((Pcpool_next - Pcpool) <= total);
+				// Pass 1: sum the raw filter weights over [left, right] to obtain the normalization factor.
+				total_weight = 0;
+				for (j = left; j <= right; j++)
+					total_weight += (*Pfilter)((center - (Resample_Real)j) * oo_filter_scale);
+
+				const Resample_Real norm = static_cast<Resample_Real>(1.0f / total_weight);
+
+				total_weight = 0;
+
+#if RESAMPLER_DEBUG
+				printf("%i: ", i);
+#endif
+				// Pass 2: store each non-zero normalized weight together with its boundary-adjusted source index.
+				for (j = left; j <= right; j++)
+				{
+					weight = (*Pfilter)((center - (Resample_Real)j) * oo_filter_scale) * norm;
+					if (weight == 0.0f)
+						continue;
+
+					n = reflect(j, src_x, boundary_op);
+
+#if RESAMPLER_DEBUG
+					printf("%i(%f), ", n, weight);
+#endif
+
+					// Increment the number of source samples which contribute to the current destination sample.
+
+					k = Pcontrib[i].n++;
+
+					Pcontrib[i].p[k].pixel = (unsigned short)n; /* store src sample number */
+					Pcontrib[i].p[k].weight = weight;           /* store src sample weight */
+
+					total_weight += weight; /* total weight of all contributors */
+
+					if (weight > max_w)
+					{
+						max_w = weight;
+						max_k = k;
+					}
+				}
+
+#if RESAMPLER_DEBUG
+				printf("\n\n");
+#endif
+
+				//assert(Pcontrib[i].n);
+				//assert(max_k != -1);
+
+				if ((max_k == -1) || (Pcontrib[i].n == 0))
+				{
+					free(Pcpool);
+					free(Pcontrib);
+					free(Pcontrib_bounds);
+					return NULL;
+				}
+				// Fold the rounding residue into the heaviest contributor so the stored weights sum to (nearly) 1.
+				if (total_weight != 1.0f)
+					Pcontrib[i].p[max_k].weight += 1.0f - total_weight;
+			}
+		}
+
+#if RESAMPLER_DEBUG
+		printf("*******\n");
+#endif
+		// The bounds scratch array is only needed while the lists are being built.
+		free(Pcontrib_bounds);
+
+		return Pcontrib;
+	}
+
+	// Horizontal pass: each of the m_resample_dst_x outputs is the weighted sum of the Psrc samples listed in its m_Pclist_x entry.
+	void Resampler::resample_x(Sample * Pdst, const Sample * Psrc)
+	{
+		assert(Pdst);
+		assert(Psrc);
+
+		for (int dst = 0; dst < m_resample_dst_x; dst++)
+		{
+			const Contrib_List& list = m_Pclist_x[dst];
+
+#if BASISU_RESAMPLER_DEBUG_OPS
+			total_ops += list.n;
+#endif
+
+			// Accumulate in list order, so the floating-point sum is formed in contributor order.
+			Sample acc = 0;
+			for (int c = 0; c < list.n; c++)
+				acc += Psrc[list.p[c].pixel] * list.p[c].weight;
+
+			Pdst[dst] = acc;
+		}
+	}
+
+	// Writes Psrc[k] * weight into Ptmp[k] for every k in [0, dst_x).
+	// Used for the first contributor of a destination line (see resample_y()).
+	void Resampler::scale_y_mov(Sample * Ptmp, const Sample * Psrc, Resample_Real weight, int dst_x)
+	{
+#if BASISU_RESAMPLER_DEBUG_OPS
+		total_ops += dst_x;
+#endif
+
+		// Not += because temp buf wasn't cleared.
+		for (int k = 0; k < dst_x; k++)
+			Ptmp[k] = Psrc[k] * weight;
+	}
+
+	// Adds Psrc[k] * weight onto Ptmp[k] for every k in [0, dst_x).
+	void Resampler::scale_y_add(Sample * Ptmp, const Sample * Psrc, Resample_Real weight, int dst_x)
+	{
+#if BASISU_RESAMPLER_DEBUG_OPS
+		total_ops += dst_x;
+#endif
+		for (int k = 0; k < dst_x; k++)
+			Ptmp[k] += Psrc[k] * weight;
+	}
+
+	// Passes each of the first n samples of Pdst through clamp_sample(), in place.
+	void Resampler::clamp(Sample * Pdst, int n)
+	{
+		for (int k = 0; k < n; k++)
+		{
+			Sample v = Pdst[k];
+			Pdst[k] = clamp_sample(v);
+		}
+	}
+
+	// Vertical pass for destination row m_cur_dst_y: blends the buffered source rows listed in that row's m_Pclist_y entry into Pdst.
+	void Resampler::resample_y(Sample * Pdst)
+	{
+		const Contrib_List& list = m_Pclist_y[m_cur_dst_y];
+
+		// With delayed X resampling the blend lands in m_Ptmp_buf and is resampled horizontally afterwards.
+		Sample* const Pblend = m_delay_x_resample ? m_Ptmp_buf : Pdst;
+		assert(Pblend);
+
+		/* Process each contributor. */
+		for (int c = 0; c < list.n; c++)
+		{
+			const Contrib& contrib = list.p[c];
+			// locate the contributor's location in the scan buffer -- the contributor must always be found!
+			int slot = 0;
+			while ((slot < MAX_SCAN_BUF_SIZE) && (m_Pscan_buf->scan_buf_y[slot] != contrib.pixel))
+				slot++;
+
+			assert(slot < MAX_SCAN_BUF_SIZE);
+			Sample* const Pline = m_Pscan_buf->scan_buf_l[slot];
+			// The first contributor overwrites the blend buffer; the remaining ones accumulate into it.
+			if (c == 0)
+				scale_y_mov(Pblend, Pline, contrib.weight, m_intermediate_x);
+			else
+				scale_y_add(Pblend, Pline, contrib.weight, m_intermediate_x);
+
+			/* If this source line doesn't contribute to any
+			* more destination lines then mark the scanline buffer slot
+			* which holds this source line as free.
+			* (The max. number of slots used depends on the Y
+			* axis sampling factor and the scaled filter width.)
+			*/
+
+			const int src_line = resampler_range_check(contrib.pixel, m_resample_src_y);
+			if (--m_Psrc_y_count[src_line] == 0)
+			{
+				m_Psrc_y_flag[src_line] = false;
+				m_Pscan_buf->scan_buf_y[slot] = -1;
+			}
+		}
+
+		/* Now generate the destination line */
+		if (m_delay_x_resample) // Was X resampling delayed until after Y resampling?
+		{
+			assert(Pdst != Pblend);
+			resample_x(Pdst, Pblend);
+		}
+		else
+		{
+			assert(Pdst == Pblend);
+		}
+
+		// Output clamping is only active when m_lo < m_hi.
+		if (m_lo < m_hi)
+			clamp(Pdst, m_resample_dst_x);
+	}
+
+	// Feeds the next source scanline (index m_cur_src_y) to the resampler.
+	// Returns false when every source line was already supplied, or when m_status was set to an error;
+	// returns true once the line has been buffered, or skipped because no destination line uses it.
+	bool Resampler::put_line(const Sample * Psrc)
+	{
+		if (m_cur_src_y >= m_resample_src_y)
+			return false;
+
+		const int src_line = resampler_range_check(m_cur_src_y, m_resample_src_y);
+
+		/* Does this source line contribute to any destination line? if not, exit now. */
+		if (!m_Psrc_y_count[src_line])
+		{
+			m_cur_src_y++;
+			return true;
+		}
+
+		/* Find an empty slot in the scanline buffer. (FIXME: Perf. is terrible here with extreme scaling ratios.) */
+
+		int slot = 0;
+		while ((slot < MAX_SCAN_BUF_SIZE) && (m_Pscan_buf->scan_buf_y[slot] != -1))
+			slot++;
+
+		/* If the buffer is full, exit with an error. */
+
+		if (slot == MAX_SCAN_BUF_SIZE)
+		{
+			m_status = STATUS_SCAN_BUFFER_FULL;
+			return false;
+		}
+
+		m_Psrc_y_flag[src_line] = true;
+		m_Pscan_buf->scan_buf_y[slot] = m_cur_src_y;
+
+		/* Does this slot have any memory allocated to it? */
+		Sample*& Pline = m_Pscan_buf->scan_buf_l[slot];
+		if (!Pline)
+		{
+			Pline = (Sample*)malloc(m_intermediate_x * sizeof(Sample));
+			if (Pline == NULL)
+			{
+				m_status = STATUS_OUT_OF_MEMORY;
+				return false;
+			}
+		}
+
+		// Resampling on the X axis first?
+		if (!m_delay_x_resample)
+		{
+			// X-Y resampling order: resample horizontally now and buffer the result.
+			assert(m_intermediate_x == m_resample_dst_x);
+
+			resample_x(Pline, Psrc);
+		}
+		else
+		{
+			// Y-X resampling order: buffer the raw line; resample_y() runs the X pass later.
+			assert(m_intermediate_x == m_resample_src_x);
+
+			memcpy(Pline, Psrc, m_intermediate_x * sizeof(Sample));
+		}
+
+		m_cur_src_y++;
+
+		return true;
+	}
+
+	// Produces the next destination scanline once all of its contributing source lines are buffered.
+	// Returns m_Pdst_buf (reused by every call), or NULL when all m_resample_dst_y lines were already
+	// returned or when a contributing source line has not been flagged by put_line() yet.
+	const Resampler::Sample* Resampler::get_line()
+	{
+		/* If all the destination lines have been generated, then always return NULL. */
+		if (m_cur_dst_y == m_resample_dst_y)
+			return NULL;
+
+		/* Check to see if all the required contributors are present, if not, return NULL. */
+		const Contrib_List& list = m_Pclist_y[m_cur_dst_y];
+
+		for (int c = 0; c < list.n; c++)
+		{
+			const int src_line = resampler_range_check(list.p[c].pixel, m_resample_src_y);
+			if (!m_Psrc_y_flag[src_line])
+				return NULL;
+		}
+
+		// resample_y() consumes the buffered lines and fills m_Pdst_buf.
+		resample_y(m_Pdst_buf);
+
+		m_cur_dst_y++;
+
+		return m_Pdst_buf;
+	}
+
+	// Frees everything the instance allocated. Contributor lists handed in by the caller
+	// (m_clist_x_forced / m_clist_y_forced set) are left alone.
+	// Each pointer is reset to NULL after it has been released.
+	Resampler::~Resampler()
+	{
+#if BASISU_RESAMPLER_DEBUG_OPS
+		printf("actual ops: %i\n", total_ops);
+#endif
+
+		free(m_Pdst_buf);
+		m_Pdst_buf = NULL;
+
+		// free(NULL) is a no-op, so the temp buffer needs no guard.
+		free(m_Ptmp_buf);
+		m_Ptmp_buf = NULL;
+
+		/* Don't deallocate a contributor list if the user passed us one of their own. */
+		// Entry 0's p is released separately; make_clist() points it at the start of the pool shared by all entries.
+		if (!m_clist_x_forced && (m_Pclist_x != NULL))
+		{
+			free(m_Pclist_x->p);
+			free(m_Pclist_x);
+			m_Pclist_x = NULL;
+		}
+
+		// Same rule for the Y list.
+		if (!m_clist_y_forced && (m_Pclist_y != NULL))
+		{
+			free(m_Pclist_y->p);
+			free(m_Pclist_y);
+			m_Pclist_y = NULL;
+		}
+
+		// Per-source-line bookkeeping arrays.
+		free(m_Psrc_y_count);
+		m_Psrc_y_count = NULL;
+
+		free(m_Psrc_y_flag);
+		m_Psrc_y_flag = NULL;
+
+		if (m_Pscan_buf == NULL)
+			return;
+
+		// Release each buffered scanline, then the slot table itself.
+		for (int slot = 0; slot < MAX_SCAN_BUF_SIZE; slot++)
+			free(m_Pscan_buf->scan_buf_l[slot]);
+
+		free(m_Pscan_buf);
+		m_Pscan_buf = NULL;
+	}
+
+	// Rewinds to the first source/destination line and rebuilds the per-line bookkeeping; does nothing unless m_status is STATUS_OKAY.
+	void Resampler::restart()
+	{
+		if (m_status != STATUS_OKAY)
+			return;
+
+		m_cur_src_y = m_cur_dst_y = 0;
+		for (int y = 0; y < m_resample_src_y; y++)
+		{
+			m_Psrc_y_count[y] = 0;
+			m_Psrc_y_flag[y] = false;
+		}
+		// Recount how many destination lines reference each source line.
+		for (int y = 0; y < m_resample_dst_y; y++)
+		{
+			const Contrib_List& list = m_Pclist_y[y];
+			for (int c = 0; c < list.n; c++)
+				m_Psrc_y_count[resampler_range_check(list.p[c].pixel, m_resample_src_y)]++;
+		}
+
+		// Mark every scanline slot empty and release its line buffer.
+		for (int s = 0; s < MAX_SCAN_BUF_SIZE; s++)
+		{
+			m_Pscan_buf->scan_buf_y[s] = -1;
+			free(m_Pscan_buf->scan_buf_l[s]);
+			m_Pscan_buf->scan_buf_l[s] = NULL;
+		}
+	}
+
+	Resampler::Resampler(int src_x, int src_y, // source image dimensions
+		int dst_x, int dst_y, // destination image dimensions
+		Boundary_Op boundary_op, // how out-of-range source samples are resolved (see reflect())
+		Resample_Real sample_low, Resample_Real sample_high, // output clamp range; clamping only happens when sample_low < sample_high
+		const char* Pfilter_name, // g_resample_filters entry to use; NULL selects BASISU_RESAMPLER_DEFAULT_FILTER
+		Contrib_List * Pclist_x, // optional precomputed X list; when given it is used as-is and never freed here
+		Contrib_List * Pclist_y, // optional precomputed Y list; when given it is used as-is and never freed here
+		Resample_Real filter_x_scale, // forwarded to make_clist() as filter_scale for the X axis
+		Resample_Real filter_y_scale, // forwarded to make_clist() as filter_scale for the Y axis
+		Resample_Real src_x_ofs, // forwarded to make_clist() as src_ofs for the X axis
+		Resample_Real src_y_ofs) // forwarded to make_clist() as src_ofs for the Y axis
+	{
+		int i, j;
+		Resample_Real support, (*func)(Resample_Real);
+		// On any failure the constructor records the reason in m_status and returns early.
+		assert(src_x > 0);
+		assert(src_y > 0);
+		assert(dst_x > 0);
+		assert(dst_y > 0);
+
+#if BASISU_RESAMPLER_DEBUG_OPS
+		total_ops = 0;
+#endif
+
+		m_lo = sample_low;
+		m_hi = sample_high;
+
+		m_delay_x_resample = false;
+		m_intermediate_x = 0;
+		m_Pdst_buf = NULL;
+		m_Ptmp_buf = NULL;
+		m_clist_x_forced = false;
+		m_Pclist_x = NULL;
+		m_clist_y_forced = false;
+		m_Pclist_y = NULL;
+		m_Psrc_y_count = NULL;
+		m_Psrc_y_flag = NULL;
+		m_Pscan_buf = NULL;
+		m_status = STATUS_OKAY;
+
+		m_resample_src_x = src_x;
+		m_resample_src_y = src_y;
+		m_resample_dst_x = dst_x;
+		m_resample_dst_y = dst_y;
+
+		m_boundary_op = boundary_op;
+
+		if ((m_Pdst_buf = (Sample*)malloc(m_resample_dst_x * sizeof(Sample))) == NULL)
+		{
+			m_status = STATUS_OUT_OF_MEMORY;
+			return;
+		}
+
+		// Find the specified filter.
+
+		if (Pfilter_name == NULL)
+			Pfilter_name = BASISU_RESAMPLER_DEFAULT_FILTER;
+
+		for (i = 0; i < g_num_resample_filters; i++)
+			if (strcmp(Pfilter_name, g_resample_filters[i].name) == 0)
+				break;
+
+		if (i == g_num_resample_filters)
+		{
+			m_status = STATUS_BAD_FILTER_NAME;
+			return;
+		}
+
+		func = g_resample_filters[i].func;
+		support = g_resample_filters[i].support;
+
+		/* Create contributor lists, unless the user supplied custom lists. */
+
+		if (!Pclist_x)
+		{
+			m_Pclist_x = make_clist(m_resample_src_x, m_resample_dst_x, m_boundary_op, func, support, filter_x_scale, src_x_ofs);
+			if (!m_Pclist_x)
+			{
+				m_status = STATUS_OUT_OF_MEMORY;
+				return;
+			}
+		}
+		else
+		{
+			m_Pclist_x = Pclist_x;
+			m_clist_x_forced = true;
+		}
+
+		if (!Pclist_y)
+		{
+			m_Pclist_y = make_clist(m_resample_src_y, m_resample_dst_y, m_boundary_op, func, support, filter_y_scale, src_y_ofs);
+			if (!m_Pclist_y)
+			{
+				m_status = STATUS_OUT_OF_MEMORY;
+				return;
+			}
+		}
+		else
+		{
+			m_Pclist_y = Pclist_y;
+			m_clist_y_forced = true;
+		}
+
+		if ((m_Psrc_y_count = (int*)calloc(m_resample_src_y, sizeof(int))) == NULL)
+		{
+			m_status = STATUS_OUT_OF_MEMORY;
+			return;
+		}
+
+		if ((m_Psrc_y_flag = (unsigned char*)calloc(m_resample_src_y, sizeof(unsigned char))) == NULL)
+		{
+			m_status = STATUS_OUT_OF_MEMORY;
+			return;
+		}
+
+		// Count how many times each source line contributes to a destination line.
+
+		for (i = 0; i < m_resample_dst_y; i++)
+			for (j = 0; j < m_Pclist_y[i].n; j++)
+				m_Psrc_y_count[resampler_range_check(m_Pclist_y[i].p[j].pixel, m_resample_src_y)]++;
+
+		if ((m_Pscan_buf = (Scan_Buf*)malloc(sizeof(Scan_Buf))) == NULL)
+		{
+			m_status = STATUS_OUT_OF_MEMORY;
+			return;
+		}
+
+		for (i = 0; i < MAX_SCAN_BUF_SIZE; i++)
+		{
+			m_Pscan_buf->scan_buf_y[i] = -1;
+			m_Pscan_buf->scan_buf_l[i] = NULL;
+		}
+
+		m_cur_src_y = m_cur_dst_y = 0;
+		{
+			// Determine which axis to resample first by comparing the number of multiplies required
+			// for each possibility.
+			int x_ops = count_ops(m_Pclist_x, m_resample_dst_x);
+			int y_ops = count_ops(m_Pclist_y, m_resample_dst_y);
+			// The totals are formed in 64-bit: ops-per-line times a line count can exceed INT_MAX for large images.
+			// Hack 10/2000: Weight Y axis ops a little more than X axis ops.
+			// (Y axis ops use more cache resources.)
+			const long long xy_ops = (long long)x_ops * m_resample_src_y +
+				(4LL * y_ops * m_resample_dst_x) / 3;
+
+			const long long yx_ops = (4LL * y_ops * m_resample_src_x) / 3 +
+				(long long)x_ops * m_resample_dst_y;
+
+#if BASISU_RESAMPLER_DEBUG_OPS
+			printf("src: %i %i\n", m_resample_src_x, m_resample_src_y);
+			printf("dst: %i %i\n", m_resample_dst_x, m_resample_dst_y);
+			printf("x_ops: %i\n", x_ops);
+			printf("y_ops: %i\n", y_ops);
+			printf("xy_ops: %lli\n", xy_ops);
+			printf("yx_ops: %lli\n", yx_ops);
+#endif
+
+			// Now check which resample order is better. In case of a tie, choose the order
+			// which buffers the least amount of data.
+			if ((xy_ops > yx_ops) ||
+				((xy_ops == yx_ops) && (m_resample_src_x < m_resample_dst_x)))
+			{
+				m_delay_x_resample = true;
+				m_intermediate_x = m_resample_src_x;
+			}
+			else
+			{
+				m_delay_x_resample = false;
+				m_intermediate_x = m_resample_dst_x;
+			}
+#if BASISU_RESAMPLER_DEBUG_OPS
+			printf("delaying: %i\n", m_delay_x_resample);
+#endif
+		}
+
+		if (m_delay_x_resample)
+		{
+			if ((m_Ptmp_buf = (Sample*)malloc(m_intermediate_x * sizeof(Sample))) == NULL)
+			{
+				m_status = STATUS_OUT_OF_MEMORY;
+				return;
+			}
+		}
+	}
+
+	// Copies the X and Y contributor list pointers into whichever of the two outputs is non-NULL.
+	void Resampler::get_clists(Contrib_List * *ptr_clist_x, Contrib_List * *ptr_clist_y)
+	{
+		if (ptr_clist_x != NULL)
+			*ptr_clist_x = m_Pclist_x;
+		if (ptr_clist_y != NULL)
+			*ptr_clist_y = m_Pclist_y;
+	}
+
+	int Resampler::get_filter_num() // number of entries in g_resample_filters
+	{
+		return g_num_resample_filters;
+	}
+
+	// Name of entry filter_num in g_resample_filters, or NULL when the index is out of range.
+	const char* Resampler::get_filter_name(int filter_num)
+	{
+		const bool valid = (filter_num >= 0) && (filter_num < g_num_resample_filters);
+
+		return valid ? g_resample_filters[filter_num].name : NULL;
+	}
+	
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_resampler.h b/thirdparty/basis_universal/encoder/basisu_resampler.h
new file mode 100644
index 0000000000..dc0978caeb
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_resampler.h
@@ -0,0 +1,196 @@
+// basisu_resampler.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "../transcoder/basisu.h"
+
+#define BASISU_RESAMPLER_DEBUG_OPS (0)
+#define BASISU_RESAMPLER_DEFAULT_FILTER "lanczos4"
+#define BASISU_RESAMPLER_MAX_DIMENSION (16384)
+
+namespace basisu
+{
+	// float or double
+	typedef float Resample_Real;
+
+	class Resampler // Scanline-driven 2D resampler: feed source rows with put_line(), pull finished rows with get_line().
+	{
+	public:
+		typedef Resample_Real Sample; // Type of one sample value.
+
+		struct Contrib // One weighted reference to a pixel index.
+		{
+			Resample_Real weight;
+			uint16_t pixel; // NOTE(review): presumably a source pixel index; 16 bits cover BASISU_RESAMPLER_MAX_DIMENSION.
+		};
+
+		struct Contrib_List // A list of contributors; count_ops() below sums the n fields of an array of these.
+		{
+			uint16_t n; // Entry count (presumably the number of Contrib items behind p).
+			Contrib *p;
+		};
+
+		enum Boundary_Op // How to sample pixels near the image boundaries.
+		{
+			BOUNDARY_WRAP = 0,
+			BOUNDARY_REFLECT = 1,
+			BOUNDARY_CLAMP = 2
+		};
+
+		enum Status // Value reported by status().
+		{
+			STATUS_OKAY = 0,
+			STATUS_OUT_OF_MEMORY = 1, // Set by the constructor when a buffer allocation fails.
+			STATUS_BAD_FILTER_NAME = 2,
+			STATUS_SCAN_BUFFER_FULL = 3
+		};
+
+		// src_x/src_y - Input dimensions
+		// dst_x/dst_y - Output dimensions
+		// boundary_op - How to sample pixels near the image boundaries
+		// sample_low/sample_high - Clamp output samples to specified range, or disable clamping if sample_low >= sample_high
+		// Pclist_x/Pclist_y - Optional pointers to contributor lists from another instance of a Resampler
+		// src_x_ofs/src_y_ofs - Offset input image by specified amount (fractional values okay)
+		Resampler(
+			int src_x, int src_y,
+			int dst_x, int dst_y,
+			Boundary_Op boundary_op = BOUNDARY_CLAMP,
+			Resample_Real sample_low = 0.0f, Resample_Real sample_high = 0.0f,
+			const char *Pfilter_name = BASISU_RESAMPLER_DEFAULT_FILTER,
+			Contrib_List *Pclist_x = NULL,
+			Contrib_List *Pclist_y = NULL,
+			Resample_Real filter_x_scale = 1.0f,
+			Resample_Real filter_y_scale = 1.0f,
+			Resample_Real src_x_ofs = 0.0f,
+			Resample_Real src_y_ofs = 0.0f);
+
+		~Resampler();
+
+		// Reinits resampler so it can handle another frame.
+		void restart();
+
+		// Feeds one source scanline. Returns false on out of memory.
+		bool put_line(const Sample *Psrc);
+
+		// NULL if no scanlines are currently available (give the resampler more scanlines!)
+		const Sample *get_line();
+
+		Status status() const // Callers should check this after construction; the constructor reports failure only through m_status.
+		{
+			return m_status;
+		}
+
+		// Returned contributor lists can be shared with another Resampler (see the Pclist_x/Pclist_y constructor arguments).
+		void get_clists(Contrib_List **ptr_clist_x, Contrib_List **ptr_clist_y);
+		Contrib_List *get_clist_x() const
+		{
+			return m_Pclist_x;
+		}
+		Contrib_List *get_clist_y() const
+		{
+			return m_Pclist_y;
+		}
+
+		// Filter accessors: number of filters in the global table, and the name of filter filter_num (NULL if out of range).
+		static int get_filter_num();
+		static const char *get_filter_name(int filter_num);
+
+		static Contrib_List *make_clist( // NOTE(review): presumably builds the per-output-pixel contributor lists for one axis; definition not visible here.
+			int src_x, int dst_x, Boundary_Op boundary_op,
+			Resample_Real(*Pfilter)(Resample_Real),
+			Resample_Real filter_support,
+			Resample_Real filter_scale,
+			Resample_Real src_ofs);
+
+	private:
+		Resampler(); // Declared private: default construction and copying are not available to users of the class.
+		Resampler(const Resampler &o);
+		Resampler &operator=(const Resampler &o);
+
+#ifdef BASISU_RESAMPLER_DEBUG_OPS
+		int total_ops; // NOTE(review): the macro is defined as (0), so this #ifdef is always true; the .cpp tests it with #if. Confirm intent.
+#endif
+
+		int m_intermediate_x; // Row width between the two passes: source width when X resampling is delayed, else destination width.
+
+		int m_resample_src_x;
+		int m_resample_src_y;
+		int m_resample_dst_x;
+		int m_resample_dst_y;
+
+		Boundary_Op m_boundary_op;
+
+		Sample *m_Pdst_buf;
+		Sample *m_Ptmp_buf; // Allocated (m_intermediate_x samples) only when m_delay_x_resample is set.
+
+		Contrib_List *m_Pclist_x;
+		Contrib_List *m_Pclist_y;
+
+		bool m_clist_x_forced; // NOTE(review): presumably true when the caller supplied Pclist_x/Pclist_y - confirm in the .cpp.
+		bool m_clist_y_forced;
+
+		bool m_delay_x_resample; // True when the constructor decided the Y pass should run before the X pass.
+
+		int *m_Psrc_y_count;
+		uint8_t *m_Psrc_y_flag;
+
+		// The maximum number of scanlines that can be buffered at one time.
+		enum
+		{
+			MAX_SCAN_BUF_SIZE = BASISU_RESAMPLER_MAX_DIMENSION
+		};
+
+		struct Scan_Buf // Two parallel fixed-size arrays with one slot per buffered scanline.
+		{
+			int scan_buf_y[MAX_SCAN_BUF_SIZE];
+			Sample *scan_buf_l[MAX_SCAN_BUF_SIZE];
+		};
+
+		Scan_Buf *m_Pscan_buf;
+
+		int m_cur_src_y;
+		int m_cur_dst_y;
+
+		Status m_status; // Returned by status().
+
+		void resample_x(Sample *Pdst, const Sample *Psrc);
+		void scale_y_mov(Sample *Ptmp, const Sample *Psrc, Resample_Real weight, int dst_x);
+		void scale_y_add(Sample *Ptmp, const Sample *Psrc, Resample_Real weight, int dst_x);
+		void clamp(Sample *Pdst, int n);
+		void resample_y(Sample *Pdst);
+
+		static int reflect(const int j, const int src_x, const Boundary_Op boundary_op);
+
+		inline int count_ops(Contrib_List *Pclist, int k) // Sum of the contributor counts of the first k lists.
+		{
+			int i, t = 0;
+			for (i = 0; i < k; i++)
+				t += Pclist[i].n;
+			return (t);
+		}
+
+		Resample_Real m_lo; // Lower and upper bounds applied by clamp_sample().
+		Resample_Real m_hi;
+
+		inline Resample_Real clamp_sample(Resample_Real f) const // Limits f to the range [m_lo, m_hi].
+		{
+			if (f < m_lo)
+				f = m_lo;
+			else if (f > m_hi)
+				f = m_hi;
+			return f;
+		}
+	};
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_resampler_filters.h b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
new file mode 100644
index 0000000000..0ebb51c334
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h
@@ -0,0 +1,35 @@
+// basisu_resampler_filters.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "../transcoder/basisu.h"
+
+namespace basisu
+{
+	typedef float (*resample_filter_func)(float t);
+
+	struct resample_filter // Element type of the g_resample_filters table declared below.
+	{
+		const char *name; // Filter name; Resampler::get_filter_name() returns this field.
+		resample_filter_func func; // The filter function itself (float in, float out).
+		float support; // NOTE(review): presumably the filter's support radius - not verifiable from this header.
+	};
+
+	extern const resample_filter g_resample_filters[];
+	extern const int g_num_resample_filters;
+
+	int find_resample_filter(const char *pName);
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_ssim.cpp b/thirdparty/basis_universal/encoder/basisu_ssim.cpp
new file mode 100644
index 0000000000..cceb400b88
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_ssim.cpp
@@ -0,0 +1,408 @@
+// basisu_ssim.cpp
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_ssim.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+
+namespace basisu
+{
+	// Gaussian weight at integer offset (x, y) for variance sigma_sqr: exp(-(x^2 + y^2) / (2 * sigma_sqr)) / sqrt(2 * pi * sigma_sqr).
+	float gauss(int x, int y, float sigma_sqr)
+	{
+		const float falloff = expf(-((x * x + y * y) / (2.0f * sigma_sqr)));
+		return (1.0f / (sqrtf((float)(2.0f * M_PI * sigma_sqr)))) * falloff;
+	}
+		
+	// Fills pDst (size_x * size_y floats, indexed as x + y * size_x) with gauss() samples centered on the middle tap. size_x/y should be odd.
+	void compute_gaussian_kernel(float *pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags)
+	{
+		assert(size_x & size_y & 1);
+		// Reject any non-positive dimension. Testing !(size_x | size_y) only caught both being zero, so one empty axis still reached the pDst[center] accesses below.
+		if ((size_x <= 0) || (size_y <= 0))
+			return;
+
+		int mid_x = size_x / 2;
+		int mid_y = size_y / 2;
+
+		double sum = 0;
+		for (int x = 0; x < size_x; x++)
+		{
+			for (int y = 0; y < size_y; y++)
+			{
+				float g;
+				if ((x > mid_x) && (y < mid_y))
+					g = pDst[(size_x - x - 1) + y * size_x]; // mirror in x: that column was written by an earlier outer iteration
+				else if ((x < mid_x) && (y > mid_y))
+					g = pDst[x + (size_y - y - 1) * size_x]; // mirror in y: that row was written earlier in this column
+				else if ((x > mid_x) && (y > mid_y))
+					g = pDst[(size_x - x - 1) + (size_y - y - 1) * size_x]; // mirror in both axes
+				else
+					g = gauss(x - mid_x, y - mid_y, sigma_sqr);
+
+				pDst[x + y * size_x] = g;
+				sum += g;
+			}
+		}
+		// cComputeGaussianFlagNormalizeCenterToOne divides by the center tap instead of by the sum of all taps.
+		if (flags & cComputeGaussianFlagNormalizeCenterToOne)
+		{
+			sum = pDst[mid_x + mid_y * size_x];
+		}
+
+		if (flags & (cComputeGaussianFlagNormalizeCenterToOne | cComputeGaussianFlagNormalize))
+		{
+			double one_over_sum = 1.0f / sum;
+			for (int i = 0; i < size_x * size_y; i++)
+				pDst[i] = static_cast<float>(pDst[i] * one_over_sum);
+
+			if (flags & cComputeGaussianFlagNormalizeCenterToOne)
+				pDst[mid_x + mid_y * size_x] = 1.0f; // pin the center to exactly 1 regardless of rounding
+		}
+		// cComputeGaussianFlagPrint dumps the finished kernel to stdout, one kernel row per line.
+		if (flags & cComputeGaussianFlagPrint)
+		{
+			printf("{\n");
+			for (int y = 0; y < size_y; y++)
+			{
+				printf("  ");
+				for (int x = 0; x < size_x; x++)
+				{
+					printf("%f, ", pDst[x + y * size_x]);
+				}
+				printf("\n");
+			}
+			printf("}");
+		}
+	}
+
+	// Filters orig_img with a normalized odd_filter_width x odd_filter_width Gaussian kernel (variance sigma_sqr) into dst.
+	// dst is cropped to (width / width_divisor) x (height / height_divisor); each output pixel is sampled around the
+	// center of its divisor-sized source cell, with every tap fetched through get_clamped_or_wrapped().
+	void gaussian_filter(imagef &dst, const imagef &orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor)
+	{
+		assert(odd_filter_width && (odd_filter_width & 1));
+		// Builds without asserts still get an odd width.
+		odd_filter_width |= 1;
+
+		// Kernel taps are normalized (cComputeGaussianFlagNormalize).
+		vector2D<float> taps(odd_filter_width, odd_filter_width);
+		compute_gaussian_kernel(taps.get_ptr(), odd_filter_width, odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize);
+
+		const int out_w = orig_img.get_width() / width_divisor;
+		const int out_h = orig_img.get_height() / height_divisor;
+		const int radius = odd_filter_width / 2;
+
+		dst.crop(out_w, out_h);
+
+//#pragma omp parallel for
+		for (int oy = 0; oy < out_h; oy++)
+		{
+			for (int ox = 0; ox < out_w; ox++)
+			{
+				vec4F acc(0.0f);
+
+				for (int dy = -radius; dy <= radius; dy++)
+				{
+					const int sy = oy * height_divisor + (height_divisor >> 1) + dy;
+
+					for (int dx = -radius; dx <= radius; dx++)
+					{
+						const int sx = ox * width_divisor + (width_divisor >> 1) + dx;
+						const vec4F &texel = orig_img.get_clamped_or_wrapped(sx, sy, wrapping, wrapping);
+						const float w = taps(dx + radius, dy + radius);
+
+						for (uint32_t ch = 0; ch < 4; ch++)
+							acc[ch] += texel[ch] * w;
+					}
+				}
+
+				dst(ox, oy).set(acc[0], acc[1], acc[2], acc[3]);
+			}
+		}
+	}
+
+	// Writes src raised channel-wise to the exponents in power into dst, after calling dst.resize(src).
+	void pow_image(const imagef &src, imagef &dst, const vec4F &power)
+	{
+		dst.resize(src);
+//#pragma omp parallel for
+		for (int y = 0; y < (int)dst.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < dst.get_width(); x++)
+			{
+				const vec4F &texel = src(x, y);
+				const bool squaring = (power[0] == 2.0f) && (power[1] == 2.0f) && (power[2] == 2.0f) && (power[3] == 2.0f); // all-2 exponents skip powf()
+				if (squaring)
+					dst(x, y).set(texel[0] * texel[0], texel[1] * texel[1], texel[2] * texel[2], texel[3] * texel[3]);
+				else
+					dst(x, y).set(powf(texel[0], power[0]), powf(texel[1], power[1]), powf(texel[2], power[2]), powf(texel[3], power[3]));
+			}
+		}
+	}
+
+	// Writes src scaled channel-wise by the constant vector mul into dst, after calling dst.resize(src).
+	void mul_image(const imagef &src, imagef &dst, const vec4F &mul)
+	{
+		dst.resize(src);
+//#pragma omp parallel for
+		for (int y = 0; y < (int)dst.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < dst.get_width(); x++)
+			{
+				const vec4F &texel = src(x, y);
+				dst(x, y).set(texel[0] * mul[0], texel[1] * mul[1], texel[2] * mul[2], texel[3] * mul[3]);
+			}
+		}
+	}
+
+	// Per-channel affine map: dst = scale * src + shift for every pixel.
+	// dst.resize(src) is called first.
+	void scale_image(const imagef &src, imagef &dst, const vec4F &scale, const vec4F &shift)
+	{
+		dst.resize(src);
+
+//#pragma omp parallel for
+		for (int y = 0; y < (int)dst.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < dst.get_width(); x++)
+			{
+				const vec4F &texel = src(x, y);
+				dst(x, y).set(
+					scale[0] * texel[0] + shift[0],
+					scale[1] * texel[1] + shift[1],
+					scale[2] * texel[2] + shift[2],
+					scale[3] * texel[3] + shift[3]);
+			}
+		}
+	}
+
+	// Per-channel weighted sum: dst = src1 * alpha + src2 * beta + gamma for every pixel.
+	// dst.resize(src1) is called first; compute_ssim() passes the same image as src1 and dst.
+	void add_weighted_image(const imagef &src1, const vec4F &alpha, const imagef &src2, const vec4F &beta, const vec4F &gamma, imagef &dst)
+	{
+		dst.resize(src1);
+
+//#pragma omp parallel for
+		for (int y = 0; y < (int)dst.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < dst.get_width(); x++)
+			{
+				const vec4F &lhs = src1(x, y);
+				const vec4F &rhs = src2(x, y);
+				vec4F blended;
+				for (uint32_t c = 0; c < 4; c++)
+					blended[c] = lhs[c] * alpha[c] + rhs[c] * beta[c] + gamma[c];
+				dst(x, y).set(blended[0], blended[1], blended[2], blended[3]);
+			}
+		}
+	}
+
+	// Per-channel sum of two images: dst = src1 + src2, after calling dst.resize(src1).
+	void add_image(const imagef &src1, const imagef &src2, imagef &dst)
+	{
+		dst.resize(src1);
+
+//#pragma omp parallel for
+		for (int y = 0; y < (int)dst.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < dst.get_width(); x++)
+			{
+				const vec4F &lhs = src1(x, y);
+				const vec4F &rhs = src2(x, y);
+				dst(x, y).set(lhs[0] + rhs[0], lhs[1] + rhs[1], lhs[2] + rhs[2], lhs[3] + rhs[3]);
+			}
+		}
+	}
+
+	// Adds the constant vector value to every pixel of src, channel by channel, storing the result in dst.
+	void adds_image(const imagef &src, const vec4F &value, imagef &dst)
+	{
+		dst.resize(src);
+
+//#pragma omp parallel for
+		for (int y = 0; y < (int)dst.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < dst.get_width(); x++)
+			{
+				const vec4F &texel = src(x, y);
+				dst(x, y).set(texel[0] + value[0], texel[1] + value[1], texel[2] + value[2], texel[3] + value[3]);
+			}
+		}
+	}
+
+	// Per-channel product of two images with an extra constant factor:
+	// dst = src1 * src2 * scale for every pixel, after calling dst.resize(src1).
+	void mul_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale)
+	{
+		dst.resize(src1);
+
+//#pragma omp parallel for
+		for (int y = 0; y < (int)dst.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < dst.get_width(); x++)
+			{
+				const vec4F &lhs = src1(x, y);
+				const vec4F &rhs = src2(x, y);
+
+				vec4F product;
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					// Evaluated as (lhs * rhs) * scale.
+					product[c] = lhs[c] * rhs[c] * scale[c];
+				}
+
+				dst(x, y) = product;
+			}
+		}
+	}
+
+	// Per-channel quotient of two images with an extra constant factor:
+	// dst = (src1 * scale) / src2 for every pixel, after calling dst.resize(src1).
+	// A channel whose divisor is exactly 0 is written as 0 instead of being divided.
+	void div_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale)
+	{
+		dst.resize(src1);
+
+//#pragma omp parallel for
+		for (int y = 0; y < (int)dst.get_height(); y++)
+		{
+			for (uint32_t x = 0; x < dst.get_width(); x++)
+			{
+				const vec4F &num = src1(x, y);
+				const vec4F &den = src2(x, y);
+
+				vec4F quotient;
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					const float divisor = den[c];
+					// Zero divisors produce 0 rather than inf/NaN.
+					quotient[c] = (divisor == 0.0f) ? 0.0f : ((num[c] * scale[c]) / divisor);
+				}
+
+				dst(x, y) = quotient;
+			}
+		}
+	}
+
+	// Returns the per-channel mean of all pixels of src: the channel sums divided by src.get_total_pixels().
+	// No guard for an empty image is applied here.
+	vec4F avg_image(const imagef &src)
+	{
+		vec4F total(0.0f);
+
+		for (uint32_t row = 0; row < src.get_height(); row++)
+		{
+			for (uint32_t col = 0; col < src.get_width(); col++)
+			{
+				const vec4F &texel = src(col, row);
+				total += vec4F(texel[0], texel[1], texel[2], texel[3]);
+			}
+		}
+
+		total /= static_cast<float>(src.get_total_pixels());
+		return total;
+	}
+		
+	// Reference: https://ece.uwaterloo.ca/~z70wang/research/ssim/index.html
+	vec4F compute_ssim(const imagef &a, const imagef &b)
+	{
+		imagef ab, aa, bb, mean_a, mean_b, mean_aa, mean_bb, mean_ab, var_a, var_b, cov_ab, ssim_map, tmp_a, tmp_b, numer;
+		const float C1 = 6.50250f, C2 = 58.52250f; // constants added to the mean and variance terms below
+		const uint32_t window = 11; // Gaussian window width used for every local statistic
+		const float window_var = 1.5f * 1.5f; // Gaussian window variance (sigma = 1.5)
+		// Per-pixel squares and cross product of the two inputs.
+		pow_image(a, aa, vec4F(2));
+		pow_image(b, bb, vec4F(2));
+		mul_image(a, b, ab, vec4F(1.0f));
+
+		// Local (Gaussian-weighted) means.
+		gaussian_filter(mean_a, a, window, window_var);
+		gaussian_filter(mean_b, b, window, window_var);
+
+		// Squared means and the product of the means.
+		pow_image(mean_a, mean_aa, vec4F(2));
+		pow_image(mean_b, mean_bb, vec4F(2));
+		mul_image(mean_a, mean_b, mean_ab, vec4F(1.0f));
+
+		// Local variances and covariance: filtered second moments minus the corresponding mean terms.
+		gaussian_filter(var_a, aa, window, window_var);
+		add_weighted_image(var_a, vec4F(1), mean_aa, vec4F(-1), vec4F(0), var_a);
+		gaussian_filter(var_b, bb, window, window_var);
+		add_weighted_image(var_b, vec4F(1), mean_bb, vec4F(-1), vec4F(0), var_b);
+		gaussian_filter(cov_ab, ab, window, window_var);
+		add_weighted_image(cov_ab, vec4F(1), mean_ab, vec4F(-1), vec4F(0), cov_ab);
+
+		// numer = (2 * mean_ab + C1) * (2 * cov_ab + C2)
+		scale_image(mean_ab, tmp_a, vec4F(2), vec4F(0));
+		adds_image(tmp_a, vec4F(C1), tmp_a);
+		scale_image(cov_ab, tmp_b, vec4F(2), vec4F(0));
+		adds_image(tmp_b, vec4F(C2), tmp_b);
+		mul_image(tmp_a, tmp_b, numer, vec4F(1));
+
+		// tmp_a = (mean_aa + mean_bb + C1) * (var_a + var_b + C2), reusing the two scratch images.
+		add_image(mean_aa, mean_bb, tmp_a);
+		adds_image(tmp_a, vec4F(C1), tmp_a);
+		add_image(var_a, var_b, tmp_b);
+		adds_image(tmp_b, vec4F(C2), tmp_b);
+		mul_image(tmp_a, tmp_b, tmp_a, vec4F(1));
+
+		// Per-pixel ratio (div_image writes 0 where the divisor is 0), then the per-channel average over the map.
+		div_image(numer, tmp_a, ssim_map, vec4F(1));
+		return avg_image(ssim_map);
+	}
+
+	// Front end for `image` inputs: crops copies of a and b to their common size, optionally rewrites every pixel with
+	// set(luma, alpha), converts both to imagef and returns the imagef overload's result. An empty size asserts and yields vec4F(0).
+	vec4F compute_ssim(const image &a, const image &b, bool luma, bool luma_601)
+	{
+		image img_a(a), img_b(b);
+
+		if ((img_a.get_width() != img_b.get_width()) || (img_a.get_height() != img_b.get_height()))
+		{
+			debug_printf("compute_ssim: Cropping input images to equal dimensions\n");
+			const uint32_t common_w = minimum(a.get_width(), b.get_width());
+			const uint32_t common_h = minimum(a.get_height(), b.get_height());
+			img_a.crop(common_w, common_h);
+			img_b.crop(common_w, common_h);
+		}
+
+		if ((img_a.get_width() == 0) || (img_a.get_height() == 0))
+		{
+			assert(0);
+			return vec4F(0);
+		}
+
+		const uint32_t luma_rows = luma ? img_a.get_height() : 0; // zero rows means the pixels are left untouched
+		for (uint32_t y = 0; y < luma_rows; y++)
+		{
+			for (uint32_t x = 0; x < img_a.get_width(); x++)
+			{
+				auto &pa = img_a(x, y);
+				auto &pb = img_b(x, y);
+				pa.set(pa.get_luma(luma_601), pa.a);
+				pb.set(pb.get_luma(luma_601), pb.a);
+			}
+		}
+
+		imagef float_a, float_b;
+		float_a.set(img_a);
+		float_b.set(img_b);
+
+		return compute_ssim(float_a, float_b);
+	}
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_ssim.h b/thirdparty/basis_universal/encoder/basisu_ssim.h
new file mode 100644
index 0000000000..986ca3bbdf
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_ssim.h
@@ -0,0 +1,44 @@
+// basisu_ssim.h
+// Copyright (C) 2019 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_enc.h"
+
+namespace basisu
+{
+	float gauss(int x, int y, float sigma_sqr);
+
+	enum // Bit flags for the flags argument of compute_gaussian_kernel().
+	{
+		cComputeGaussianFlagNormalize = 1, // divide every tap by the sum of all taps
+		cComputeGaussianFlagPrint = 2, // printf() the finished kernel
+		cComputeGaussianFlagNormalizeCenterToOne = 4 // divide every tap by the center tap and set the center tap to exactly 1.0
+	};
+
+	void compute_gaussian_kernel(float *pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags = 0);
+
+	void scale_image(const imagef &src, imagef &dst, const vec4F &scale, const vec4F &shift);
+	void add_weighted_image(const imagef &src1, const vec4F &alpha, const imagef &src2, const vec4F &beta, const vec4F &gamma, imagef &dst);
+	void add_image(const imagef &src1, const imagef &src2, imagef &dst);
+	void adds_image(const imagef &src, const vec4F &value, imagef &dst);
+	void mul_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale);
+	void div_image(const imagef &src1, const imagef &src2, imagef &dst, const vec4F &scale);
+	vec4F avg_image(const imagef &src);
+
+	void gaussian_filter(imagef &dst, const imagef &orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping = false, uint32_t width_divisor = 1, uint32_t height_divisor = 1);
+
+	vec4F compute_ssim(const imagef &a, const imagef &b);
+	vec4F compute_ssim(const image &a, const image &b, bool luma, bool luma_601);
+
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
new file mode 100644
index 0000000000..ca2b325693
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp
@@ -0,0 +1,4189 @@
+// basisu_uastc_enc.cpp
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_uastc_enc.h"
+#include "basisu_astc_decomp.h"
+#include "basisu_gpu_texture.h"
+#include "basisu_bc7enc.h"
+
+#ifdef _DEBUG
+// When BASISU_VALIDATE_UASTC_ENC is 1, we pack and unpack to/from UASTC and ASTC, then validate that each codec returns the exact same results. This is slower.
+#define BASISU_VALIDATE_UASTC_ENC 1
+#endif
+
+#define BASISU_SUPPORT_FORCE_MODE 0
+
+using namespace basist;
+
+namespace basisu
+{
+	const uint32_t MAX_ENCODE_RESULTS = 512;
+
+#if BASISU_VALIDATE_UASTC_ENC
+	// Reports a failed internal validation check on stderr; `line` is the source line passed by the VALIDATE() macro. Does not abort.
+	static void validate_func(bool condition, int line)
+	{
+		if (condition)
+			return;
+		fprintf(stderr, "basisu_uastc_enc: Internal validation failed on line %d!\n", line); // %d matches the signed int argument (was %u)
+	}
+
+	#define VALIDATE(c) validate_func(c, __LINE__);
+#else
+	#define VALIDATE(c)
+#endif
+
+	enum dxt_constants // Selector field widths for DXT1 and DXT5, each with its derived value count and bit mask.
+	{
+		cDXT1SelectorBits = 2U, cDXT1SelectorValues = 1U << cDXT1SelectorBits, cDXT1SelectorMask = cDXT1SelectorValues - 1U, // 2 bits -> 4 values, mask 0x3
+		cDXT5SelectorBits = 3U, cDXT5SelectorValues = 1U << cDXT5SelectorBits, cDXT5SelectorMask = cDXT5SelectorValues - 1U, // 3 bits -> 8 values, mask 0x7
+	};
+
+	struct dxt1_block // 8 raw bytes: two 16-bit endpoint colors (low byte first) followed by one selector byte per row.
+	{
+		enum { cTotalEndpointBytes = 2, cTotalSelectorBytes = 4 };
+
+		uint8_t m_low_color[cTotalEndpointBytes];
+		uint8_t m_high_color[cTotalEndpointBytes];
+		uint8_t m_selectors[cTotalSelectorBytes];
+
+		inline void clear() { basisu::clear_obj(*this); }
+
+		inline uint32_t get_high_color() const { return static_cast<uint32_t>(m_high_color[0]) | (static_cast<uint32_t>(m_high_color[1]) << 8U); }
+		inline uint32_t get_low_color() const { return static_cast<uint32_t>(m_low_color[0]) | (static_cast<uint32_t>(m_low_color[1]) << 8U); }
+		inline void set_low_color(uint16_t c) { m_low_color[0] = static_cast<uint8_t>(c & 0xFF); m_low_color[1] = static_cast<uint8_t>(c >> 8); }
+		inline void set_high_color(uint16_t c) { m_high_color[0] = static_cast<uint8_t>(c & 0xFF); m_high_color[1] = static_cast<uint8_t>(c >> 8); }
+		inline uint32_t get_selector(uint32_t x, uint32_t y) const { assert((x < 4U) && (y < 4U)); const uint32_t shift = x * cDXT1SelectorBits; return (m_selectors[y] >> shift) & cDXT1SelectorMask; }
+		inline void set_selector(uint32_t x, uint32_t y, uint32_t val) { assert((x < 4U) && (y < 4U) && (val < 4U)); const uint32_t shift = x * cDXT1SelectorBits; m_selectors[y] = static_cast<uint8_t>((m_selectors[y] & (~(cDXT1SelectorMask << shift))) | (val << shift)); }
+
+		static uint16_t pack_color(const color_rgba& color, bool scaled, uint32_t bias = 127U)
+		{
+			// With scaled set, the 8-bit channels are first reduced to 5/6/5 bits using the rounding bias; each channel is then capped and packed as R:G:B = 5:6:5.
+			const uint32_t r = scaled ? ((color.r * 31U + bias) / 255U) : color.r;
+			const uint32_t g = scaled ? ((color.g * 63U + bias) / 255U) : color.g;
+			const uint32_t b = scaled ? ((color.b * 31U + bias) / 255U) : color.b;
+			const uint32_t r5 = basisu::minimum(r, 31U);
+			const uint32_t g6 = basisu::minimum(g, 63U);
+			const uint32_t b5 = basisu::minimum(b, 31U);
+			return static_cast<uint16_t>(b5 | (g6 << 5U) | (r5 << 11U));
+		}
+
+		static uint16_t pack_unscaled_color(uint32_t r, uint32_t g, uint32_t b) { return static_cast<uint16_t>((r << 11U) | (g << 5U) | b); }
+	};
+
+#define UASTC_WRITE_MODE_DESCS 0
+
+	// Appends the low `codesize` bits of `code` to pBuf, least-significant bit first, starting at bit position
+	// bit_offset, and advances bit_offset past them. Bits are OR'ed in, so the destination must start out zeroed.
+	static inline void uastc_write_bits(uint8_t* pBuf, uint32_t& bit_offset, uint64_t code, uint32_t codesize, const char* pDesc)
+	{
+		(void)pDesc; // only read by the optional trace below
+#if UASTC_WRITE_MODE_DESCS
+		if (pDesc)
+			printf("%s: %u %u\n", pDesc, bit_offset, codesize);
+#endif
+		assert((codesize == 64) || (code < (1ULL << codesize)));
+
+		for (uint32_t chunk = 0; codesize != 0; codesize -= chunk)
+		{
+			const uint32_t shift = bit_offset & 7;
+			chunk = basisu::minimum<int>(codesize, 8 - shift);
+
+			// Only the low 8 bits of the shifted value survive the store into the byte.
+			pBuf[bit_offset >> 3] |= (code << shift);
+
+			code >>= chunk;
+			bit_offset += chunk;
+		}
+	}
+
+	void pack_uastc(basist::uastc_block& blk, const uastc_encode_results& result, const etc_block& etc1_blk, uint32_t etc1_bias, const eac_a8_block& etc_eac_a8_blk, bool bc1_hint0, bool bc1_hint1)
+	{ // Serializes `result` plus the ETC1/ETC2/BC1 hint fields into a bit stream and copies sizeof(blk) bytes of it into blk.
+		if ((g_uastc_mode_has_alpha[result.m_uastc_mode]) && (result.m_uastc_mode != UASTC_MODE_INDEX_SOLID_COLOR))
+		{
+			assert(etc_eac_a8_blk.m_multiplier >= 1);
+		}
+		// Scratch buffer starts zeroed because uastc_write_bits() ORs bits into it.
+		uint8_t buf[32];
+		memset(buf, 0, sizeof(buf));
+
+		uint32_t block_bit_offset = 0;
+
+#if UASTC_WRITE_MODE_DESCS
+		printf("**** Mode: %u\n", result.m_uastc_mode);
+#endif
+
+		uastc_write_bits(buf, block_bit_offset, g_uastc_mode_huff_codes[result.m_uastc_mode][0], g_uastc_mode_huff_codes[result.m_uastc_mode][1], "mode");
+		// Solid-color blocks store only RGBA plus a compact ETC1 description, then return early.
+		if (result.m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR)
+		{
+			uastc_write_bits(buf, block_bit_offset, result.m_solid_color.r, 8, "R");
+			uastc_write_bits(buf, block_bit_offset, result.m_solid_color.g, 8, "G");
+			uastc_write_bits(buf, block_bit_offset, result.m_solid_color.b, 8, "B");
+			uastc_write_bits(buf, block_bit_offset, result.m_solid_color.a, 8, "A");
+
+			uastc_write_bits(buf, block_bit_offset, etc1_blk.get_diff_bit(), 1, "ETC1D");
+			uastc_write_bits(buf, block_bit_offset, etc1_blk.get_inten_table(0), 3, "ETC1I");
+			uastc_write_bits(buf, block_bit_offset, etc1_blk.get_selector(0, 0), 2, "ETC1S");
+
+			uint32_t r, g, b;
+			if (etc1_blk.get_diff_bit())
+				etc_block::unpack_color5(r, g, b, etc1_blk.get_base5_color(), false);
+			else
+				etc_block::unpack_color4(r, g, b, etc1_blk.get_base4_color(0), false);
+
+			uastc_write_bits(buf, block_bit_offset, r, 5, "ETC1R");
+			uastc_write_bits(buf, block_bit_offset, g, 5, "ETC1G");
+			uastc_write_bits(buf, block_bit_offset, b, 5, "ETC1B");
+
+			memcpy(&blk, buf, sizeof(blk));
+			return;
+		}
+		// Per-mode hint fields: a hint the mode does not store is asserted to have been left at its default.
+		if (g_uastc_mode_has_bc1_hint0[result.m_uastc_mode])
+			uastc_write_bits(buf, block_bit_offset, bc1_hint0, 1, "BC1H0");
+		else
+		{
+			assert(bc1_hint0 == false);
+		}
+
+		if (g_uastc_mode_has_bc1_hint1[result.m_uastc_mode])
+			uastc_write_bits(buf, block_bit_offset, bc1_hint1, 1, "BC1H1");
+		else
+		{
+			assert(bc1_hint1 == false);
+		}
+
+		uastc_write_bits(buf, block_bit_offset, etc1_blk.get_flip_bit(), 1, "ETC1F");
+		uastc_write_bits(buf, block_bit_offset, etc1_blk.get_diff_bit(), 1, "ETC1D");
+		uastc_write_bits(buf, block_bit_offset, etc1_blk.get_inten_table(0), 3, "ETC1I0");
+		uastc_write_bits(buf, block_bit_offset, etc1_blk.get_inten_table(1), 3, "ETC1I1");
+
+		if (g_uastc_mode_has_etc1_bias[result.m_uastc_mode])
+			uastc_write_bits(buf, block_bit_offset, etc1_bias, 5, "ETC1BIAS");
+		else
+		{
+			assert(etc1_bias == 0);
+		}
+
+		if (g_uastc_mode_has_alpha[result.m_uastc_mode])
+		{
+			const uint32_t etc2_hints = etc_eac_a8_blk.m_table | (etc_eac_a8_blk.m_multiplier << 4); // table in the low nibble, multiplier in the high nibble
+
+			assert(etc2_hints > 0 && etc2_hints <= 0xFF);
+			uastc_write_bits(buf, block_bit_offset, etc2_hints, 8, "ETC2TM");
+		}
+		// Modes with a partition pattern field: a 5-bit pattern selects 2 subsets, a 4-bit pattern (mode 3) selects 3.
+		uint32_t subsets = 1;
+		switch (result.m_uastc_mode)
+		{
+		case 2:
+		case 4:
+		case 7:
+		case 9:
+		case 16:
+			uastc_write_bits(buf, block_bit_offset, result.m_common_pattern, 5, "PAT");
+			subsets = 2;
+			break;
+		case 3:
+			uastc_write_bits(buf, block_bit_offset, result.m_common_pattern, 4, "PAT");
+			subsets = 3;
+			break;
+		default:
+			break;
+		}
+
+#ifdef _DEBUG
+		uint32_t part_seed = 0; // ASTC partition seed, used only by the debug cross-checks further down
+		switch (result.m_uastc_mode)
+		{
+		case 2:
+		case 4:
+		case 9:
+		case 16:
+			part_seed = g_astc_bc7_common_partitions2[result.m_common_pattern].m_astc;
+			break;
+		case 3:
+			part_seed = g_astc_bc7_common_partitions3[result.m_common_pattern].m_astc;
+			break;
+		case 7:
+			part_seed = g_bc7_3_astc2_common_partitions[result.m_common_pattern].m_astc2;
+			break;
+		default:
+			break;
+		}
+#endif		
+		// Dual-plane modes: 6/11/13 store the 2-bit component selector, mode 17 stores nothing and requires it to be 3.
+		uint32_t total_planes = 1;
+		switch (result.m_uastc_mode)
+		{
+		case 6:
+		case 11:
+		case 13:
+			uastc_write_bits(buf, block_bit_offset, result.m_astc.m_ccs, 2, "COMPSEL");
+			total_planes = 2;
+			break;
+		case 17:
+			// CCS field is always 3 for dual plane LA.
+			assert(result.m_astc.m_ccs == 3);
+			total_planes = 2;
+			break;
+		default:
+			break;
+		}
+		// Work on local copies: the anchor handling below may invert weights and swap endpoints.
+		uint8_t weights[32];
+		memcpy(weights, result.m_astc.m_weights, 16 * total_planes);
+
+		uint8_t endpoints[18];
+		memcpy(endpoints, result.m_astc.m_endpoints, sizeof(endpoints));
+
+		const uint32_t total_comps = g_uastc_mode_comps[result.m_uastc_mode];
+
+		// LLAA
+		// LLAA LLAA
+		// LLAA LLAA LLAA
+		// RRGGBB
+		// RRGGBB RRGGBB
+		// RRGGBB RRGGBB RRGGBB
+		// RRGGBBAA
+		// RRGGBBAA RRGGBBAA
+
+		const uint32_t weight_bits = g_uastc_mode_weight_bits[result.m_uastc_mode];
+
+		const uint8_t* pPartition_pattern;
+		const uint8_t* pSubset_anchor_indices = basist::get_anchor_indices(subsets, result.m_uastc_mode, result.m_common_pattern, pPartition_pattern);
+
+		for (uint32_t plane_index = 0; plane_index < total_planes; plane_index++)
+		{
+			for (uint32_t subset_index = 0; subset_index < subsets; subset_index++)
+			{
+				const uint32_t anchor_index = pSubset_anchor_indices[subset_index];
+
+#ifdef _DEBUG
+				if (subsets >= 2)
+				{
+					for (uint32_t i = 0; i < 16; i++)
+					{
+						const uint32_t part_index = astc_compute_texel_partition(part_seed, i & 3, i >> 2, 0, subsets, true);
+						if (part_index == subset_index)
+						{
+							assert(anchor_index == i);
+							break;
+						}
+					}
+				}
+				else
+				{
+					assert(!anchor_index);
+				}
+#endif
+
+				// Check anchor weight's MSB - if it's set then invert this subset's weights and swap the endpoints
+				if (weights[anchor_index * total_planes + plane_index] & (1 << (weight_bits - 1)))
+				{
+					for (uint32_t i = 0; i < 16; i++)
+					{
+						const uint32_t part_index = pPartition_pattern[i];
+
+#ifdef _DEBUG
+						if (subsets >= 2)
+						{
+							assert(part_index == (uint32_t)astc_compute_texel_partition(part_seed, i & 3, i >> 2, 0, subsets, true));
+						}
+						else
+						{
+							assert(!part_index);
+						}
+#endif
+
+						if (part_index == subset_index)
+							weights[i * total_planes + plane_index] = ((1 << weight_bits) - 1) - weights[i * total_planes + plane_index];
+					}
+
+					if (total_planes == 2)
+					{
+						for (int c = 0; c < (int)total_comps; c++)
+						{
+							const uint32_t comp_plane = (total_comps == 2) ? c : ((c == result.m_astc.m_ccs) ? 1 : 0); // which weight plane drives component c
+
+							if (comp_plane == plane_index)
+								std::swap(endpoints[c * 2 + 0], endpoints[c * 2 + 1]);
+						}
+					}
+					else
+					{
+						for (uint32_t c = 0; c < total_comps; c++)
+							std::swap(endpoints[subset_index * total_comps * 2 + c * 2 + 0], endpoints[subset_index * total_comps * 2 + c * 2 + 1]);
+					}
+				}
+			} // subset_index
+		} // plane_index
+		// Each endpoint value is split into its low ep_bits bits and a trit or quint digit; digits are grouped 5 trits or 3 quints per code.
+		const uint32_t total_values = total_comps * 2 * subsets;
+		const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[result.m_uastc_mode];
+
+		uint32_t bit_values[18];
+		uint32_t tq_values[8];
+		uint32_t total_tq_values = 0;
+		uint32_t tq_accum = 0;
+		uint32_t tq_mul = 1;
+
+		const uint32_t ep_bits = g_astc_bise_range_table[endpoint_range][0];
+		const uint32_t ep_trits = g_astc_bise_range_table[endpoint_range][1];
+		const uint32_t ep_quints = g_astc_bise_range_table[endpoint_range][2];
+
+		for (uint32_t i = 0; i < total_values; i++)
+		{
+			uint32_t val = endpoints[i];
+
+			uint32_t bits = val & ((1 << ep_bits) - 1);
+			uint32_t tq = val >> ep_bits;
+
+			bit_values[i] = bits;
+
+			if (ep_trits)
+			{
+				assert(tq < 3);
+				tq_accum += tq * tq_mul;
+				tq_mul *= 3;
+				if (tq_mul == 243)
+				{
+					tq_values[total_tq_values++] = tq_accum;
+					tq_accum = 0;
+					tq_mul = 1;
+				}
+			}
+			else if (ep_quints)
+			{
+				assert(tq < 5);
+				tq_accum += tq * tq_mul;
+				tq_mul *= 5;
+				if (tq_mul == 125)
+				{
+					tq_values[total_tq_values++] = tq_accum;
+					tq_accum = 0;
+					tq_mul = 1;
+				}
+			}
+		}
+
+		uint32_t total_endpoint_bits = 0;
+		// Complete groups first: 8 bits per trit group, 7 bits per quint group.
+		for (uint32_t i = 0; i < total_tq_values; i++)
+		{
+			const uint32_t num_bits = ep_trits ? 8 : 7;
+			uastc_write_bits(buf, block_bit_offset, tq_values[i], num_bits, "ETQ");
+			total_endpoint_bits += num_bits;
+		}
+		// Then a partially filled group, if any, using a shorter code sized by how many digits it holds.
+		if (tq_mul > 1)
+		{
+			uint32_t num_bits;
+			if (ep_trits)
+			{
+				if (tq_mul == 3)
+					num_bits = 2;
+				else if (tq_mul == 9)
+					num_bits = 4;
+				else if (tq_mul == 27)
+					num_bits = 5;
+				else //if (tq_mul == 81)
+					num_bits = 7;
+			}
+			else
+			{
+				if (tq_mul == 5)
+					num_bits = 3;
+				else //if (tq_mul == 25)
+					num_bits = 5;
+			}
+			uastc_write_bits(buf, block_bit_offset, tq_accum, num_bits, "ETQ");
+			total_endpoint_bits += num_bits;
+		}
+
+		for (uint32_t i = 0; i < total_values; i++)
+		{
+			uastc_write_bits(buf, block_bit_offset, bit_values[i], ep_bits, "EBITS");
+			total_endpoint_bits += ep_bits;
+		}
+
+#if UASTC_WRITE_MODE_DESCS
+		uint32_t weight_start = block_bit_offset;
+#endif
+		// Weights come last; each subset's anchor weight is written with one bit fewer, since its MSB was cleared above.
+		uint32_t total_weight_bits = 0;
+		const uint32_t plane_shift = (total_planes == 2) ? 1 : 0; // with two planes, weights[] interleaves the planes per texel
+		for (uint32_t i = 0; i < 16 * total_planes; i++)
+		{
+			uint32_t numbits = weight_bits;
+			for (uint32_t s = 0; s < subsets; s++)
+			{
+				if (pSubset_anchor_indices[s] == (i >> plane_shift))
+				{
+					numbits--;
+					break;
+				}
+			}
+
+			uastc_write_bits(buf, block_bit_offset, weights[i], numbits, nullptr);
+
+			total_weight_bits += numbits;
+		}
+
+#if UASTC_WRITE_MODE_DESCS
+		printf("WEIGHTS: %u %u\n", weight_start, total_weight_bits);
+#endif
+
+		assert(block_bit_offset <= 128);
+		memcpy(&blk, buf, sizeof(blk));
+
+#if UASTC_WRITE_MODE_DESCS
+		printf("Total bits: %u, endpoint bits: %u, weight bits: %u\n", block_bit_offset, total_endpoint_bits, total_weight_bits);
+#endif
+	}
+	
+	// MODE 0 / MODE 18: single subset, single plane. Fits one endpoint pair to all 16 pixels and appends one candidate to pResults.
+	// 0. DualPlane: 0, WeightRange: 8 (16), Subsets: 1, CEM: 8 (RGB Direct       ), EndpointRange: 19 (192)       MODE6 RGB
+	// 18. DualPlane: 0, WeightRange: 11 (32), Subsets: 1, CEM: 8 (RGB Direct       ), EndpointRange: 11 (32)       MODE6 RGB
+	static void astc_mode0_or_18(uint32_t mode, const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, const uint8_t *pForce_selectors = nullptr)
+	{
+		const uint32_t endpoint_range = (mode == 18) ? 11 : 19; // any mode value other than 18 gets the mode 0 settings
+		const uint32_t weight_range = (mode == 18) ? 11 : 8;
+
+		color_cell_compressor_params ccell_params;
+		memset(&ccell_params, 0, sizeof(ccell_params));
+
+		ccell_params.m_num_pixels = 16;
+		ccell_params.m_pPixels = (color_quad_u8*)&block[0][0];
+		ccell_params.m_num_selector_weights = (mode == 18) ? 32 : 16;
+		ccell_params.m_pSelector_weights = (mode == 18) ? g_astc_weights5 : g_astc_weights4;
+		ccell_params.m_pSelector_weightsx = (mode == 18) ? (const bc7enc_vec4F*)g_astc_weights5x : (const bc7enc_vec4F*)g_astc_weights4x;
+		ccell_params.m_astc_endpoint_range = endpoint_range;
+		ccell_params.m_weights[0] = 1; // all four per-channel error weights are equal
+		ccell_params.m_weights[1] = 1;
+		ccell_params.m_weights[2] = 1;
+		ccell_params.m_weights[3] = 1;
+		ccell_params.m_pForce_selectors = pForce_selectors; // may be nullptr (the default)
+
+		color_cell_compressor_results ccell_results;
+		uint8_t ccell_result_selectors[16];
+		uint8_t ccell_result_selectors_temp[16];
+		memset(&ccell_results, 0, sizeof(ccell_results));
+		ccell_results.m_pSelectors = &ccell_result_selectors[0];
+		ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0];
+		// The leading 255 is forwarded verbatim; its meaning is defined by color_cell_compression(), which is not visible here.
+		uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, &comp_params);
+
+		// ASTC: describe the result as a 1-subset, single-plane block.
+		astc_block_desc astc_results;
+		memset(&astc_results, 0, sizeof(astc_results));
+
+		astc_results.m_dual_plane = false;
+		astc_results.m_weight_range = weight_range; // 11 for mode 18, otherwise 8
+
+		astc_results.m_ccs = 0;
+		astc_results.m_subsets = 1;
+		astc_results.m_partition_seed = 0;
+		astc_results.m_cem = 8;
+		// Endpoints are stored interleaved per channel: [low c0, high c0, low c1, high c1, low c2, high c2].
+		astc_results.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0];
+		astc_results.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0];
+		astc_results.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1];
+		astc_results.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1];
+		astc_results.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2];
+		astc_results.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2];
+		// Reorder so the high endpoint's unquantized channel sum (s1) is not below the low endpoint's (s0).
+		bool invert = false;
+		// NOTE(review): the reorder is skipped when selectors are forced - presumably because forced selectors must not be inverted; confirm with callers.
+		if (pForce_selectors == nullptr)
+		{
+		int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[4]].m_unquant;
+		int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5]].m_unquant;
+		if (s1 < s0) // NOTE(review): presumably avoids the ASTC blue-contraction decode path for CEM 8 - confirm against the ASTC spec
+		{
+			std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]);
+			std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]);
+			std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]);
+			invert = true;
+			}
+		}
+		// Copy the selectors; if the endpoints were swapped, mirror each weight (presumably so the interpolated colors are unchanged).
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				astc_results.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4];
+
+				if (invert)
+					astc_results.m_weights[x + y * 4] = ((mode == 18) ? 31 : 15) - astc_results.m_weights[x + y * 4]; // 31/15 = highest selector index for 32/16 weights
+			}
+		}
+		// Append the candidate. If pResults is already full, the assert fires; when assert() is compiled out the result is silently dropped.
+		assert(total_results < MAX_ENCODE_RESULTS);
+		if (total_results < MAX_ENCODE_RESULTS)
+		{
+			pResults[total_results].m_uastc_mode = mode;
+			pResults[total_results].m_common_pattern = 0; // single-subset modes always record pattern 0
+			pResults[total_results].m_astc = astc_results;
+			pResults[total_results].m_astc_err = part_err;
+			total_results++;
+		}
+	}
+
+	// MODE 1: single subset, single plane. Fits one endpoint pair to all 16 pixels with 4 selector weights and appends one candidate to pResults.
+	// 1-subset, 2-bit indices, 8-bit endpoints, BC7 mode 3
+	// DualPlane: 0, WeightRange: 2 (4), Subsets: 1, CEM: 8 (RGB Direct       ), EndpointRange: 20 (256)        MODE3 or MODE5 RGB
+	static void astc_mode1(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		color_cell_compressor_params ccell_params;
+		memset(&ccell_params, 0, sizeof(ccell_params));
+
+		ccell_params.m_num_pixels = 16;
+		ccell_params.m_pPixels = (color_quad_u8*)&block[0][0];
+		ccell_params.m_num_selector_weights = 4;
+		ccell_params.m_pSelector_weights = g_bc7_weights2;
+		ccell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+		ccell_params.m_astc_endpoint_range = 20; // same value as the local `range` used for unquantization below
+		ccell_params.m_weights[0] = 1; // all four per-channel error weights are equal
+		ccell_params.m_weights[1] = 1;
+		ccell_params.m_weights[2] = 1;
+		ccell_params.m_weights[3] = 1;
+
+		color_cell_compressor_results ccell_results;
+		uint8_t ccell_result_selectors[16];
+		uint8_t ccell_result_selectors_temp[16];
+		memset(&ccell_results, 0, sizeof(ccell_results));
+		ccell_results.m_pSelectors = &ccell_result_selectors[0];
+		ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0];
+		// The leading 255 is forwarded verbatim; its meaning is defined by color_cell_compression(), which is not visible here.
+		uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, &comp_params);
+
+		// ASTC: describe the result as a 1-subset, single-plane block.
+		astc_block_desc astc_results;
+		memset(&astc_results, 0, sizeof(astc_results));
+
+		astc_results.m_dual_plane = false;
+		astc_results.m_weight_range = 2;
+
+		astc_results.m_ccs = 0;
+		astc_results.m_subsets = 1;
+		astc_results.m_partition_seed = 0;
+		astc_results.m_cem = 8;
+		// Endpoints are stored interleaved per channel: [low c0, high c0, low c1, high c1, low c2, high c2].
+		astc_results.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0];
+		astc_results.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0];
+		astc_results.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1];
+		astc_results.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1];
+		astc_results.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2];
+		astc_results.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2];
+
+		const uint32_t range = 20;
+
+		bool invert = false;
+		// Reorder so the high endpoint's unquantized channel sum (s1) is not below the low endpoint's (s0).
+		int s0 = g_astc_unquant[range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[4]].m_unquant;
+		int s1 = g_astc_unquant[range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[5]].m_unquant;
+		if (s1 < s0) // NOTE(review): presumably avoids the ASTC blue-contraction decode path for CEM 8 - confirm against the ASTC spec
+		{
+			std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]);
+			std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]);
+			std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]);
+			invert = true;
+		}
+		// Copy the selectors; if the endpoints were swapped, mirror each weight (presumably so the interpolated colors are unchanged).
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				astc_results.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4];
+
+				if (invert)
+					astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4]; // 3 = highest selector index for 4 weights
+			}
+		}
+		// Append the candidate. If pResults is already full, the assert fires; when assert() is compiled out the result is silently dropped.
+		assert(total_results < MAX_ENCODE_RESULTS);
+		if (total_results < MAX_ENCODE_RESULTS)
+		{
+			pResults[total_results].m_uastc_mode = 1;
+			pResults[total_results].m_common_pattern = 0; // single-subset modes always record pattern 0
+			pResults[total_results].m_astc = astc_results;
+			pResults[total_results].m_astc_err = part_err;
+			total_results++;
+		}
+	}
+
+	static uint32_t estimate_partition2(uint32_t num_weights, uint32_t num_comps, const uint32_t* pWeights, const color_rgba block[4][4], const uint32_t weights[4])
+	{ // Returns the index (into g_astc_bc7_common_partitions2) of the 2-subset pattern with the lowest estimated error for this block.
+		assert(pWeights[0] == 0 && pWeights[num_weights - 1] == 64); // the weight table must run from 0 to 64
+
+		uint64_t best_err = UINT64_MAX;
+		uint32_t best_common_pattern = 0; // returned as-is if no pattern ever beats UINT64_MAX
+		// Try every common pattern and keep the cheapest; ties keep the earliest pattern because the comparison is strict.
+		for (uint32_t common_pattern = 0; common_pattern < TOTAL_ASTC_BC7_COMMON_PARTITIONS2; common_pattern++)
+		{
+			const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7;
+
+			const uint8_t* pPartition = &g_bc7_partition2[bc7_pattern * 16]; // 16 subset indices, one per pixel
+			// Scatter the 16 pixels (read in linear order) into their subset's color list.
+			color_quad_u8 subset_colors[2][16];
+			uint32_t subset_total_colors[2] = { 0, 0 };
+			for (uint32_t index = 0; index < 16; index++)
+				subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = ((const color_quad_u8*)block)[index];
+			// Sum the per-subset estimates, stopping early once the running total can no longer beat best_err.
+			uint64_t total_subset_err = 0;
+			for (uint32_t subset = 0; (subset < 2) && (total_subset_err < best_err); subset++)
+				total_subset_err += color_cell_compression_est_astc(num_weights, num_comps, pWeights, subset_total_colors[subset], &subset_colors[subset][0], best_err, weights); // best_err is also passed in (presumably as an early-out bound - confirm)
+
+			if (total_subset_err < best_err)
+			{
+				best_err = total_subset_err;
+				best_common_pattern = common_pattern;
+			}
+		}
+
+		return best_common_pattern;
+	}
+
+	// MODE 2: two subsets, single plane. Appends one candidate to pResults per tried partition pattern (all common patterns, or only the estimated best one).
+	// 2-subset, 3-bit indices, 4-bit endpoints, BC7 mode 1
+	// DualPlane: 0, WeightRange: 5 (8), Subsets: 2, CEM: 8 (RGB Direct       ), EndpointRange: 8 (16)          MODE1
+	static void astc_mode2(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, bool estimate_partition)
+	{
+		uint32_t first_common_pattern = 0;
+		uint32_t last_common_pattern = TOTAL_ASTC_BC7_COMMON_PARTITIONS2; // exclusive upper bound
+		// With estimate_partition set, narrow the search to the single pattern picked by estimate_partition2().
+		if (estimate_partition)
+		{
+			const uint32_t weights[4] = { 1, 1, 1, 1 };
+			first_common_pattern = estimate_partition2(8, 3, g_bc7_weights3, block, weights);
+			last_common_pattern = first_common_pattern + 1;
+		}
+
+		for (uint32_t common_pattern = first_common_pattern; common_pattern < last_common_pattern; common_pattern++)
+		{
+			const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7;
+
+			color_rgba part_pixels[2][16];
+			uint32_t part_pixel_index[4][4]; // each pixel's position inside its subset's pixel list
+			uint32_t num_part_pixels[2] = { 0, 0 };
+			// Split the block into the two BC7 subsets, remembering where each pixel landed.
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t part = g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+					part_pixel_index[y][x] = num_part_pixels[part];
+					part_pixels[part][num_part_pixels[part]++] = block[y][x];
+				}
+			}
+
+			color_cell_compressor_params ccell_params[2];
+			color_cell_compressor_results ccell_results[2];
+			uint8_t ccell_result_selectors[2][16];
+			uint8_t ccell_result_selectors_temp[2][16];
+			// Compress each subset independently and accumulate the error.
+			uint64_t total_part_err = 0;
+			for (uint32_t part = 0; part < 2; part++)
+			{
+				memset(&ccell_params[part], 0, sizeof(ccell_params[part]));
+
+				ccell_params[part].m_num_pixels = num_part_pixels[part];
+				ccell_params[part].m_pPixels = (color_quad_u8*)&part_pixels[part][0];
+				ccell_params[part].m_num_selector_weights = 8;
+				ccell_params[part].m_pSelector_weights = g_bc7_weights3;
+				ccell_params[part].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights3x;
+				ccell_params[part].m_astc_endpoint_range = 8; // same value as the local `range` used for unquantization below
+				ccell_params[part].m_weights[0] = 1; // all four per-channel error weights are equal
+				ccell_params[part].m_weights[1] = 1;
+				ccell_params[part].m_weights[2] = 1;
+				ccell_params[part].m_weights[3] = 1;
+
+				memset(&ccell_results[part], 0, sizeof(ccell_results[part]));
+				ccell_results[part].m_pSelectors = &ccell_result_selectors[part][0];
+				ccell_results[part].m_pSelectors_temp = &ccell_result_selectors_temp[part][0];
+				// The leading 255 is forwarded verbatim; its meaning is defined by color_cell_compression(), which is not visible here.
+				uint64_t part_err = color_cell_compression(255, &ccell_params[part], &ccell_results[part], &comp_params);
+				total_part_err += part_err;
+			} // part
+
+			{
+				// ASTC: describe the result as a 2-subset, single-plane block.
+				astc_block_desc astc_results;
+				memset(&astc_results, 0, sizeof(astc_results));
+
+				astc_results.m_dual_plane = false;
+				astc_results.m_weight_range = 5;
+
+				astc_results.m_ccs = 0;
+				astc_results.m_subsets = 2;
+				astc_results.m_partition_seed = g_astc_bc7_common_partitions2[common_pattern].m_astc;
+				astc_results.m_cem = 8;
+				// p0/p1 are the BC7 subsets that feed ASTC subsets 0/1; they are exchanged when the pattern's m_invert flag is set.
+				uint32_t p0 = 0;
+				uint32_t p1 = 1;
+				if (g_astc_bc7_common_partitions2[common_pattern].m_invert)
+					std::swap(p0, p1);
+				// ASTC subset 0 endpoints, interleaved per channel: [low c0, high c0, low c1, high c1, low c2, high c2].
+				astc_results.m_endpoints[0] = ccell_results[p0].m_astc_low_endpoint.m_c[0];
+				astc_results.m_endpoints[1] = ccell_results[p0].m_astc_high_endpoint.m_c[0];
+				astc_results.m_endpoints[2] = ccell_results[p0].m_astc_low_endpoint.m_c[1];
+				astc_results.m_endpoints[3] = ccell_results[p0].m_astc_high_endpoint.m_c[1];
+				astc_results.m_endpoints[4] = ccell_results[p0].m_astc_low_endpoint.m_c[2];
+				astc_results.m_endpoints[5] = ccell_results[p0].m_astc_high_endpoint.m_c[2];
+
+				const uint32_t range = 8;
+
+				bool invert[2] = { false, false }; // indexed by ASTC subset
+				// Reorder subset 0 so the high endpoint's unquantized channel sum (s1) is not below the low endpoint's (s0).
+				int s0 = g_astc_unquant[range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[4]].m_unquant;
+				int s1 = g_astc_unquant[range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[5]].m_unquant;
+				if (s1 < s0) // NOTE(review): presumably avoids the ASTC blue-contraction decode path for CEM 8 - confirm against the ASTC spec
+				{
+					std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]);
+					std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]);
+					std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]);
+					invert[0] = true;
+				}
+				// ASTC subset 1 endpoints occupy slots 6..11 with the same interleaving.
+				astc_results.m_endpoints[6] = ccell_results[p1].m_astc_low_endpoint.m_c[0];
+				astc_results.m_endpoints[7] = ccell_results[p1].m_astc_high_endpoint.m_c[0];
+				astc_results.m_endpoints[8] = ccell_results[p1].m_astc_low_endpoint.m_c[1];
+				astc_results.m_endpoints[9] = ccell_results[p1].m_astc_high_endpoint.m_c[1];
+				astc_results.m_endpoints[10] = ccell_results[p1].m_astc_low_endpoint.m_c[2];
+				astc_results.m_endpoints[11] = ccell_results[p1].m_astc_high_endpoint.m_c[2];
+
+				s0 = g_astc_unquant[range][astc_results.m_endpoints[0 + 6]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[2 + 6]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[4 + 6]].m_unquant;
+				s1 = g_astc_unquant[range][astc_results.m_endpoints[1 + 6]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[3 + 6]].m_unquant + g_astc_unquant[range][astc_results.m_endpoints[5 + 6]].m_unquant;
+				// Same reorder for subset 1.
+				if (s1 < s0)
+				{
+					std::swap(astc_results.m_endpoints[0 + 6], astc_results.m_endpoints[1 + 6]);
+					std::swap(astc_results.m_endpoints[2 + 6], astc_results.m_endpoints[3 + 6]);
+					std::swap(astc_results.m_endpoints[4 + 6], astc_results.m_endpoints[5 + 6]);
+					invert[1] = true;
+				}
+				// Gather each pixel's selector from its BC7 subset and mirror it if that subset's endpoints were swapped.
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					for (uint32_t x = 0; x < 4; x++)
+					{
+						const uint32_t bc7_part = g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+
+						astc_results.m_weights[x + y * 4] = ccell_result_selectors[bc7_part][part_pixel_index[y][x]];
+						// invert[] is indexed by ASTC subset, so translate the BC7 subset index first.
+						uint32_t astc_part = bc7_part;
+						if (g_astc_bc7_common_partitions2[common_pattern].m_invert)
+							astc_part = 1 - astc_part;
+
+						if (invert[astc_part])
+							astc_results.m_weights[x + y * 4] = 7 - astc_results.m_weights[x + y * 4]; // 7 = highest selector index for 8 weights
+					}
+				}
+				// Append the candidate. If pResults is already full, the assert fires; when assert() is compiled out the result is silently dropped.
+				assert(total_results < MAX_ENCODE_RESULTS);
+				if (total_results < MAX_ENCODE_RESULTS)
+				{
+					pResults[total_results].m_uastc_mode = 2;
+					pResults[total_results].m_common_pattern = common_pattern;
+					pResults[total_results].m_astc = astc_results;
+					pResults[total_results].m_astc_err = total_part_err;
+					total_results++;
+				}
+			}
+
+		} // common_pattern
+	}
+
+	// MODE 3: three subsets, single plane. Appends one candidate to pResults per tried partition pattern (all common patterns, or only the estimated best one).
+	// 3-subsets, 2-bit indices, [0,11] endpoints, BC7 mode 2
+	// DualPlane: 0, WeightRange: 2 (4), Subsets: 3, CEM: 8 (RGB Direct	     ), EndpointRange: 7 (12)		   MODE2
+	static void astc_mode3(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, bool estimate_partition)
+	{
+		uint32_t first_common_pattern = 0;
+		uint32_t last_common_pattern = TOTAL_ASTC_BC7_COMMON_PARTITIONS3; // exclusive upper bound
+		// With estimate_partition set, narrow the search to the single pattern with the lowest estimated error.
+		if (estimate_partition)
+		{
+			uint64_t best_err = UINT64_MAX;
+			uint32_t best_common_pattern = 0;
+			const uint32_t weights[4] = { 1, 1, 1, 1 };
+			// Ties keep the earliest pattern because the comparison below is strict.
+			for (uint32_t common_pattern = 0; common_pattern < TOTAL_ASTC_BC7_COMMON_PARTITIONS3; common_pattern++)
+			{
+				const uint32_t bc7_pattern = g_astc_bc7_common_partitions3[common_pattern].m_bc7;
+
+				const uint8_t* pPartition = &g_bc7_partition3[bc7_pattern * 16]; // 16 subset indices, one per pixel
+				// Scatter the 16 pixels (read in linear order) into their subset's color list.
+				color_quad_u8 subset_colors[3][16];
+				uint32_t subset_total_colors[3] = { 0, 0, 0 };
+				for (uint32_t index = 0; index < 16; index++)
+					subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = ((const color_quad_u8*)block)[index];
+				// Sum the per-subset estimates, stopping early once the running total can no longer beat best_err.
+				uint64_t total_subset_err = 0;
+				for (uint32_t subset = 0; (subset < 3) && (total_subset_err < best_err); subset++)
+					total_subset_err += color_cell_compression_est_astc(4, 3, g_bc7_weights2, subset_total_colors[subset], &subset_colors[subset][0], best_err, weights);
+
+				if (total_subset_err < best_err)
+				{
+					best_err = total_subset_err;
+					best_common_pattern = common_pattern;
+				}
+			}
+
+			first_common_pattern = best_common_pattern;
+			last_common_pattern = best_common_pattern + 1;
+		}
+
+		for (uint32_t common_pattern = first_common_pattern; common_pattern < last_common_pattern; common_pattern++)
+		{
+			const uint32_t endpoint_range = 7;
+
+			const uint32_t bc7_pattern = g_astc_bc7_common_partitions3[common_pattern].m_bc7;
+
+			color_rgba part_pixels[3][16];
+			uint32_t part_pixel_index[4][4]; // each pixel's position inside its subset's pixel list
+			uint32_t num_part_pixels[3] = { 0, 0, 0 };
+			// Split the block into the three BC7 subsets, remembering where each pixel landed.
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t bc7_part = g_bc7_partition3[16 * bc7_pattern + x + y * 4];
+					part_pixel_index[y][x] = num_part_pixels[bc7_part];
+					part_pixels[bc7_part][num_part_pixels[bc7_part]++] = block[y][x];
+				}
+			}
+
+			color_cell_compressor_params ccell_params[3];
+			color_cell_compressor_results ccell_results[3];
+			uint8_t ccell_result_selectors[3][16];
+			uint8_t ccell_result_selectors_temp[3][16];
+			// Compress each subset independently and accumulate the error.
+			uint64_t total_part_err = 0;
+			for (uint32_t bc7_part = 0; bc7_part < 3; bc7_part++)
+			{
+				memset(&ccell_params[bc7_part], 0, sizeof(ccell_params[bc7_part]));
+
+				ccell_params[bc7_part].m_num_pixels = num_part_pixels[bc7_part];
+				ccell_params[bc7_part].m_pPixels = (color_quad_u8*)&part_pixels[bc7_part][0];
+				ccell_params[bc7_part].m_num_selector_weights = 4;
+				ccell_params[bc7_part].m_pSelector_weights = g_bc7_weights2;
+				ccell_params[bc7_part].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+				ccell_params[bc7_part].m_astc_endpoint_range = endpoint_range;
+				ccell_params[bc7_part].m_weights[0] = 1; // all four per-channel error weights are equal
+				ccell_params[bc7_part].m_weights[1] = 1;
+				ccell_params[bc7_part].m_weights[2] = 1;
+				ccell_params[bc7_part].m_weights[3] = 1;
+
+				memset(&ccell_results[bc7_part], 0, sizeof(ccell_results[bc7_part]));
+				ccell_results[bc7_part].m_pSelectors = &ccell_result_selectors[bc7_part][0];
+				ccell_results[bc7_part].m_pSelectors_temp = &ccell_result_selectors_temp[bc7_part][0];
+				// The leading 255 is forwarded verbatim; its meaning is defined by color_cell_compression(), which is not visible here.
+				uint64_t part_err = color_cell_compression(255, &ccell_params[bc7_part], &ccell_results[bc7_part], &comp_params);
+				total_part_err += part_err;
+			} // part
+
+			{
+				// ASTC: describe the result as a 3-subset, single-plane block.
+				astc_block_desc astc_results;
+				memset(&astc_results, 0, sizeof(astc_results));
+
+				astc_results.m_dual_plane = false;
+				astc_results.m_weight_range = 2;
+
+				astc_results.m_ccs = 0;
+				astc_results.m_subsets = 3;
+				astc_results.m_partition_seed = g_astc_bc7_common_partitions3[common_pattern].m_astc;
+				astc_results.m_cem = 8;
+
+				uint32_t astc_to_bc7_part[3]; // converts ASTC to BC7 partition index
+				const uint32_t perm = g_astc_bc7_common_partitions3[common_pattern].m_astc_to_bc7_perm;
+				astc_to_bc7_part[0] = g_astc_to_bc7_partition_index_perm_tables[perm][0];
+				astc_to_bc7_part[1] = g_astc_to_bc7_partition_index_perm_tables[perm][1];
+				astc_to_bc7_part[2] = g_astc_to_bc7_partition_index_perm_tables[perm][2];
+
+				bool invert_astc_part[3] = { false, false, false }; // indexed by ASTC subset
+				// Each ASTC subset owns 6 endpoint slots: [low c0, high c0, low c1, high c1, low c2, high c2].
+				for (uint32_t astc_part = 0; astc_part < 3; astc_part++)
+				{
+					uint8_t* pEndpoints = &astc_results.m_endpoints[6 * astc_part];
+
+					pEndpoints[0] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_low_endpoint.m_c[0];
+					pEndpoints[1] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_high_endpoint.m_c[0];
+					pEndpoints[2] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_low_endpoint.m_c[1];
+					pEndpoints[3] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_high_endpoint.m_c[1];
+					pEndpoints[4] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_low_endpoint.m_c[2];
+					pEndpoints[5] = ccell_results[astc_to_bc7_part[astc_part]].m_astc_high_endpoint.m_c[2];
+					// Reorder so the high endpoint's unquantized channel sum (s1) is not below the low endpoint's (s0).
+					int s0 = g_astc_unquant[endpoint_range][pEndpoints[0]].m_unquant + g_astc_unquant[endpoint_range][pEndpoints[2]].m_unquant + g_astc_unquant[endpoint_range][pEndpoints[4]].m_unquant;
+					int s1 = g_astc_unquant[endpoint_range][pEndpoints[1]].m_unquant + g_astc_unquant[endpoint_range][pEndpoints[3]].m_unquant + g_astc_unquant[endpoint_range][pEndpoints[5]].m_unquant;
+					if (s1 < s0) // NOTE(review): presumably avoids the ASTC blue-contraction decode path for CEM 8 - confirm against the ASTC spec
+					{
+						std::swap(pEndpoints[0], pEndpoints[1]);
+						std::swap(pEndpoints[2], pEndpoints[3]);
+						std::swap(pEndpoints[4], pEndpoints[5]);
+						invert_astc_part[astc_part] = true;
+					}
+				}
+				// Gather each pixel's selector from its BC7 subset and mirror it if that subset's endpoints were swapped.
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					for (uint32_t x = 0; x < 4; x++)
+					{
+						const uint32_t bc7_part = g_bc7_partition3[16 * bc7_pattern + x + y * 4];
+
+						astc_results.m_weights[x + y * 4] = ccell_result_selectors[bc7_part][part_pixel_index[y][x]];
+						// invert_astc_part[] is indexed by ASTC subset: find the ASTC subset that maps to this BC7 subset (falls back to 0 if none matches).
+						uint32_t astc_part = 0;
+						for (uint32_t i = 0; i < 3; i++)
+						{
+							if (astc_to_bc7_part[i] == bc7_part)
+							{
+								astc_part = i;
+								break;
+							}
+						}
+
+						if (invert_astc_part[astc_part])
+							astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4]; // 3 = highest selector index for 4 weights
+					}
+				}
+				// Append the candidate. If pResults is already full, the assert fires; when assert() is compiled out the result is silently dropped.
+				assert(total_results < MAX_ENCODE_RESULTS);
+				if (total_results < MAX_ENCODE_RESULTS)
+				{
+					pResults[total_results].m_uastc_mode = 3;
+					pResults[total_results].m_common_pattern = common_pattern;
+					pResults[total_results].m_astc = astc_results;
+					pResults[total_results].m_astc_err = total_part_err;
+					total_results++;
+				}
+
+			}
+
+		} // common_pattern
+	}
+
+	// MODE 4: two subsets, single plane, 4 selector weights. Appends one candidate to pResults per tried partition pattern (all common patterns, or only the estimated best one).
+	// DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 8 (RGB Direct       ), EndpointRange: 12 (40)         MODE3
+	static void astc_mode4(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, bool estimate_partition)
+	{
+		// The weight range (2) is written directly into astc_results.m_weight_range below.
+		const uint32_t endpoint_range = 12;
+
+		uint32_t first_common_pattern = 0;
+		uint32_t last_common_pattern = TOTAL_ASTC_BC7_COMMON_PARTITIONS2; // exclusive upper bound
+		// With estimate_partition set, narrow the search to the single pattern picked by estimate_partition2().
+		if (estimate_partition)
+		{
+			const uint32_t weights[4] = { 1, 1, 1, 1 };
+			first_common_pattern = estimate_partition2(4, 3, g_bc7_weights2, block, weights);
+			last_common_pattern = first_common_pattern + 1;
+		}
+
+		for (uint32_t common_pattern = first_common_pattern; common_pattern < last_common_pattern; common_pattern++)
+		{
+			const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7;
+
+			color_rgba part_pixels[2][16];
+			uint32_t part_pixel_index[4][4]; // each pixel's position inside its subset's pixel list
+			uint32_t num_part_pixels[2] = { 0, 0 };
+			// Split the block into the two BC7 subsets, remembering where each pixel landed.
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t part = g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+					part_pixel_index[y][x] = num_part_pixels[part];
+					part_pixels[part][num_part_pixels[part]++] = block[y][x];
+				}
+			}
+
+			color_cell_compressor_params ccell_params[2];
+			color_cell_compressor_results ccell_results[2];
+			uint8_t ccell_result_selectors[2][16];
+			uint8_t ccell_result_selectors_temp[2][16];
+			// Compress each subset independently and accumulate the error.
+			uint64_t total_part_err = 0;
+			for (uint32_t part = 0; part < 2; part++)
+			{
+				memset(&ccell_params[part], 0, sizeof(ccell_params[part]));
+
+				ccell_params[part].m_num_pixels = num_part_pixels[part];
+				ccell_params[part].m_pPixels = (color_quad_u8*)&part_pixels[part][0];
+				ccell_params[part].m_num_selector_weights = 4;
+				ccell_params[part].m_pSelector_weights = g_bc7_weights2;
+				ccell_params[part].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+				ccell_params[part].m_astc_endpoint_range = endpoint_range;
+				ccell_params[part].m_weights[0] = 1; // all four per-channel error weights are equal
+				ccell_params[part].m_weights[1] = 1;
+				ccell_params[part].m_weights[2] = 1;
+				ccell_params[part].m_weights[3] = 1;
+
+				memset(&ccell_results[part], 0, sizeof(ccell_results[part]));
+				ccell_results[part].m_pSelectors = &ccell_result_selectors[part][0];
+				ccell_results[part].m_pSelectors_temp = &ccell_result_selectors_temp[part][0];
+				// The leading 255 is forwarded verbatim; its meaning is defined by color_cell_compression(), which is not visible here.
+				uint64_t part_err = color_cell_compression(255, &ccell_params[part], &ccell_results[part], &comp_params);
+				total_part_err += part_err;
+			} // part
+
+			// ASTC: describe the result as a 2-subset, single-plane block.
+			astc_block_desc astc_results;
+			memset(&astc_results, 0, sizeof(astc_results));
+
+			astc_results.m_dual_plane = false;
+			astc_results.m_weight_range = 2;
+
+			astc_results.m_ccs = 0;
+			astc_results.m_subsets = 2;
+			astc_results.m_partition_seed = g_astc_bc7_common_partitions2[common_pattern].m_astc;
+			astc_results.m_cem = 8;
+			// p0/p1 are the BC7 subsets that feed ASTC subsets 0/1; they are exchanged when the pattern's m_invert flag is set.
+			uint32_t p0 = 0;
+			uint32_t p1 = 1;
+			if (g_astc_bc7_common_partitions2[common_pattern].m_invert)
+				std::swap(p0, p1);
+			// ASTC subset 0 endpoints, interleaved per channel: [low c0, high c0, low c1, high c1, low c2, high c2].
+			astc_results.m_endpoints[0] = ccell_results[p0].m_astc_low_endpoint.m_c[0];
+			astc_results.m_endpoints[1] = ccell_results[p0].m_astc_high_endpoint.m_c[0];
+			astc_results.m_endpoints[2] = ccell_results[p0].m_astc_low_endpoint.m_c[1];
+			astc_results.m_endpoints[3] = ccell_results[p0].m_astc_high_endpoint.m_c[1];
+			astc_results.m_endpoints[4] = ccell_results[p0].m_astc_low_endpoint.m_c[2];
+			astc_results.m_endpoints[5] = ccell_results[p0].m_astc_high_endpoint.m_c[2];
+
+			bool invert[2] = { false, false }; // indexed by ASTC subset
+			// Reorder subset 0 so the high endpoint's unquantized channel sum (s1) is not below the low endpoint's (s0).
+			int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[4]].m_unquant;
+			int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5]].m_unquant;
+			if (s1 < s0) // NOTE(review): presumably avoids the ASTC blue-contraction decode path for CEM 8 - confirm against the ASTC spec
+			{
+				std::swap(astc_results.m_endpoints[0], astc_results.m_endpoints[1]);
+				std::swap(astc_results.m_endpoints[2], astc_results.m_endpoints[3]);
+				std::swap(astc_results.m_endpoints[4], astc_results.m_endpoints[5]);
+				invert[0] = true;
+			}
+			// ASTC subset 1 endpoints occupy slots 6..11 with the same interleaving.
+			astc_results.m_endpoints[6] = ccell_results[p1].m_astc_low_endpoint.m_c[0];
+			astc_results.m_endpoints[7] = ccell_results[p1].m_astc_high_endpoint.m_c[0];
+			astc_results.m_endpoints[8] = ccell_results[p1].m_astc_low_endpoint.m_c[1];
+			astc_results.m_endpoints[9] = ccell_results[p1].m_astc_high_endpoint.m_c[1];
+			astc_results.m_endpoints[10] = ccell_results[p1].m_astc_low_endpoint.m_c[2];
+			astc_results.m_endpoints[11] = ccell_results[p1].m_astc_high_endpoint.m_c[2];
+
+			s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[0 + 6]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[2 + 6]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[4 + 6]].m_unquant;
+			s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[1 + 6]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[3 + 6]].m_unquant + g_astc_unquant[endpoint_range][astc_results.m_endpoints[5 + 6]].m_unquant;
+			// Same reorder for subset 1.
+			if (s1 < s0)
+			{
+				std::swap(astc_results.m_endpoints[0 + 6], astc_results.m_endpoints[1 + 6]);
+				std::swap(astc_results.m_endpoints[2 + 6], astc_results.m_endpoints[3 + 6]);
+				std::swap(astc_results.m_endpoints[4 + 6], astc_results.m_endpoints[5 + 6]);
+				invert[1] = true;
+			}
+			// Gather each pixel's selector from its BC7 subset and mirror it if that subset's endpoints were swapped.
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t bc7_part = g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+
+					astc_results.m_weights[x + y * 4] = ccell_result_selectors[bc7_part][part_pixel_index[y][x]];
+					// invert[] is indexed by ASTC subset, so translate the BC7 subset index first.
+					uint32_t astc_part = bc7_part;
+					if (g_astc_bc7_common_partitions2[common_pattern].m_invert)
+						astc_part = 1 - astc_part;
+
+					if (invert[astc_part])
+						astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4]; // 3 = highest selector index for 4 weights
+				}
+			}
+			// Append the candidate. If pResults is already full, the assert fires; when assert() is compiled out the result is silently dropped.
+			assert(total_results < MAX_ENCODE_RESULTS);
+			if (total_results < MAX_ENCODE_RESULTS)
+			{
+				pResults[total_results].m_uastc_mode = 4;
+				pResults[total_results].m_common_pattern = common_pattern;
+				pResults[total_results].m_astc = astc_results;
+				pResults[total_results].m_astc_err = total_part_err;
+				total_results++;
+			}
+
+		} // common_pattern
+	}
+
+	// MODE 5: single subset, single plane. Fits one endpoint pair to all 16 pixels with 8 selector weights and appends one candidate to pResults.
+	// DualPlane: 0, WeightRange: 5 (8), Subsets: 1, CEM: 8 (RGB Direct       ), EndpointRange: 20 (256) 		BC7 MODE 6 (or MODE 1 1-subset)
+	static void astc_mode5(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		const uint32_t weight_range = 5;
+		const uint32_t endpoint_range = 20;
+
+		color_cell_compressor_params ccell_params;
+		memset(&ccell_params, 0, sizeof(ccell_params));
+
+		ccell_params.m_num_pixels = 16;
+		ccell_params.m_pPixels = (color_quad_u8*)&block[0][0];
+		ccell_params.m_num_selector_weights = 8;
+		ccell_params.m_pSelector_weights = g_bc7_weights3;
+		ccell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights3x;
+		ccell_params.m_astc_endpoint_range = endpoint_range;
+		ccell_params.m_weights[0] = 1; // all four per-channel error weights are equal
+		ccell_params.m_weights[1] = 1;
+		ccell_params.m_weights[2] = 1;
+		ccell_params.m_weights[3] = 1;
+
+		color_cell_compressor_results ccell_results;
+		uint8_t ccell_result_selectors[16];
+		uint8_t ccell_result_selectors_temp[16];
+		memset(&ccell_results, 0, sizeof(ccell_results));
+		ccell_results.m_pSelectors = &ccell_result_selectors[0];
+		ccell_results.m_pSelectors_temp = &ccell_result_selectors_temp[0];
+		// The leading 255 is forwarded verbatim; its meaning is defined by color_cell_compression(), which is not visible here.
+		uint64_t part_err = color_cell_compression(255, &ccell_params, &ccell_results, &comp_params);
+
+		// ASTC: describe the result as a 1-subset, single-plane block.
+		astc_block_desc blk;
+		memset(&blk, 0, sizeof(blk));
+
+		blk.m_dual_plane = false;
+		blk.m_weight_range = weight_range;
+
+		blk.m_ccs = 0;
+		blk.m_subsets = 1;
+		blk.m_partition_seed = 0;
+		blk.m_cem = 8;
+		// Endpoints are stored interleaved per channel: [low c0, high c0, low c1, high c1, low c2, high c2].
+		blk.m_endpoints[0] = ccell_results.m_astc_low_endpoint.m_c[0];
+		blk.m_endpoints[1] = ccell_results.m_astc_high_endpoint.m_c[0];
+		blk.m_endpoints[2] = ccell_results.m_astc_low_endpoint.m_c[1];
+		blk.m_endpoints[3] = ccell_results.m_astc_high_endpoint.m_c[1];
+		blk.m_endpoints[4] = ccell_results.m_astc_low_endpoint.m_c[2];
+		blk.m_endpoints[5] = ccell_results.m_astc_high_endpoint.m_c[2];
+
+		bool invert = false;
+		// Reorder so the high endpoint's unquantized channel sum (s1) is not below the low endpoint's (s0).
+		int s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4]].m_unquant;
+		int s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5]].m_unquant;
+		if (s1 < s0) // NOTE(review): presumably avoids the ASTC blue-contraction decode path for CEM 8 - confirm against the ASTC spec
+		{
+			std::swap(blk.m_endpoints[0], blk.m_endpoints[1]);
+			std::swap(blk.m_endpoints[2], blk.m_endpoints[3]);
+			std::swap(blk.m_endpoints[4], blk.m_endpoints[5]);
+			invert = true;
+		}
+		// Copy the selectors; if the endpoints were swapped, mirror each weight (presumably so the interpolated colors are unchanged).
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				blk.m_weights[x + y * 4] = ccell_result_selectors[x + y * 4];
+
+				if (invert)
+					blk.m_weights[x + y * 4] = 7 - blk.m_weights[x + y * 4]; // 7 = highest selector index for 8 weights
+			}
+		}
+		// Append the candidate. If pResults is already full, the assert fires; when assert() is compiled out the result is silently dropped.
+		assert(total_results < MAX_ENCODE_RESULTS);
+		if (total_results < MAX_ENCODE_RESULTS)
+		{
+			pResults[total_results].m_uastc_mode = 5;
+			pResults[total_results].m_common_pattern = 0; // single-subset modes always record pattern 0
+			pResults[total_results].m_astc = blk;
+			pResults[total_results].m_astc_err = part_err;
+			total_results++;
+		}
+	}
+
+	// MODE 6
+	// DualPlane: 1, WeightRange: 2 (4), Subsets: 1, CEM: 8 (RGB Direct       ), EndpointRange: 18 (160)		BC7 MODE5
+	// Produces three dual-plane candidates, one per choice of the channel (R, G, then B) that gets its own weight plane.
+	// Each candidate is appended to pResults, advancing total_results, while fewer than MAX_ENCODE_RESULTS are stored.
+	static void astc_mode6(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		const uint32_t weight_range = 2;
+		const uint32_t endpoint_range = 18;
+
+		// Plane 0 holds the color with the rotated channel forced to 255.
+		// Plane 1 holds only the rotated channel, replicated into R, G and B (alpha 255).
+		const uint32_t COLOR_PLANE = 0;
+		const uint32_t ROT_PLANE = 1;
+		const uint32_t NUM_PLANES = 2;
+
+		const color_quad_u8* pSrc_pixels = (color_quad_u8*)&block[0][0];
+
+		for (uint32_t rot_comp = 0; rot_comp < 3; rot_comp++)
+		{
+			color_quad_u8 plane_pixels[NUM_PLANES][16];
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				const color_quad_u8 src = pSrc_pixels[i];
+				const uint8_t rot_val = src.m_c[rot_comp];
+
+				color_quad_u8& color_px = plane_pixels[COLOR_PLANE][i];
+				color_px = src;
+				color_px.m_c[rot_comp] = 255;
+
+				color_quad_u8& rot_px = plane_pixels[ROT_PLANE][i];
+				rot_px.m_c[0] = rot_val;
+				rot_px.m_c[1] = rot_val;
+				rot_px.m_c[2] = rot_val;
+				rot_px.m_c[3] = 255;
+			}
+
+			color_cell_compressor_params plane_params[NUM_PLANES];
+			color_cell_compressor_results plane_results[NUM_PLANES];
+			uint8_t plane_selectors[NUM_PLANES][16];
+			uint8_t selectors_scratch[16]; // one scratch buffer serves both runs
+			uint64_t plane_err[NUM_PLANES];
+
+			// Compress the color plane first, then the rotated plane, with identical settings.
+			for (uint32_t plane = 0; plane < NUM_PLANES; plane++)
+			{
+				color_cell_compressor_params& params = plane_params[plane];
+				memset(&params, 0, sizeof(params));
+
+				params.m_num_pixels = 16;
+				params.m_pPixels = plane_pixels[plane];
+				params.m_num_selector_weights = 4;
+				params.m_pSelector_weights = g_bc7_weights2;
+				params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+				params.m_astc_endpoint_range = endpoint_range;
+				params.m_weights[0] = 1;
+				params.m_weights[1] = 1;
+				params.m_weights[2] = 1;
+				params.m_weights[3] = 1;
+
+				color_cell_compressor_results& results = plane_results[plane];
+				memset(&results, 0, sizeof(results));
+				results.m_pSelectors = &plane_selectors[plane][0];
+				results.m_pSelectors_temp = &selectors_scratch[0];
+
+				plane_err[plane] = color_cell_compression(255, &params, &results, &comp_params);
+			}
+
+			// The rotated plane's error is divided by 3 (its one value was replicated into three channels above).
+			const uint64_t total_err = plane_err[COLOR_PLANE] + (plane_err[ROT_PLANE] / 3);
+
+			// ASTC
+			astc_block_desc blk;
+			memset(&blk, 0, sizeof(blk));
+
+			blk.m_dual_plane = true;
+			blk.m_weight_range = weight_range;
+			blk.m_ccs = rot_comp;
+			blk.m_subsets = 1;
+			blk.m_partition_seed = 0;
+			blk.m_cem = 8;
+
+			// Channel rot_comp takes its endpoints from the rotated plane, the other two channels from the color plane.
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				const color_cell_compressor_results& src = plane_results[(c == rot_comp) ? ROT_PLANE : COLOR_PLANE];
+				blk.m_endpoints[c * 2 + 0] = src.m_astc_low_endpoint.m_c[c];
+				blk.m_endpoints[c * 2 + 1] = src.m_astc_high_endpoint.m_c[c];
+			}
+
+			int low_sum = 0;
+			int high_sum = 0;
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				low_sum += g_astc_unquant[endpoint_range][blk.m_endpoints[c * 2 + 0]].m_unquant;
+				high_sum += g_astc_unquant[endpoint_range][blk.m_endpoints[c * 2 + 1]].m_unquant;
+			}
+
+			// If the high endpoint sums lower than the low one, exchange them and mirror both planes' weights (3 - w).
+			const bool flip = (high_sum < low_sum);
+			if (flip)
+			{
+				for (uint32_t c = 0; c < 3; c++)
+					std::swap(blk.m_endpoints[c * 2 + 0], blk.m_endpoints[c * 2 + 1]);
+			}
+
+			// Weights are interleaved per texel: color plane weight first, rotated plane weight second.
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				uint32_t color_w = plane_selectors[COLOR_PLANE][i];
+				uint32_t rot_w = plane_selectors[ROT_PLANE][i];
+
+				if (flip)
+				{
+					color_w = 3 - color_w;
+					rot_w = 3 - rot_w;
+				}
+
+				blk.m_weights[i * 2 + 0] = (uint8_t)color_w;
+				blk.m_weights[i * 2 + 1] = (uint8_t)rot_w;
+			}
+
+			assert(total_results < MAX_ENCODE_RESULTS);
+			if (total_results < MAX_ENCODE_RESULTS)
+			{
+				uastc_encode_results& out = pResults[total_results];
+				out.m_uastc_mode = 6;
+				out.m_common_pattern = 0;
+				out.m_astc = blk;
+				out.m_astc_err = total_err;
+				total_results++;
+			}
+		} // rot_comp
+	}
+
+	// MODE 7 - 2 subset ASTC, 3 subset BC7. Encodes the block once per candidate common partition pattern and appends each result to pResults (advancing total_results, bounded by MAX_ENCODE_RESULTS).
+	// DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 8 (RGB Direct       ), EndpointRange: 12 (40)         MODE2
+	static void astc_mode7(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, bool estimate_partition)
+	{
+		uint32_t first_common_pattern = 0;
+		uint32_t last_common_pattern = TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS;
+		// With estimate_partition set, score every common pattern with the estimator and fully encode only the best one; otherwise every pattern is encoded.
+		if (estimate_partition)
+		{
+			uint64_t best_err = UINT64_MAX;
+			uint32_t best_common_pattern = 0;
+			const uint32_t weights[4] = { 1, 1, 1, 1 };
+
+			for (uint32_t common_pattern = 0; common_pattern < TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS; common_pattern++)
+			{
+				const uint8_t* pPartition = &g_bc7_3_astc2_patterns2[common_pattern][0];
+				// Debug builds cross-check the precomputed pattern table against the converted BC7 partition and the computed ASTC texel partition.
+#ifdef _DEBUG
+				const uint32_t astc_pattern = g_bc7_3_astc2_common_partitions[common_pattern].m_astc2;
+				const uint32_t bc7_pattern = g_bc7_3_astc2_common_partitions[common_pattern].m_bc73;
+				const uint32_t common_pattern_k = g_bc7_3_astc2_common_partitions[common_pattern].k;
+
+				for (uint32_t y = 0; y < 4; y++)
+				{
+					for (uint32_t x = 0; x < 4; x++)
+					{
+						const uint32_t astc_part = bc7_convert_partition_index_3_to_2(g_bc7_partition3[16 * bc7_pattern + x + y * 4], common_pattern_k);
+						assert((int)astc_part == astc_compute_texel_partition(astc_pattern, x, y, 0, 2, true));
+						assert(astc_part == pPartition[x + y * 4]);
+					}
+				}
+#endif
+				// Split the 16 texels into the pattern's two subsets.
+				color_quad_u8 subset_colors[2][16];
+				uint32_t subset_total_colors[2] = { 0, 0 };
+				for (uint32_t index = 0; index < 16; index++)
+					subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = ((const color_quad_u8*)block)[index];
+				// Add up the estimated error of both subsets; the second subset is skipped once the running total has reached best_err.
+				uint64_t total_subset_err = 0;
+				for (uint32_t subset = 0; (subset < 2) && (total_subset_err < best_err); subset++)
+					total_subset_err += color_cell_compression_est_astc(4, 3, g_bc7_weights2, subset_total_colors[subset], &subset_colors[subset][0], best_err, weights);
+
+				if (total_subset_err < best_err)
+				{
+					best_err = total_subset_err;
+					best_common_pattern = common_pattern;
+				}
+			}
+
+			first_common_pattern = best_common_pattern;
+			last_common_pattern = best_common_pattern + 1;
+		}
+
+		//const uint32_t weight_range = 2;
+		const uint32_t endpoint_range = 12;
+		// Fully encode each pattern in [first_common_pattern, last_common_pattern).
+		for (uint32_t common_pattern = first_common_pattern; common_pattern < last_common_pattern; common_pattern++)
+		{
+			const uint32_t astc_pattern = g_bc7_3_astc2_common_partitions[common_pattern].m_astc2;
+			const uint32_t bc7_pattern = g_bc7_3_astc2_common_partitions[common_pattern].m_bc73;
+			const uint32_t common_pattern_k = g_bc7_3_astc2_common_partitions[common_pattern].k;
+
+			color_rgba part_pixels[2][16];
+			uint32_t part_pixel_index[4][4];
+			uint32_t num_part_pixels[2] = { 0, 0 };
+			// Gather each subset's texels in raster order; part_pixel_index records where every texel landed inside its subset.
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t astc_part = bc7_convert_partition_index_3_to_2(g_bc7_partition3[16 * bc7_pattern + x + y * 4], common_pattern_k);
+#ifdef _DEBUG					
+					assert((int)astc_part == astc_compute_texel_partition(astc_pattern, x, y, 0, 2, true));
+#endif					
+
+					part_pixel_index[y][x] = num_part_pixels[astc_part];
+					part_pixels[astc_part][num_part_pixels[astc_part]++] = block[y][x];
+				}
+			}
+			// Fit endpoints and selectors to each subset independently and accumulate the reported error.
+			color_cell_compressor_params ccell_params[2];
+			color_cell_compressor_results ccell_results[2];
+			uint8_t ccell_result_selectors[2][16];
+			uint8_t ccell_result_selectors_temp[2][16];
+
+			uint64_t total_part_err = 0;
+			for (uint32_t part = 0; part < 2; part++)
+			{
+				memset(&ccell_params[part], 0, sizeof(ccell_params[part]));
+
+				ccell_params[part].m_num_pixels = num_part_pixels[part];
+				ccell_params[part].m_pPixels = (color_quad_u8*)&part_pixels[part][0];
+				ccell_params[part].m_num_selector_weights = 4;
+				ccell_params[part].m_pSelector_weights = g_bc7_weights2;
+				ccell_params[part].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+				ccell_params[part].m_astc_endpoint_range = endpoint_range;
+				ccell_params[part].m_weights[0] = 1;
+				ccell_params[part].m_weights[1] = 1;
+				ccell_params[part].m_weights[2] = 1;
+				ccell_params[part].m_weights[3] = 1;
+
+				memset(&ccell_results[part], 0, sizeof(ccell_results[part]));
+				ccell_results[part].m_pSelectors = &ccell_result_selectors[part][0];
+				ccell_results[part].m_pSelectors_temp = &ccell_result_selectors_temp[part][0];
+
+				uint64_t part_err = color_cell_compression(255, &ccell_params[part], &ccell_results[part], &comp_params);
+				total_part_err += part_err;
+			} // part
+
+			// ASTC
+			astc_block_desc blk;
+			memset(&blk, 0, sizeof(blk));
+
+			blk.m_dual_plane = false;
+			blk.m_weight_range = 2;
+
+			blk.m_ccs = 0;
+			blk.m_subsets = 2;
+			blk.m_partition_seed = astc_pattern;
+			blk.m_cem = 8;
+
+			const uint32_t p0 = 0;
+			const uint32_t p1 = 1;
+			// Subset 0 endpoints occupy m_endpoints[0..5] as interleaved low/high pairs for R, G and B.
+			blk.m_endpoints[0] = ccell_results[p0].m_astc_low_endpoint.m_c[0];
+			blk.m_endpoints[1] = ccell_results[p0].m_astc_high_endpoint.m_c[0];
+			blk.m_endpoints[2] = ccell_results[p0].m_astc_low_endpoint.m_c[1];
+			blk.m_endpoints[3] = ccell_results[p0].m_astc_high_endpoint.m_c[1];
+			blk.m_endpoints[4] = ccell_results[p0].m_astc_low_endpoint.m_c[2];
+			blk.m_endpoints[5] = ccell_results[p0].m_astc_high_endpoint.m_c[2];
+
+			bool invert[2] = { false, false };
+			// If the high endpoint's unquantized component sum is below the low endpoint's, swap the pair and remember to mirror this subset's weights.
+			int s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4]].m_unquant;
+			int s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5]].m_unquant;
+			if (s1 < s0)
+			{
+				std::swap(blk.m_endpoints[0], blk.m_endpoints[1]);
+				std::swap(blk.m_endpoints[2], blk.m_endpoints[3]);
+				std::swap(blk.m_endpoints[4], blk.m_endpoints[5]);
+				invert[0] = true;
+			}
+			// Subset 1 endpoints occupy m_endpoints[6..11] and go through the same ordering test.
+			blk.m_endpoints[6] = ccell_results[p1].m_astc_low_endpoint.m_c[0];
+			blk.m_endpoints[7] = ccell_results[p1].m_astc_high_endpoint.m_c[0];
+			blk.m_endpoints[8] = ccell_results[p1].m_astc_low_endpoint.m_c[1];
+			blk.m_endpoints[9] = ccell_results[p1].m_astc_high_endpoint.m_c[1];
+			blk.m_endpoints[10] = ccell_results[p1].m_astc_low_endpoint.m_c[2];
+			blk.m_endpoints[11] = ccell_results[p1].m_astc_high_endpoint.m_c[2];
+
+			s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0 + 6]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2 + 6]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4 + 6]].m_unquant;
+			s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1 + 6]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3 + 6]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5 + 6]].m_unquant;
+
+			if (s1 < s0)
+			{
+				std::swap(blk.m_endpoints[0 + 6], blk.m_endpoints[1 + 6]);
+				std::swap(blk.m_endpoints[2 + 6], blk.m_endpoints[3 + 6]);
+				std::swap(blk.m_endpoints[4 + 6], blk.m_endpoints[5 + 6]);
+				invert[1] = true;
+			}
+			// Scatter the per-subset selectors back into raster order, mirroring (3 - w) wherever that subset's endpoints were swapped.
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t astc_part = bc7_convert_partition_index_3_to_2(g_bc7_partition3[16 * bc7_pattern + x + y * 4], common_pattern_k);
+
+					blk.m_weights[x + y * 4] = ccell_result_selectors[astc_part][part_pixel_index[y][x]];
+
+					if (invert[astc_part])
+						blk.m_weights[x + y * 4] = 3 - blk.m_weights[x + y * 4];
+				}
+			}
+
+			assert(total_results < MAX_ENCODE_RESULTS);
+			if (total_results < MAX_ENCODE_RESULTS)
+			{
+				pResults[total_results].m_uastc_mode = 7;
+				pResults[total_results].m_common_pattern = common_pattern;
+				pResults[total_results].m_astc = blk;
+				pResults[total_results].m_astc_err = total_part_err;
+				total_results++;
+			}
+
+		} // common_pattern
+	}
+
+	static void estimate_partition2_list(uint32_t num_weights, uint32_t num_comps, const uint32_t* pWeights, const color_rgba block[4][4], uint32_t* pParts, uint32_t max_parts, const uint32_t weights[4])
+	{
+		assert(pWeights[0] == 0 && pWeights[num_weights - 1] == 64);
+
+		const uint32_t MAX_PARTS = 8;
+		assert(max_parts <= MAX_PARTS);
+
+		uint64_t part_error[MAX_PARTS];
+		memset(part_error, 0xFF, sizeof(part_error));
+		memset(pParts, 0, sizeof(pParts[0]) * max_parts);
+
+		for (uint32_t common_pattern = 0; common_pattern < TOTAL_ASTC_BC7_COMMON_PARTITIONS2; common_pattern++)
+		{
+			const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7;
+
+			const uint8_t* pPartition = &g_bc7_partition2[bc7_pattern * 16];
+
+			color_quad_u8 subset_colors[2][16];
+			uint32_t subset_total_colors[2] = { 0, 0 };
+			for (uint32_t index = 0; index < 16; index++)
+				subset_colors[pPartition[index]][subset_total_colors[pPartition[index]]++] = ((const color_quad_u8*)block)[index];
+
+			uint64_t total_subset_err = 0;
+			for (uint32_t subset = 0; subset < 2; subset++)
+				total_subset_err += color_cell_compression_est_astc(num_weights, num_comps, pWeights, subset_total_colors[subset], &subset_colors[subset][0], UINT64_MAX, weights);
+
+			for (int i = 0; i < (int)max_parts; i++)
+			{
+				if (total_subset_err < part_error[i])
+				{
+					for (int j = max_parts - 1; j > i; --j)
+					{
+						pParts[j] = pParts[j - 1];
+						part_error[j] = part_error[j - 1];
+					}
+
+					pParts[i] = common_pattern;
+					part_error[i] = total_subset_err;
+
+					break;
+				}
+			}
+		}
+
+#ifdef _DEBUG
+		for (uint32_t i = 0; i < max_parts - 1; i++)
+		{
+			assert(part_error[i] <= part_error[i + 1]);
+		}
+#endif
+	}
+		
+	// 9. DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 12 (RGBA Direct), EndpointRange: 8 (16) - BC7 MODE 7
+	// 16. DualPlane: 0, WeightRange : 2 (4), Subsets : 2, CEM: 4 (LA Direct), EndpointRange : 20 (256) - BC7 MODE 7
+	static void astc_mode9_or_16(uint32_t mode, const color_rgba source_block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params, uint32_t estimate_partition_list_size)
+	{
+		assert(mode == 9 || mode == 16);
+		// Shared encoder for modes 9 and 16: encodes the block once per selected common 2-subset partition pattern and appends each result to pResults (advancing total_results, bounded by MAX_ENCODE_RESULTS).
+		const color_rgba* pBlock = &source_block[0][0];
+		// Mode 16 works on a private copy of the block holding (luminance, 0, 0, alpha) per texel; the asserts expect R == G == B on input.
+		color_rgba temp_block[16];
+		if (mode == 16)
+		{
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				if (mode == 16) // NOTE(review): always true inside the enclosing mode == 16 branch
+				{
+					assert(pBlock[i].r == pBlock[i].g);
+					assert(pBlock[i].r == pBlock[i].b);
+				}
+
+				const uint32_t l = pBlock[i].r;
+				const uint32_t a = pBlock[i].a;
+
+				// Use (l,0,0,a) not (l,l,l,a) so both components are treated equally.
+				temp_block[i].set_noclamp_rgba(l, 0, 0, a);
+			}
+
+			pBlock = temp_block;
+		}
+
+		const uint32_t weights[4] = { 1, 1, 1, 1 };
+
+		//const uint32_t weight_range = 2;
+		const uint32_t endpoint_range = (mode == 16) ? 20 : 8;
+		// estimate_partition_list_size selects the patterns to encode: 0 = every common pattern, 1 = only the estimate_partition2() pick, larger = the best N (clamped to MAX_PARTS) from estimate_partition2_list().
+		uint32_t first_common_pattern = 0;
+		uint32_t last_common_pattern = TOTAL_ASTC_BC7_COMMON_PARTITIONS2;
+		bool use_part_list = false;
+
+		const uint32_t MAX_PARTS = 8;
+		uint32_t parts[MAX_PARTS];
+
+		if (estimate_partition_list_size == 1)
+		{
+			first_common_pattern = estimate_partition2(4, 4, g_bc7_weights2, (const color_rgba(*)[4])pBlock, weights);
+			last_common_pattern = first_common_pattern + 1;
+		}
+		else if (estimate_partition_list_size > 0)
+		{
+			assert(estimate_partition_list_size <= MAX_PARTS);
+			estimate_partition_list_size = basisu::minimum(estimate_partition_list_size, MAX_PARTS);
+
+			estimate_partition2_list(4, 4, g_bc7_weights2, (const color_rgba(*)[4])pBlock, parts, estimate_partition_list_size, weights);
+
+			first_common_pattern = 0;
+			last_common_pattern = estimate_partition_list_size;
+			use_part_list = true;
+
+#ifdef _DEBUG
+			assert(parts[0] == estimate_partition2(4, 4, g_bc7_weights2, (const color_rgba(*)[4])pBlock, weights));
+#endif
+		}
+		// The loop counter is either the pattern index itself or, when a ranked list is in use, an index into parts[].
+		for (uint32_t common_pattern_iter = first_common_pattern; common_pattern_iter < last_common_pattern; common_pattern_iter++)
+		{
+			const uint32_t common_pattern = use_part_list ? parts[common_pattern_iter] : common_pattern_iter;
+
+			const uint32_t bc7_pattern = g_astc_bc7_common_partitions2[common_pattern].m_bc7;
+
+			color_rgba part_pixels[2][16];
+			uint32_t part_pixel_index[4][4];
+			uint32_t num_part_pixels[2] = { 0, 0 };
+			// Gather each BC7 subset's texels in raster order; part_pixel_index records where every texel landed inside its subset.
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t part = g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+					part_pixel_index[y][x] = num_part_pixels[part];
+					part_pixels[part][num_part_pixels[part]++] = pBlock[y * 4 + x];
+				}
+			}
+			// Fit endpoints and selectors to each subset independently (alpha enabled) and accumulate the error.
+			color_cell_compressor_params ccell_params[2];
+			color_cell_compressor_results ccell_results[2];
+			uint8_t ccell_result_selectors[2][16];
+			uint8_t ccell_result_selectors_temp[2][16];
+
+			uint64_t total_err = 0;
+			for (uint32_t subset = 0; subset < 2; subset++)
+			{
+				memset(&ccell_params[subset], 0, sizeof(ccell_params[subset]));
+
+				ccell_params[subset].m_num_pixels = num_part_pixels[subset];
+				ccell_params[subset].m_pPixels = (color_quad_u8*)&part_pixels[subset][0];
+				ccell_params[subset].m_num_selector_weights = 4;
+				ccell_params[subset].m_pSelector_weights = g_bc7_weights2;
+				ccell_params[subset].m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+				ccell_params[subset].m_astc_endpoint_range = endpoint_range;
+				ccell_params[subset].m_weights[0] = weights[0];
+				ccell_params[subset].m_weights[1] = weights[1];
+				ccell_params[subset].m_weights[2] = weights[2];
+				ccell_params[subset].m_weights[3] = weights[3];
+				ccell_params[subset].m_has_alpha = true;
+
+				memset(&ccell_results[subset], 0, sizeof(ccell_results[subset]));
+				ccell_results[subset].m_pSelectors = &ccell_result_selectors[subset][0];
+				ccell_results[subset].m_pSelectors_temp = &ccell_result_selectors_temp[subset][0];
+
+				uint64_t subset_err = color_cell_compression(255, &ccell_params[subset], &ccell_results[subset], &comp_params);
+				// Mode 16 does not use subset_err: it rebuilds the 4 interpolated colors from the unquantized L (channel 0) and A (channel 3) endpoints and re-measures the error with color_distance_la().
+				if (mode == 16)
+				{
+					color_rgba colors[4];
+					for (uint32_t c = 0; c < 4; c++)
+					{
+						colors[0].m_comps[c] = g_astc_unquant[endpoint_range][ccell_results[subset].m_astc_low_endpoint.m_c[(c < 3) ? 0 : 3]].m_unquant;
+						colors[3].m_comps[c] = g_astc_unquant[endpoint_range][ccell_results[subset].m_astc_high_endpoint.m_c[(c < 3) ? 0 : 3]].m_unquant;
+					}
+
+					for (uint32_t i = 1; i < 4 - 1; i++)
+						for (uint32_t c = 0; c < 4; c++)
+							colors[i].m_comps[c] = (uint8_t)astc_interpolate(colors[0].m_comps[c], colors[3].m_comps[c], g_bc7_weights2[i], false);
+
+					for (uint32_t p = 0; p < ccell_params[subset].m_num_pixels; p++)
+					{
+						color_rgba orig_pix(part_pixels[subset][p]);
+						orig_pix.g = orig_pix.r;
+						orig_pix.b = orig_pix.r;
+						total_err += color_distance_la(orig_pix, colors[ccell_result_selectors[subset][p]]);
+					}
+				}
+				else
+				{
+					total_err += subset_err;
+				}
+			} // subset
+
+			// ASTC
+			astc_block_desc astc_results;
+			memset(&astc_results, 0, sizeof(astc_results));
+
+			astc_results.m_dual_plane = false;
+			astc_results.m_weight_range = 2;
+
+			astc_results.m_ccs = 0;
+			astc_results.m_subsets = 2;
+			astc_results.m_partition_seed = g_astc_bc7_common_partitions2[common_pattern].m_astc;
+			astc_results.m_cem = (mode == 16) ? 4 : 12;
+			// part[p] is the BC7 subset whose results feed ASTC subset p; the two are exchanged when the pattern's m_invert flag is set.
+			uint32_t part[2] = { 0, 1 };
+			if (g_astc_bc7_common_partitions2[common_pattern].m_invert)
+				std::swap(part[0], part[1]);
+
+			bool invert[2] = { false, false };
+			// Mode 16 stores 4 endpoint values per subset (L low/high, A low/high) and never swaps; mode 9 stores 8 (RGBA low/high pairs) and swaps all four pairs when the high RGB sum is below the low one.
+			for (uint32_t p = 0; p < 2; p++)
+			{
+				if (mode == 16)
+				{
+					astc_results.m_endpoints[p * 4 + 0] = ccell_results[part[p]].m_astc_low_endpoint.m_c[0];
+					astc_results.m_endpoints[p * 4 + 1] = ccell_results[part[p]].m_astc_high_endpoint.m_c[0];
+
+					astc_results.m_endpoints[p * 4 + 2] = ccell_results[part[p]].m_astc_low_endpoint.m_c[3];
+					astc_results.m_endpoints[p * 4 + 3] = ccell_results[part[p]].m_astc_high_endpoint.m_c[3];
+				}
+				else
+				{
+					for (uint32_t c = 0; c < 4; c++)
+					{
+						astc_results.m_endpoints[p * 8 + c * 2] = ccell_results[part[p]].m_astc_low_endpoint.m_c[c];
+						astc_results.m_endpoints[p * 8 + c * 2 + 1] = ccell_results[part[p]].m_astc_high_endpoint.m_c[c];
+					}
+
+					int s0 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 0]].m_unquant +
+						g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 2]].m_unquant +
+						g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 4]].m_unquant;
+
+					int s1 = g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 1]].m_unquant +
+						g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 3]].m_unquant +
+						g_astc_unquant[endpoint_range][astc_results.m_endpoints[p * 8 + 5]].m_unquant;
+
+					if (s1 < s0)
+					{
+						std::swap(astc_results.m_endpoints[p * 8 + 0], astc_results.m_endpoints[p * 8 + 1]);
+						std::swap(astc_results.m_endpoints[p * 8 + 2], astc_results.m_endpoints[p * 8 + 3]);
+						std::swap(astc_results.m_endpoints[p * 8 + 4], astc_results.m_endpoints[p * 8 + 5]);
+						std::swap(astc_results.m_endpoints[p * 8 + 6], astc_results.m_endpoints[p * 8 + 7]);
+						invert[p] = true;
+					}
+				}
+			}
+			// Scatter the selectors back into raster order; invert[] is indexed by ASTC subset, so the BC7 subset index is flipped first when m_invert is set.
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					const uint32_t bc7_part = g_bc7_partition2[16 * bc7_pattern + x + y * 4];
+
+					astc_results.m_weights[x + y * 4] = ccell_result_selectors[bc7_part][part_pixel_index[y][x]];
+
+					uint32_t astc_part = bc7_part;
+					if (g_astc_bc7_common_partitions2[common_pattern].m_invert)
+						astc_part = 1 - astc_part;
+
+					if (invert[astc_part])
+						astc_results.m_weights[x + y * 4] = 3 - astc_results.m_weights[x + y * 4];
+				}
+			}
+
+			assert(total_results < MAX_ENCODE_RESULTS);
+			if (total_results < MAX_ENCODE_RESULTS)
+			{
+				pResults[total_results].m_uastc_mode = mode;
+				pResults[total_results].m_common_pattern = common_pattern;
+				pResults[total_results].m_astc = astc_results;
+				pResults[total_results].m_astc_err = total_err;
+				total_results++;
+			}
+
+		} // common_pattern
+	}
+
+	// MODE 10
+	// DualPlane: 0, WeightRange: 8 (16), Subsets: 1, CEM: 12 (RGBA Direct      ), EndpointRange: 13 (48)       MODE6
+	// Compresses all 16 RGBA texels against a single endpoint pair with 16 weight levels, then appends the
+	// resulting candidate to pResults (advancing total_results) if fewer than MAX_ENCODE_RESULTS are stored.
+	static void astc_mode10(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		const uint32_t weight_range = 8;
+		const uint32_t endpoint_range = 13;
+
+		// Selector storage handed to the cell compressor.
+		uint8_t selectors[16];
+		uint8_t selectors_scratch[16];
+
+		color_cell_compressor_results cell_out;
+		memset(&cell_out, 0, sizeof(cell_out));
+		cell_out.m_pSelectors = selectors;
+		cell_out.m_pSelectors_temp = selectors_scratch;
+
+		// All 16 texels, 16 selector weights, every channel weighted equally, alpha enabled.
+		color_cell_compressor_params cell_in;
+		memset(&cell_in, 0, sizeof(cell_in));
+		cell_in.m_num_pixels = 16;
+		cell_in.m_pPixels = (color_quad_u8*)&block[0][0];
+		cell_in.m_num_selector_weights = 16;
+		cell_in.m_pSelector_weights = g_astc_weights4;
+		cell_in.m_pSelector_weightsx = (const bc7enc_vec4F*)g_astc_weights4x;
+		cell_in.m_astc_endpoint_range = endpoint_range;
+		cell_in.m_has_alpha = true;
+		for (uint32_t c = 0; c < 4; c++)
+			cell_in.m_weights[c] = 1;
+
+		const uint64_t block_err = color_cell_compression(255, &cell_in, &cell_out, &comp_params);
+
+		// ASTC
+		astc_block_desc blk;
+		memset(&blk, 0, sizeof(blk));
+
+		blk.m_dual_plane = false;
+		blk.m_weight_range = weight_range;
+		blk.m_ccs = 0;
+		blk.m_subsets = 1;
+		blk.m_partition_seed = 0;
+		blk.m_cem = 12;
+
+		// Endpoints are stored interleaved: R low, R high, G low, G high, B low, B high, A low, A high.
+		for (uint32_t c = 0; c < 4; c++)
+		{
+			blk.m_endpoints[c * 2 + 0] = cell_out.m_astc_low_endpoint.m_c[c];
+			blk.m_endpoints[c * 2 + 1] = cell_out.m_astc_high_endpoint.m_c[c];
+		}
+
+		// Only the R, G and B components enter the ordering test below; alpha is left out of the sums.
+		int low_sum = 0;
+		int high_sum = 0;
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			low_sum += g_astc_unquant[endpoint_range][blk.m_endpoints[c * 2 + 0]].m_unquant;
+			high_sum += g_astc_unquant[endpoint_range][blk.m_endpoints[c * 2 + 1]].m_unquant;
+		}
+
+		// If the high endpoint sums lower than the low one, exchange all four channel pairs and mirror every weight (15 - w).
+		const bool flip = (high_sum < low_sum);
+		if (flip)
+		{
+			for (uint32_t c = 0; c < 4; c++)
+				std::swap(blk.m_endpoints[c * 2 + 0], blk.m_endpoints[c * 2 + 1]);
+		}
+
+		// Copy the selectors out as the block's weights.
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint32_t sel = selectors[i];
+			blk.m_weights[i] = (uint8_t)(flip ? (15 - sel) : sel);
+		}
+
+		assert(total_results < MAX_ENCODE_RESULTS);
+		if (total_results >= MAX_ENCODE_RESULTS)
+			return;
+
+		uastc_encode_results& out = pResults[total_results];
+		out.m_uastc_mode = 10;
+		out.m_common_pattern = 0;
+		out.m_astc = blk;
+		out.m_astc_err = block_err;
+		total_results++;
+	}
+
+	// 11. DualPlane: 1, WeightRange: 2 (4), Subsets: 1, CEM: 12 (RGBA Direct), EndpointRange: 13 (48)        MODE5
+	// 17. DualPlane: 1, WeightRange : 2 (4), Subsets : 1, CEM : 4 (LA Direct), EndpointRange : 20 (256)    BC7 MODE5
+	static void astc_mode11_or_17(uint32_t mode, const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		assert((mode == 11) || (mode == 17));
+
+		const uint32_t weight_range = 2;
+		const uint32_t endpoint_range = (mode == 17) ? 20 : 13;
+
+		bc7enc_compress_block_params local_comp_params(comp_params);
+		local_comp_params.m_perceptual = false;
+		local_comp_params.m_weights[0] = 1;
+		local_comp_params.m_weights[1] = 1;
+		local_comp_params.m_weights[2] = 1;
+		local_comp_params.m_weights[3] = 1;
+
+		const uint32_t last_rot_comp = (mode == 17) ? 1 : 4;
+
+		for (uint32_t rot_comp = 0; rot_comp < last_rot_comp; rot_comp++)
+		{
+			color_quad_u8 block_rgb[16];
+			color_quad_u8 block_a[16];
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				block_rgb[i] = ((color_quad_u8*)&block[0][0])[i];
+				block_a[i] = block_rgb[i];
+
+				if (mode == 17)
+				{
+					assert(block_rgb[i].m_c[0] == block_rgb[i].m_c[1]);
+					assert(block_rgb[i].m_c[0] == block_rgb[i].m_c[2]);
+
+					block_a[i].m_c[0] = block_rgb[i].m_c[3];
+					block_a[i].m_c[1] = block_rgb[i].m_c[3];
+					block_a[i].m_c[2] = block_rgb[i].m_c[3];
+					block_a[i].m_c[3] = 255;
+
+					block_rgb[i].m_c[1] = block_rgb[i].m_c[0];
+					block_rgb[i].m_c[2] = block_rgb[i].m_c[0];
+					block_rgb[i].m_c[3] = 255;
+				}
+				else
+				{
+					uint8_t c = block_a[i].m_c[rot_comp];
+					block_a[i].m_c[0] = c;
+					block_a[i].m_c[1] = c;
+					block_a[i].m_c[2] = c;
+					block_a[i].m_c[3] = 255;
+
+					block_rgb[i].m_c[rot_comp] = block_rgb[i].m_c[3];
+					block_rgb[i].m_c[3] = 255;
+				}
+			}
+
+			uint8_t ccell_result_selectors_temp[16];
+
+			color_cell_compressor_params ccell_params_rgb;
+			memset(&ccell_params_rgb, 0, sizeof(ccell_params_rgb));
+
+			ccell_params_rgb.m_num_pixels = 16;
+			ccell_params_rgb.m_pPixels = block_rgb;
+			ccell_params_rgb.m_num_selector_weights = 4;
+			ccell_params_rgb.m_pSelector_weights = g_bc7_weights2;
+			ccell_params_rgb.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+			ccell_params_rgb.m_astc_endpoint_range = endpoint_range;
+			ccell_params_rgb.m_weights[0] = 1;
+			ccell_params_rgb.m_weights[1] = 1;
+			ccell_params_rgb.m_weights[2] = 1;
+			ccell_params_rgb.m_weights[3] = 1;
+
+			color_cell_compressor_results ccell_results_rgb;
+			uint8_t ccell_result_selectors_rgb[16];
+			memset(&ccell_results_rgb, 0, sizeof(ccell_results_rgb));
+			ccell_results_rgb.m_pSelectors = &ccell_result_selectors_rgb[0];
+			ccell_results_rgb.m_pSelectors_temp = &ccell_result_selectors_temp[0];
+
+			uint64_t part_err_rgb = color_cell_compression(255, &ccell_params_rgb, &ccell_results_rgb, &local_comp_params);
+
+			color_cell_compressor_params ccell_params_a;
+			memset(&ccell_params_a, 0, sizeof(ccell_params_a));
+
+			ccell_params_a.m_num_pixels = 16;
+			ccell_params_a.m_pPixels = block_a;
+			ccell_params_a.m_num_selector_weights = 4;
+			ccell_params_a.m_pSelector_weights = g_bc7_weights2;
+			ccell_params_a.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+			ccell_params_a.m_astc_endpoint_range = endpoint_range;
+			ccell_params_a.m_weights[0] = 1;
+			ccell_params_a.m_weights[1] = 1;
+			ccell_params_a.m_weights[2] = 1;
+			ccell_params_a.m_weights[3] = 1;
+
+			color_cell_compressor_results ccell_results_a;
+			uint8_t ccell_result_selectors_a[16];
+			memset(&ccell_results_a, 0, sizeof(ccell_results_a));
+			ccell_results_a.m_pSelectors = &ccell_result_selectors_a[0];
+			ccell_results_a.m_pSelectors_temp = &ccell_result_selectors_temp[0];
+
+			uint64_t part_err_a = color_cell_compression(255, &ccell_params_a, &ccell_results_a, &local_comp_params) / 3;
+
+			uint64_t total_err = (mode == 17) ? ((part_err_rgb / 3) + part_err_a) : (part_err_rgb + part_err_a);
+
+			// ASTC
+			astc_block_desc blk;
+			memset(&blk, 0, sizeof(blk));
+
+			blk.m_dual_plane = true;
+			blk.m_weight_range = weight_range;
+
+			blk.m_ccs = (mode == 17) ? 3 : rot_comp;
+			blk.m_subsets = 1;
+			blk.m_partition_seed = 0;
+			blk.m_cem = (mode == 17) ? 4 : 12;
+
+			bool invert = false;
+
+			if (mode == 17)
+			{
+				assert(ccell_results_rgb.m_astc_low_endpoint.m_c[0] == ccell_results_rgb.m_astc_low_endpoint.m_c[1]);
+				assert(ccell_results_rgb.m_astc_low_endpoint.m_c[0] == ccell_results_rgb.m_astc_low_endpoint.m_c[2]);
+
+				assert(ccell_results_rgb.m_astc_high_endpoint.m_c[0] == ccell_results_rgb.m_astc_high_endpoint.m_c[1]);
+				assert(ccell_results_rgb.m_astc_high_endpoint.m_c[0] == ccell_results_rgb.m_astc_high_endpoint.m_c[2]);
+
+				blk.m_endpoints[0] = ccell_results_rgb.m_astc_low_endpoint.m_c[0];
+				blk.m_endpoints[1] = ccell_results_rgb.m_astc_high_endpoint.m_c[0];
+
+				blk.m_endpoints[2] = ccell_results_a.m_astc_low_endpoint.m_c[0];
+				blk.m_endpoints[3] = ccell_results_a.m_astc_high_endpoint.m_c[0];
+			}
+			else
+			{
+				blk.m_endpoints[0] = (rot_comp == 0 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[0];
+				blk.m_endpoints[1] = (rot_comp == 0 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[0];
+				blk.m_endpoints[2] = (rot_comp == 1 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[1];
+				blk.m_endpoints[3] = (rot_comp == 1 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[1];
+				blk.m_endpoints[4] = (rot_comp == 2 ? ccell_results_a : ccell_results_rgb).m_astc_low_endpoint.m_c[2];
+				blk.m_endpoints[5] = (rot_comp == 2 ? ccell_results_a : ccell_results_rgb).m_astc_high_endpoint.m_c[2];
+				if (rot_comp == 3)
+				{
+					blk.m_endpoints[6] = ccell_results_a.m_astc_low_endpoint.m_c[0];
+					blk.m_endpoints[7] = ccell_results_a.m_astc_high_endpoint.m_c[0];
+				}
+				else
+				{
+					blk.m_endpoints[6] = ccell_results_rgb.m_astc_low_endpoint.m_c[rot_comp];
+					blk.m_endpoints[7] = ccell_results_rgb.m_astc_high_endpoint.m_c[rot_comp];
+				}
+
+				int s0 = g_astc_unquant[endpoint_range][blk.m_endpoints[0]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[2]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[4]].m_unquant;
+				int s1 = g_astc_unquant[endpoint_range][blk.m_endpoints[1]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[3]].m_unquant + g_astc_unquant[endpoint_range][blk.m_endpoints[5]].m_unquant;
+				if (s1 < s0)
+				{
+					std::swap(blk.m_endpoints[0], blk.m_endpoints[1]);
+					std::swap(blk.m_endpoints[2], blk.m_endpoints[3]);
+					std::swap(blk.m_endpoints[4], blk.m_endpoints[5]);
+					std::swap(blk.m_endpoints[6], blk.m_endpoints[7]);
+					invert = true;
+				}
+			}
+
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					uint32_t rgb_index = ccell_result_selectors_rgb[x + y * 4];
+					uint32_t a_index = ccell_result_selectors_a[x + y * 4];
+
+					if (invert)
+					{
+						rgb_index = 3 - rgb_index;
+						a_index = 3 - a_index;
+					}
+
+					blk.m_weights[(x + y * 4) * 2 + 0] = (uint8_t)rgb_index;
+					blk.m_weights[(x + y * 4) * 2 + 1] = (uint8_t)a_index;
+				}
+			}
+
+			assert(total_results < MAX_ENCODE_RESULTS);
+			if (total_results < MAX_ENCODE_RESULTS)
+			{
+				pResults[total_results].m_uastc_mode = mode;
+				pResults[total_results].m_common_pattern = 0;
+				pResults[total_results].m_astc = blk;
+				pResults[total_results].m_astc_err = total_err;
+				total_results++;
+			}
+		} // rot_comp
+	}
+
+	// MODE 12
+	// DualPlane: 0, WeightRange: 5 (8), Subsets: 1, CEM: 12 (RGBA Direct      ), EndpointRange: 19 (192)       MODE6
+	// Produces one single-plane candidate: the whole 4x4 RGBA block is fitted by color_cell_compression()
+	// with 8 selector weights, and the result is appended to pResults as UASTC mode 12 (if there is room).
+	static void astc_mode12(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		const uint32_t weight_range = 5;
+		const uint32_t endpoint_range = 19;
+		const int max_weight = 7;
+
+		// Selector output plus the scratch buffer the compressor asks for.
+		uint8_t selectors[16];
+		uint8_t selectors_scratch[16];
+
+		color_cell_compressor_results cell_results;
+		memset(&cell_results, 0, sizeof(cell_results));
+		cell_results.m_pSelectors = &selectors[0];
+		cell_results.m_pSelectors_temp = &selectors_scratch[0];
+
+		color_cell_compressor_params cell_params;
+		memset(&cell_params, 0, sizeof(cell_params));
+		cell_params.m_num_pixels = 16;
+		cell_params.m_pPixels = (color_quad_u8*)&block[0][0];
+		cell_params.m_num_selector_weights = 8;
+		cell_params.m_pSelector_weights = g_bc7_weights3;
+		cell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights3x;
+		cell_params.m_astc_endpoint_range = endpoint_range;
+		for (uint32_t c = 0; c < 4; c++)
+			cell_params.m_weights[c] = 1;
+		cell_params.m_has_alpha = true;
+
+		const uint64_t block_err = color_cell_compression(255, &cell_params, &cell_results, &comp_params);
+
+		// Describe the logical ASTC block.
+		astc_block_desc desc;
+		memset(&desc, 0, sizeof(desc));
+		desc.m_dual_plane = false;
+		desc.m_weight_range = weight_range;
+		desc.m_ccs = 0;
+		desc.m_subsets = 1;
+		desc.m_partition_seed = 0;
+		desc.m_cem = 12;
+
+		// Endpoints are stored as (low, high) pairs in R, G, B, A order.
+		for (uint32_t c = 0; c < 4; c++)
+		{
+			desc.m_endpoints[c * 2 + 0] = cell_results.m_astc_low_endpoint.m_c[c];
+			desc.m_endpoints[c * 2 + 1] = cell_results.m_astc_high_endpoint.m_c[c];
+		}
+
+		// Compare the unquantized RGB sums of the two endpoints.
+		int low_sum = 0, high_sum = 0;
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			low_sum += g_astc_unquant[endpoint_range][desc.m_endpoints[c * 2 + 0]].m_unquant;
+			high_sum += g_astc_unquant[endpoint_range][desc.m_endpoints[c * 2 + 1]].m_unquant;
+		}
+
+		// When the high endpoint sums lower than the low one, exchange every endpoint pair and mirror the weights.
+		// NOTE(review): presumably this keeps the block out of ASTC's blue-contraction path - confirm against the ASTC spec.
+		const bool invert = (high_sum < low_sum);
+		if (invert)
+		{
+			for (uint32_t e = 0; e < 8; e += 2)
+				std::swap(desc.m_endpoints[e], desc.m_endpoints[e + 1]);
+		}
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint8_t sel = selectors[i];
+			desc.m_weights[i] = invert ? (max_weight - sel) : sel;
+		}
+
+		assert(total_results < MAX_ENCODE_RESULTS);
+		if (total_results < MAX_ENCODE_RESULTS)
+		{
+			uastc_encode_results& dst = pResults[total_results];
+			dst.m_uastc_mode = 12;
+			dst.m_common_pattern = 0;
+			dst.m_astc = desc;
+			dst.m_astc_err = block_err;
+			total_results++;
+		}
+	}
+
+	// 13. DualPlane: 1, WeightRange: 0 (2), Subsets: 1, CEM: 12 (RGBA Direct      ), EndpointRange: 20 (256)        MODE5
+	// Produces four dual-plane candidates, one per rotated component (rot_comp = 0..3). For each, the rotated
+	// component is fitted on its own plane and the remaining data on the other, both with 2 selector weights,
+	// and the result is appended to pResults as UASTC mode 13 (if there is room).
+	static void astc_mode13(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		// Work on a copy of the caller's settings with the perceptual metric off and uniform channel weights.
+		bc7enc_compress_block_params local_comp_params(comp_params);
+		local_comp_params.m_perceptual = false;
+		for (uint32_t c = 0; c < 4; c++)
+			local_comp_params.m_weights[c] = 1;
+
+		const uint32_t weight_range = 0;
+		const uint32_t endpoint_range = 20;
+
+		const color_quad_u8* pSrc_pixels = (color_quad_u8*)&block[0][0];
+
+		for (uint32_t rot_comp = 0; rot_comp < 4; rot_comp++)
+		{
+			// plane_pixels[0]: source pixels with alpha moved into channel rot_comp and alpha forced to 255.
+			// plane_pixels[1]: channel rot_comp replicated into R, G, B with alpha forced to 255.
+			color_quad_u8 plane_pixels[2][16];
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				const color_quad_u8 src = pSrc_pixels[i];
+
+				color_quad_u8& second = plane_pixels[1][i];
+				second = src;
+				second.m_c[0] = second.m_c[1] = second.m_c[2] = src.m_c[rot_comp];
+				second.m_c[3] = 255;
+
+				color_quad_u8& first = plane_pixels[0][i];
+				first = src;
+				first.m_c[rot_comp] = src.m_c[3];
+				first.m_c[3] = 255;
+			}
+
+			uint8_t selectors_scratch[16];
+			uint8_t plane_selectors[2][16];
+
+			color_cell_compressor_params plane_params[2];
+			color_cell_compressor_results plane_results[2];
+			uint64_t plane_err[2];
+
+			// Fit plane 0 first, then plane 1; both share the same scratch selector buffer.
+			for (uint32_t p = 0; p < 2; p++)
+			{
+				color_cell_compressor_params& params = plane_params[p];
+				memset(&params, 0, sizeof(params));
+				params.m_num_pixels = 16;
+				params.m_pPixels = plane_pixels[p];
+				params.m_num_selector_weights = 2;
+				params.m_pSelector_weights = g_bc7_weights1;
+				params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights1x;
+				params.m_astc_endpoint_range = endpoint_range;
+				for (uint32_t c = 0; c < 4; c++)
+					params.m_weights[c] = 1;
+
+				color_cell_compressor_results& results = plane_results[p];
+				memset(&results, 0, sizeof(results));
+				results.m_pSelectors = &plane_selectors[p][0];
+				results.m_pSelectors_temp = &selectors_scratch[0];
+
+				plane_err[p] = color_cell_compression(255, &params, &results, &local_comp_params);
+			}
+
+			// The second plane replicated one value into three channels, so its error is divided by 3.
+			const uint64_t total_err = plane_err[0] + (plane_err[1] / 3);
+
+			const color_cell_compressor_results& results_rgb = plane_results[0];
+			const color_cell_compressor_results& results_a = plane_results[1];
+
+			// Describe the logical ASTC block.
+			astc_block_desc blk;
+			memset(&blk, 0, sizeof(blk));
+			blk.m_dual_plane = true;
+			blk.m_weight_range = weight_range;
+			blk.m_ccs = rot_comp;
+			blk.m_subsets = 1;
+			blk.m_partition_seed = 0;
+			blk.m_cem = 12;
+
+			// R, G, B endpoints: the rotated channel comes from the second plane's fit, the others from the first.
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				const color_cell_compressor_results& src = (rot_comp == c) ? results_a : results_rgb;
+				blk.m_endpoints[c * 2 + 0] = src.m_astc_low_endpoint.m_c[c];
+				blk.m_endpoints[c * 2 + 1] = src.m_astc_high_endpoint.m_c[c];
+			}
+
+			// Alpha endpoints: either the second plane itself, or the first plane's channel that alpha was moved into.
+			if (rot_comp != 3)
+			{
+				blk.m_endpoints[6] = results_rgb.m_astc_low_endpoint.m_c[rot_comp];
+				blk.m_endpoints[7] = results_rgb.m_astc_high_endpoint.m_c[rot_comp];
+			}
+			else
+			{
+				blk.m_endpoints[6] = results_a.m_astc_low_endpoint.m_c[0];
+				blk.m_endpoints[7] = results_a.m_astc_high_endpoint.m_c[0];
+			}
+
+			// Compare the unquantized RGB sums of the two endpoints.
+			int low_sum = 0, high_sum = 0;
+			for (uint32_t c = 0; c < 3; c++)
+			{
+				low_sum += g_astc_unquant[endpoint_range][blk.m_endpoints[c * 2 + 0]].m_unquant;
+				high_sum += g_astc_unquant[endpoint_range][blk.m_endpoints[c * 2 + 1]].m_unquant;
+			}
+
+			// When the high endpoint sums lower, exchange every endpoint pair and mirror both planes' weights.
+			const bool invert = (high_sum < low_sum);
+			if (invert)
+			{
+				for (uint32_t e = 0; e < 8; e += 2)
+					std::swap(blk.m_endpoints[e], blk.m_endpoints[e + 1]);
+			}
+
+			// Weights are interleaved per texel: first plane, then second plane.
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				uint32_t w0 = plane_selectors[0][i];
+				uint32_t w1 = plane_selectors[1][i];
+
+				if (invert)
+				{
+					w0 = 1 - w0;
+					w1 = 1 - w1;
+				}
+
+				blk.m_weights[i * 2 + 0] = (uint8_t)w0;
+				blk.m_weights[i * 2 + 1] = (uint8_t)w1;
+			}
+
+			assert(total_results < MAX_ENCODE_RESULTS);
+			if (total_results < MAX_ENCODE_RESULTS)
+			{
+				uastc_encode_results& dst = pResults[total_results];
+				dst.m_uastc_mode = 13;
+				dst.m_common_pattern = 0;
+				dst.m_astc = blk;
+				dst.m_astc_err = total_err;
+				total_results++;
+			}
+		} // rot_comp
+	}
+
+	// MODE14
+	// DualPlane: 0, WeightRange: 2 (4), Subsets: 1, CEM: 12 (RGBA Direct      ), EndpointRange: 20 (256)		MODE6
+	// Produces one single-plane candidate: the whole 4x4 RGBA block is fitted by color_cell_compression()
+	// with 4 selector weights, and the result is appended to pResults as UASTC mode 14 (if there is room).
+	static void astc_mode14(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		const uint32_t weight_range = 2;
+		const uint32_t endpoint_range = 20;
+		const int max_weight = 3;
+
+		// Selector output plus the scratch buffer the compressor asks for.
+		uint8_t selectors[16];
+		uint8_t selectors_scratch[16];
+
+		color_cell_compressor_results cell_results;
+		memset(&cell_results, 0, sizeof(cell_results));
+		cell_results.m_pSelectors = &selectors[0];
+		cell_results.m_pSelectors_temp = &selectors_scratch[0];
+
+		color_cell_compressor_params cell_params;
+		memset(&cell_params, 0, sizeof(cell_params));
+		cell_params.m_num_pixels = 16;
+		cell_params.m_pPixels = (color_quad_u8*)&block[0][0];
+		cell_params.m_num_selector_weights = 4;
+		cell_params.m_pSelector_weights = g_bc7_weights2;
+		cell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_bc7_weights2x;
+		cell_params.m_astc_endpoint_range = endpoint_range;
+		for (uint32_t c = 0; c < 4; c++)
+			cell_params.m_weights[c] = 1;
+		cell_params.m_has_alpha = true;
+
+		const uint64_t block_err = color_cell_compression(255, &cell_params, &cell_results, &comp_params);
+
+		// Describe the logical ASTC block.
+		astc_block_desc desc;
+		memset(&desc, 0, sizeof(desc));
+		desc.m_dual_plane = false;
+		desc.m_weight_range = weight_range;
+		desc.m_ccs = 0;
+		desc.m_subsets = 1;
+		desc.m_partition_seed = 0;
+		desc.m_cem = 12;
+
+		// Endpoints are stored as (low, high) pairs in R, G, B, A order.
+		for (uint32_t c = 0; c < 4; c++)
+		{
+			desc.m_endpoints[c * 2 + 0] = cell_results.m_astc_low_endpoint.m_c[c];
+			desc.m_endpoints[c * 2 + 1] = cell_results.m_astc_high_endpoint.m_c[c];
+		}
+
+		// Compare the unquantized RGB sums of the two endpoints.
+		int low_sum = 0, high_sum = 0;
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			low_sum += g_astc_unquant[endpoint_range][desc.m_endpoints[c * 2 + 0]].m_unquant;
+			high_sum += g_astc_unquant[endpoint_range][desc.m_endpoints[c * 2 + 1]].m_unquant;
+		}
+
+		// When the high endpoint sums lower than the low one, exchange every endpoint pair and mirror the weights.
+		// NOTE(review): presumably this keeps the block out of ASTC's blue-contraction path - confirm against the ASTC spec.
+		const bool invert = (high_sum < low_sum);
+		if (invert)
+		{
+			for (uint32_t e = 0; e < 8; e += 2)
+				std::swap(desc.m_endpoints[e], desc.m_endpoints[e + 1]);
+		}
+
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint8_t sel = selectors[i];
+			desc.m_weights[i] = invert ? (max_weight - sel) : sel;
+		}
+
+		assert(total_results < MAX_ENCODE_RESULTS);
+		if (total_results < MAX_ENCODE_RESULTS)
+		{
+			uastc_encode_results& dst = pResults[total_results];
+			dst.m_uastc_mode = 14;
+			dst.m_common_pattern = 0;
+			dst.m_astc = desc;
+			dst.m_astc_err = block_err;
+			total_results++;
+		}
+	}
+
+	// MODE 15
+	// DualPlane: 0, WeightRange : 8 (16), Subsets : 1, CEM : 4 (LA Direct), EndpointRange : 20 (256)   BC7 MODE6
+	// Produces one single-plane luminance+alpha candidate: the block's R and A channels are fitted by
+	// color_cell_compression() with 16 selector weights, and the result is appended to pResults as UASTC mode 15
+	// (if there is room). The stored error is recomputed here with color_distance_la() against the source block;
+	// the compressor's own return value is ignored.
+	static void astc_mode15(const color_rgba block[4][4], uastc_encode_results* pResults, uint32_t& total_results, bc7enc_compress_block_params& comp_params)
+	{
+		const uint32_t weight_range = 8;
+		const uint32_t endpoint_range = 20;
+
+		const color_rgba* pSrc_pixels = (const color_rgba*)block;
+
+		// Use (l,0,0,a) not (l,l,l,a) so both components are treated equally.
+		color_rgba la_pixels[16];
+		for (uint32_t i = 0; i < 16; i++)
+		{
+			const uint32_t lum = pSrc_pixels[i].r;
+			const uint32_t alpha = pSrc_pixels[i].a;
+			la_pixels[i].set_noclamp_rgba(lum, 0, 0, alpha);
+		}
+
+		// Selector output plus the scratch buffer the compressor asks for.
+		uint8_t selectors[16];
+		uint8_t selectors_scratch[16];
+
+		color_cell_compressor_results cell_results;
+		memset(&cell_results, 0, sizeof(cell_results));
+		cell_results.m_pSelectors = &selectors[0];
+		cell_results.m_pSelectors_temp = &selectors_scratch[0];
+
+		color_cell_compressor_params cell_params;
+		memset(&cell_params, 0, sizeof(cell_params));
+		cell_params.m_num_pixels = 16;
+		cell_params.m_pPixels = (color_quad_u8*)la_pixels;
+		cell_params.m_num_selector_weights = 16;
+		cell_params.m_pSelector_weights = g_astc_weights4;
+		cell_params.m_pSelector_weightsx = (const bc7enc_vec4F*)g_astc_weights4x;
+		cell_params.m_astc_endpoint_range = endpoint_range;
+		for (uint32_t c = 0; c < 4; c++)
+			cell_params.m_weights[c] = 1;
+		cell_params.m_has_alpha = true;
+
+		color_cell_compression(255, &cell_params, &cell_results, &comp_params);
+
+		// Describe the logical ASTC block.
+		astc_block_desc desc;
+		memset(&desc, 0, sizeof(desc));
+		desc.m_dual_plane = false;
+		desc.m_weight_range = weight_range;
+		desc.m_ccs = 0;
+		desc.m_subsets = 1;
+		desc.m_partition_seed = 0;
+		desc.m_cem = 4;
+
+		// Luminance endpoints come from channel 0 of the fit, alpha endpoints from channel 3.
+		desc.m_endpoints[0] = cell_results.m_astc_low_endpoint.m_c[0];
+		desc.m_endpoints[1] = cell_results.m_astc_high_endpoint.m_c[0];
+		desc.m_endpoints[2] = cell_results.m_astc_low_endpoint.m_c[3];
+		desc.m_endpoints[3] = cell_results.m_astc_high_endpoint.m_c[3];
+
+		for (uint32_t i = 0; i < 16; i++)
+			desc.m_weights[i] = selectors[i];
+
+		// Build the 16-entry palette: R, G and B all take the luminance endpoint, A takes the alpha endpoint.
+		color_rgba palette[16];
+		for (uint32_t c = 0; c < 4; c++)
+		{
+			const uint32_t src_chan = (c < 3) ? 0 : 3;
+
+			palette[0].m_comps[c] = g_astc_unquant[endpoint_range][cell_results.m_astc_low_endpoint.m_c[src_chan]].m_unquant;
+			palette[15].m_comps[c] = g_astc_unquant[endpoint_range][cell_results.m_astc_high_endpoint.m_c[src_chan]].m_unquant;
+
+			for (uint32_t i = 1; i < 16 - 1; i++)
+				palette[i].m_comps[c] = (uint8_t)astc_interpolate(palette[0].m_comps[c], palette[15].m_comps[c], g_astc_weights4[i], false);
+		}
+
+		// Measure the luminance/alpha error of the chosen selectors against the original pixels.
+		uint64_t la_err = 0;
+		for (uint32_t i = 0; i < 16; i++)
+			la_err += color_distance_la(pSrc_pixels[i], palette[selectors[i]]);
+
+		assert(total_results < MAX_ENCODE_RESULTS);
+		if (total_results < MAX_ENCODE_RESULTS)
+		{
+			uastc_encode_results& dst = pResults[total_results];
+			dst.m_uastc_mode = 15;
+			dst.m_common_pattern = 0;
+			dst.m_astc = desc;
+			dst.m_astc_err = la_err;
+			total_results++;
+		}
+	}
+		
+	// Sums the squared per-channel differences between two 4x4 blocks and reports three combinations:
+	//   total_la_err   = R + A
+	//   total_rgb_err  = R + G + B
+	//   total_rgba_err = R + G + B + A
+	static void compute_block_error(const color_rgba block[4][4], const color_rgba decoded_block[4][4], uint64_t &total_rgb_err, uint64_t &total_rgba_err, uint64_t &total_la_err)
+	{
+		// Squared-error accumulators indexed by channel (0 = R, 1 = G, 2 = B, 3 = A).
+		uint64_t chan_err[4] = { 0, 0, 0, 0 };
+
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				const color_rgba& src = block[y][x];
+				const color_rgba& dec = decoded_block[y][x];
+
+				for (uint32_t c = 0; c < 4; c++)
+				{
+					const int delta = (int)src.m_comps[c] - (int)dec.m_comps[c];
+					chan_err[c] += delta * delta;
+				}
+			}
+		}
+
+		total_la_err = chan_err[0] + chan_err[3];
+		total_rgb_err = chan_err[0] + chan_err[1] + chan_err[2];
+		total_rgba_err = total_rgb_err + chan_err[3];
+	}
+
+	// Decides whether the two BC1 hint flags should be set for the chosen encoding.
+	// A baseline BC1 block is made from the decoded UASTC pixels with basist::encode_bc1(); each hint the mode
+	// supports is transcoded to BC1 as well. All three are compared against the source block, and a hint is
+	// enabled when the square root of its summed color_distance() is at most 1.075x the baseline's.
+	// Both flags are left false for solid-color blocks and for modes that support neither hint.
+	static void compute_bc1_hints(bool &bc1_hint0, bool &bc1_hint1, const uastc_encode_results &best_results, const color_rgba block[4][4], const color_rgba decoded_uastc_block[4][4])
+	{
+		bc1_hint0 = false;
+		bc1_hint1 = false;
+
+		const uint32_t mode = best_results.m_uastc_mode;
+		if (mode == UASTC_MODE_INDEX_SOLID_COLOR)
+			return;
+
+		const bool mode_has_hint0 = g_uastc_mode_has_bc1_hint0[mode];
+		const bool mode_has_hint1 = g_uastc_mode_has_bc1_hint1[mode];
+		if (!(mode_has_hint0 || mode_has_hint1))
+			return;
+
+		const bool perceptual = false;
+
+		// Baseline: BC1-encode the decoded UASTC pixels directly, then decode the result.
+		dxt1_block baseline_bc1[8];
+		color_rgba baseline_pixels[4][4];
+		basist::encode_bc1(baseline_bc1, (const uint8_t*)&decoded_uastc_block[0][0], 0);
+		unpack_block(texture_format::cBC1, baseline_bc1, &baseline_pixels[0][0]);
+
+		// Placeholder ETC1/ETC2 hint data needed by pack_uastc().
+		etc_block dummy_etc1;
+		memset(&dummy_etc1, 0, sizeof(dummy_etc1));
+
+		eac_a8_block dummy_etc2;
+		memset(&dummy_etc2, 0, sizeof(dummy_etc2));
+		dummy_etc2.m_multiplier = 1;
+
+		// Pack to UASTC, then unpack, because the endpoints may be swapped.
+		uastc_block packed;
+		pack_uastc(packed, best_results, dummy_etc1, 0, dummy_etc2, false, false);
+
+		unpacked_uastc_block round_tripped;
+		unpack_uastc(packed, round_tripped, false);
+
+		unpacked_uastc_block ublock;
+		memset(&ublock, 0, sizeof(ublock));
+		ublock.m_mode = best_results.m_uastc_mode;
+		ublock.m_common_pattern = best_results.m_common_pattern;
+		ublock.m_astc = round_tripped.m_astc;
+
+		color_rgba hint0_pixels[4][4];
+		color_rgba hint1_pixels[4][4];
+
+		// One BC1 block is reused for both transcodes, hint1 first and then hint0.
+		dxt1_block b;
+
+		// HINT1
+		if (mode_has_hint1)
+		{
+			transcode_uastc_to_bc1_hint1(ublock, (color32 (*)[4]) decoded_uastc_block, &b, false);
+			unpack_block(texture_format::cBC1, &b, &hint1_pixels[0][0]);
+		}
+		else
+		{
+			memset(hint1_pixels, 0, sizeof(hint1_pixels));
+		}
+
+		// HINT0
+		if (mode_has_hint0)
+		{
+			transcode_uastc_to_bc1_hint0(ublock, &b);
+			unpack_block(texture_format::cBC1, &b, &hint0_pixels[0][0]);
+		}
+		else
+		{
+			memset(hint0_pixels, 0, sizeof(hint0_pixels));
+		}
+
+		// Sum each variant's error against the original source pixels.
+		uint64_t baseline_sum = 0, hint0_sum = 0, hint1_sum = 0;
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				const color_rgba& src = block[y][x];
+
+				baseline_sum += color_distance(perceptual, src, baseline_pixels[y][x], false);
+				hint0_sum += color_distance(perceptual, src, hint0_pixels[y][x], false);
+				hint1_sum += color_distance(perceptual, src, hint1_pixels[y][x], false);
+			}
+		}
+
+		const float baseline_err = sqrtf((float)baseline_sum);
+		const float hint0_err = sqrtf((float)hint0_sum);
+		const float hint1_err = sqrtf((float)hint1_sum);
+
+		// A hint may be up to 7.5% worse than the baseline and still be accepted.
+		const float hint0_max_ratio = 1.075f;
+		const float hint1_max_ratio = 1.075f;
+
+		if (mode_has_hint0 && (hint0_err <= baseline_err * hint0_max_ratio))
+			bc1_hint0 = true;
+
+		if (mode_has_hint1 && (hint1_err <= baseline_err * hint1_max_ratio))
+			bc1_hint1 = true;
+	}
+
+	// Integer luma/chroma triple as filled in by rgb_to_y_cb_cr() and compared by color_diff().
+	struct ycbcr
+	{
+		int32_t m_y;	// weighted luma: r * 54 + g * 183 + b * 19
+		int32_t m_cb;	// blue difference: (b << 8) - luma
+		int32_t m_cr;	// red difference: (r << 8) - luma
+	};
+
+	// Converts an RGB color into integer luma/chroma form.
+	// Luma is r * 54 + g * 183 + b * 19 (the weights sum to 256); the chroma terms are the blue and red
+	// channels scaled by 256 with that luma subtracted. Alpha is not used.
+	static inline void rgb_to_y_cb_cr(const color_rgba& c, ycbcr& dst)
+	{
+		const int red = c.r;
+		const int green = c.g;
+		const int blue = c.b;
+
+		const int luma = red * 54 + green * 183 + blue * 19;
+
+		dst.m_cr = (red << 8) - luma;
+		dst.m_cb = (blue << 8) - luma;
+		dst.m_y = luma;
+	}
+
+	// Returns the weighted squared distance between two ycbcr values:
+	// 4 * dY^2 + dCr^2 + dCb^2, so luma differences count four times as much as either chroma difference.
+	static inline uint64_t color_diff(const ycbcr& a, const ycbcr& b)
+	{
+		// Differences are formed in int, then widened so the squares cannot overflow 32 bits.
+		const int64_t dy = a.m_y - b.m_y;
+		const int64_t dcr = a.m_cr - b.m_cr;
+		const int64_t dcb = a.m_cb - b.m_cb;
+
+		const int64_t luma_term = dy * dy * 4;
+		const int64_t cr_term = dcr * dcr;
+		const int64_t cb_term = dcb * dcb;
+
+		return luma_term + cr_term + cb_term;
+	}
+
+	// Returns the squared RGB distance from color c to the point (r + d, g + d, b + d), each component clamped
+	// with clamp255(), where d is the rounded mean of c's per-channel offsets from (r, g, b).
+	// In other words: how far c lies from the gray-direction line through (r, g, b).
+	static inline int gray_distance2(const color_rgba& c, int r, int g, int b)
+	{
+		const int base[3] = { r, g, b };
+
+		// Mean offset of c from the base color along the (1,1,1) direction.
+		int offset_sum = 0;
+		for (uint32_t i = 0; i < 3; i++)
+			offset_sum += (int)c[i] - base[i];
+		const int gray_shift = (offset_sum + 1) / 3;
+
+		int dist2 = 0;
+		for (uint32_t i = 0; i < 3; i++)
+		{
+			const int gray_point = clamp255(base[i] + gray_shift);
+			const int delta = c[i] - gray_point;
+			dist2 += delta * delta;
+		}
+
+		return dist2;
+	}
+
+	// Estimates which way a 4x4 block (16 pixels, row-major) should be split into two halves.
+	// Each half (upper/lower 4x2, left/right 2x4) gets an average RGB color; every pixel is then scored with
+	// gray_distance2() against its half's average. Returns true when the upper/lower split has the smaller
+	// total score, false otherwise.
+	static bool pack_etc1_estimate_flipped(const color_rgba* pSrc_pixels)
+	{
+		// Per-channel sums of the four 2x2 quadrants, indexed [channel][x >> 1][y >> 1].
+		int quad_sums[3][2][2];
+		memset(quad_sums, 0, sizeof(quad_sums));
+
+		for (uint32_t y = 0; y < 4; y++)
+			for (uint32_t x = 0; x < 4; x++)
+				for (uint32_t c = 0; c < 3; c++)
+					quad_sums[c][x >> 1][y >> 1] += pSrc_pixels[x + (y * 4)][c];
+
+		// Rounded 8-pixel averages of each half.
+		enum { cUpper = 0, cLower, cLeft, cRight, cTotalHalves };
+		int half_avg[cTotalHalves][3];
+		for (uint32_t c = 0; c < 3; c++)
+		{
+			half_avg[cUpper][c] = (quad_sums[c][0][0] + quad_sums[c][1][0] + 4) / 8;
+			half_avg[cLower][c] = (quad_sums[c][0][1] + quad_sums[c][1][1] + 4) / 8;
+			half_avg[cLeft][c] = (quad_sums[c][0][0] + quad_sums[c][0][1] + 4) / 8;
+			half_avg[cRight][c] = (quad_sums[c][1][0] + quad_sums[c][1][1] + 4) / 8;
+		}
+
+		// Every pixel belongs to one vertical half and one horizontal half; score it against both.
+		int upper_lower_sum = 0, left_right_sum = 0;
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			const int* pRow_avg = half_avg[(y < 2) ? cUpper : cLower];
+
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				const int* pCol_avg = half_avg[(x < 2) ? cLeft : cRight];
+				const color_rgba& px = pSrc_pixels[x + (y * 4)];
+
+				upper_lower_sum += gray_distance2(px, pRow_avg[0], pRow_avg[1], pRow_avg[2]);
+				left_right_sum += gray_distance2(px, pCol_avg[0], pCol_avg[1], pCol_avg[2]);
+			}
+		}
+
+		return upper_lower_sum < left_right_sum;
+	}
+
+	static void compute_etc1_hints(etc_block& best_etc1_blk, uint32_t& best_etc1_bias, const uastc_encode_results& best_results, const color_rgba block[4][4], const color_rgba decoded_uastc_block[4][4], int level, uint32_t flags)
+	{
+		best_etc1_bias = 0;
+
+		if (best_results.m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR)
+		{
+			pack_etc1_block_solid_color(best_etc1_blk, &best_results.m_solid_color.m_comps[0]);
+			return;
+		}
+
+		const bool faster_etc1 = (flags & cPackUASTCETC1FasterHints) != 0;
+		const bool fastest_etc1 = (flags & cPackUASTCETC1FastestHints) != 0;
+
+		const bool has_bias = g_uastc_mode_has_etc1_bias[best_results.m_uastc_mode];
+
+		// 0 should be at the top, but we need 13 first because it represents bias (0,0,0).
+		const uint8_t s_sorted_bias_modes[32] = { 13, 0, 22, 29, 27, 12, 26, 9, 30, 31, 8, 10, 25, 2, 23, 5, 15, 7, 3, 11, 6, 17, 28, 18, 1, 19, 20, 21, 24, 4, 14, 16 };
+
+		uint32_t last_bias = 1;
+		bool use_faster_bias_mode_table = false;
+		const bool flip_estimate = (level <= cPackUASTCLevelFaster) || (faster_etc1) || (fastest_etc1);
+		if (has_bias)
+		{
+			switch (level)
+			{
+			case cPackUASTCLevelFastest:
+			{
+				last_bias = fastest_etc1 ? 1 : (faster_etc1 ? 1 : 2);
+				use_faster_bias_mode_table = true;
+				break;
+			}
+			case cPackUASTCLevelFaster:
+			{
+				last_bias = fastest_etc1 ? 1 : (faster_etc1 ? 3 : 5);
+				use_faster_bias_mode_table = true;
+				break;
+			}
+			case cPackUASTCLevelDefault:
+			{
+				last_bias = fastest_etc1 ? 1 : (faster_etc1 ? 10 : 20);
+				use_faster_bias_mode_table = true;
+				break;
+			}
+			case cPackUASTCLevelSlower:
+			{
+				last_bias = fastest_etc1 ? 1 : (faster_etc1 ? 16 : 32);
+				use_faster_bias_mode_table = true;
+				break;
+			}
+			default:
+			{
+				last_bias = 32;
+				break;
+			}
+			}
+		}
+
+		memset(&best_etc1_blk, 0, sizeof(best_etc1_blk));
+		uint64_t best_err = UINT64_MAX;
+
+		etc_block trial_block;
+		memset(&trial_block, 0, sizeof(trial_block));
+
+		ycbcr block_ycbcr[4][4], decoded_uastc_block_ycbcr[4][4];
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				rgb_to_y_cb_cr(block[y][x], block_ycbcr[y][x]);
+				rgb_to_y_cb_cr(decoded_uastc_block[y][x], decoded_uastc_block_ycbcr[y][x]);
+			}
+		}
+
+		uint32_t first_flip = 0, last_flip = 2;
+		uint32_t first_individ = 0, last_individ = 2;
+		
+		if (flags & cPackUASTCETC1DisableFlipAndIndividual)
+		{
+			last_flip = 1;
+			last_individ = 1;
+		}
+		else if (flip_estimate)
+		{
+			if (pack_etc1_estimate_flipped(&decoded_uastc_block[0][0]))
+				first_flip = 1;
+			last_flip = first_flip + 1;
+		}
+										
+		for (uint32_t flip = first_flip; flip < last_flip; flip++)
+		{
+			trial_block.set_flip_bit(flip != 0);
+
+			for (uint32_t individ = first_individ; individ < last_individ; individ++)
+			{
+				const uint32_t mul = individ ? 15 : 31;
+				
+				trial_block.set_diff_bit(individ == 0);
+
+				color_rgba unbiased_block_colors[2];
+
+				int min_r[2] = { 255, 255 }, min_g[2] = { 255, 255 }, min_b[2] = { 255, 255 }, max_r[2] = { 0, 0 }, max_g[2] = { 0, 0 }, max_b[2] = { 0, 0 };
+
+				for (uint32_t subset = 0; subset < 2; subset++)
+				{
+					uint32_t avg_color[3];
+					memset(avg_color, 0, sizeof(avg_color));
+
+					for (uint32_t j = 0; j < 8; j++)
+					{
+						const etc_coord2 &c = g_etc1_pixel_coords[flip][subset][j];
+						const color_rgba& p = decoded_uastc_block[c.m_y][c.m_x];
+												
+						avg_color[0] += p.r;
+						avg_color[1] += p.g;
+						avg_color[2] += p.b;
+
+						min_r[subset] = basisu::minimum<uint32_t>(min_r[subset], p.r);
+						min_g[subset] = basisu::minimum<uint32_t>(min_g[subset], p.g);
+						min_b[subset] = basisu::minimum<uint32_t>(min_b[subset], p.b);
+
+						max_r[subset] = basisu::maximum<uint32_t>(max_r[subset], p.r);
+						max_g[subset] = basisu::maximum<uint32_t>(max_g[subset], p.g);
+						max_b[subset] = basisu::maximum<uint32_t>(max_b[subset], p.b);
+					} // j
+
+					unbiased_block_colors[subset][0] = (uint8_t)((avg_color[0] * mul + 1020) / (8 * 255));
+					unbiased_block_colors[subset][1] = (uint8_t)((avg_color[1] * mul + 1020) / (8 * 255));
+					unbiased_block_colors[subset][2] = (uint8_t)((avg_color[2] * mul + 1020) / (8 * 255));
+					unbiased_block_colors[subset][3] = 0;
+										
+				} // subset
+												
+				for (uint32_t bias_iter = 0; bias_iter < last_bias; bias_iter++)
+				{
+					const uint32_t bias = use_faster_bias_mode_table ? s_sorted_bias_modes[bias_iter] : bias_iter;
+										
+					color_rgba block_colors[2];
+					for (uint32_t subset = 0; subset < 2; subset++)
+						block_colors[subset] = has_bias ? apply_etc1_bias((color32&)unbiased_block_colors[subset], bias, mul, subset) : unbiased_block_colors[subset];
+
+					if (individ)
+						trial_block.set_block_color4(block_colors[0], block_colors[1]);
+					else
+						trial_block.set_block_color5_clamp(block_colors[0], block_colors[1]);
+
+					uint32_t range[2];
+					for (uint32_t subset = 0; subset < 2; subset++)
+					{
+						const color_rgba base_c(trial_block.get_block_color(subset, true));
+
+						const int pos_r = iabs(max_r[subset] - base_c.r);
+						const int neg_r = iabs(base_c.r - min_r[subset]);
+
+						const int pos_g = iabs(max_g[subset] - base_c.g);
+						const int neg_g = iabs(base_c.g - min_g[subset]);
+
+						const int pos_b = iabs(max_b[subset] - base_c.b);
+						const int neg_b = iabs(base_c.b - min_b[subset]);
+
+						range[subset] = maximum(maximum(pos_r, neg_r, pos_g, neg_g), pos_b, neg_b);
+					}
+
+					uint32_t best_inten_table[2] = { 0, 0 };
+
+					for (uint32_t subset = 0; subset < 2; subset++)
+					{
+						uint64_t best_subset_err = UINT64_MAX;
+
+						const uint32_t inten_table_limit = (level == cPackUASTCLevelVerySlow) ? 8 : ((range[subset] > 51) ? 8 : (range[subset] >= 7 ? 4 : 2));
+						
+						for (uint32_t inten_table = 0; inten_table < inten_table_limit; inten_table++)
+						{
+							trial_block.set_inten_table(subset, inten_table);
+
+							color_rgba color_table[4];
+							trial_block.get_block_colors(color_table, subset);
+
+							ycbcr color_table_ycbcr[4];
+							for (uint32_t i = 0; i < 4; i++)
+								rgb_to_y_cb_cr(color_table[i], color_table_ycbcr[i]);
+
+							uint64_t total_error = 0;
+							if (flip)
+							{
+								for (uint32_t y = 0; y < 2; y++)
+								{
+									{
+										const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][0];
+										total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c));
+									}
+									{
+										const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][1];
+										total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c));
+									}
+									{
+										const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][2];
+										total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c));
+									}
+									{
+										const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][3];
+										total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c));
+									}
+									if (total_error >= best_subset_err)
+										break;
+								}
+							}
+							else
+							{
+								for (uint32_t y = 0; y < 4; y++)
+								{
+									{
+										const ycbcr& c = decoded_uastc_block_ycbcr[y][subset * 2 + 0];
+										total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c));
+									}
+									{
+										const ycbcr& c = decoded_uastc_block_ycbcr[y][subset * 2 + 1];
+										total_error += minimum(color_diff(color_table_ycbcr[0], c), color_diff(color_table_ycbcr[1], c), color_diff(color_table_ycbcr[2], c), color_diff(color_table_ycbcr[3], c));
+									}
+								}
+								if (total_error >= best_subset_err)
+									break;
+							}
+
+							if (total_error < best_subset_err)
+							{
+								best_subset_err = total_error;
+								best_inten_table[subset] = inten_table;
+							}
+
+						} // inten_table
+
+					} // subset
+
+					trial_block.set_inten_table(0, best_inten_table[0]);
+					trial_block.set_inten_table(1, best_inten_table[1]);
+
+					// Compute error against the ORIGINAL block.
+					uint64_t err = 0;
+
+					for (uint32_t subset = 0; subset < 2; subset++)
+					{
+						color_rgba color_table[4];
+						trial_block.get_block_colors(color_table, subset);
+
+						ycbcr color_table_ycbcr[4];
+						for (uint32_t i = 0; i < 4; i++)
+							rgb_to_y_cb_cr(color_table[i], color_table_ycbcr[i]);
+
+						if (flip)
+						{
+							for (uint32_t y = 0; y < 2; y++)
+							{
+								for (uint32_t x = 0; x < 4; x++)
+								{
+									const ycbcr& c = decoded_uastc_block_ycbcr[subset * 2 + y][x];
+									const uint64_t best_index_err = minimum(color_diff(color_table_ycbcr[0], c) << 2, (color_diff(color_table_ycbcr[1], c) << 2) + 1, (color_diff(color_table_ycbcr[2], c) << 2) + 2, (color_diff(color_table_ycbcr[3], c) << 2) + 3);
+
+									const uint32_t best_index = (uint32_t)best_index_err & 3;
+									err += color_diff(block_ycbcr[subset * 2 + y][x], color_table_ycbcr[best_index]);
+								}
+								if (err >= best_err)
+									break;
+							}
+						}
+						else
+						{
+							for (uint32_t y = 0; y < 4; y++)
+							{
+								for (uint32_t x = 0; x < 2; x++)
+								{
+									const ycbcr& c = decoded_uastc_block_ycbcr[y][subset * 2 + x];
+									const uint64_t best_index_err = minimum(color_diff(color_table_ycbcr[0], c) << 2, (color_diff(color_table_ycbcr[1], c) << 2) + 1, (color_diff(color_table_ycbcr[2], c) << 2) + 2, (color_diff(color_table_ycbcr[3], c) << 2) + 3);
+
+									const uint32_t best_index = (uint32_t)best_index_err & 3;
+									err += color_diff(block_ycbcr[y][subset * 2 + x], color_table_ycbcr[best_index]);
+								}
+								if (err >= best_err)
+									break;
+							}
+						}
+
+					} // subset
+
+					if (err < best_err)
+					{
+						best_err = err;
+
+						best_etc1_blk = trial_block;
+						best_etc1_bias = bias;
+					}
+
+				} // bias_iter
+
+			} // individ
+
+		} // flip
+	}
+
+	// Output of uastc_pack_eac_a8(): the parameter triple selected by its search.
+	struct uastc_pack_eac_a8_results
+	{
+		// Value added to every scaled table entry.
+		uint32_t m_base;
+		// Row index into g_etc2_eac_tables (the search covers rows 0..15).
+		uint32_t m_table;
+		// Scale applied to the table entry (the search keeps it within 1..15).
+		uint32_t m_multiplier;
+	};
+	
+	// Searches for the (table, base, multiplier) triple whose 8 reconstructed values
+	// clamp255(multiplier * g_etc2_eac_tables[table][s] + base) best match the input values.
+	//
+	// results:         receives the winning triple. It is left untouched if no candidate is evaluated.
+	// pPixels:         num_pixels 8-bit values to approximate (num_pixels must be <= 16).
+	// base_search_rad: how far to each side of the estimated base to search.
+	// mul_search_rad:  how far to each side of the estimated multiplier to search (kept within 1..15).
+	// table_mask:      bit N set means table N (0..15) is allowed.
+	//
+	// Returns the sum of squared per-value errors of the winner. Constant input returns 0 immediately
+	// using table 13 with multiplier 1. UINT64_MAX is returned if table_mask enables none of tables 0..15.
+	static uint64_t uastc_pack_eac_a8(uastc_pack_eac_a8_results& results, const uint8_t* pPixels, uint32_t num_pixels, uint32_t base_search_rad, uint32_t mul_search_rad, uint32_t table_mask)
+	{
+		assert(num_pixels <= 16);
+
+		// Determine the span of the input values.
+		uint32_t lo_alpha = 255, hi_alpha = 0;
+		for (uint32_t p = 0; p < num_pixels; p++)
+		{
+			const uint32_t cur = pPixels[p];
+			if (cur < lo_alpha) lo_alpha = cur;
+			if (cur > hi_alpha) hi_alpha = cur;
+		}
+
+		// Constant input needs no search.
+		if (lo_alpha == hi_alpha)
+		{
+			results.m_table = 13;
+			results.m_multiplier = 1;
+			results.m_base = lo_alpha;
+			return 0;
+		}
+
+		const uint32_t alpha_range = hi_alpha - lo_alpha;
+
+		uint64_t best_err = UINT64_MAX;
+
+		for (uint32_t table = 0; table < 16; table++)
+		{
+			// Skip tables the caller has masked out.
+			if (!(table_mask & (1U << table)))
+				continue;
+
+			const auto& tbl = g_etc2_eac_tables[table];
+
+			// Estimate the base from where zero sits inside the table's value range.
+			const float table_range = (float)(tbl[ETC2_EAC_MAX_VALUE_SELECTOR] - tbl[ETC2_EAC_MIN_VALUE_SELECTOR]);
+			const int center = (int)roundf(lerp((float)lo_alpha, (float)hi_alpha, (float)(0 - tbl[ETC2_EAC_MIN_VALUE_SELECTOR]) / table_range));
+
+			const int base_lo = clamp255(center - base_search_rad);
+			const int base_hi = clamp255(center + base_search_rad);
+
+			// Estimate the multiplier from the ratio of the input span to the table span.
+			const int mul_est = (int)roundf(alpha_range / table_range);
+			const int mul_lo = clamp<int>(mul_est - mul_search_rad, 1, 15);
+			const int mul_hi = clamp<int>(mul_est + mul_search_rad, 1, 15);
+
+			for (int base = base_lo; base <= base_hi; base++)
+			{
+				for (int multiplier = mul_lo; multiplier <= mul_hi; multiplier++)
+				{
+					uint64_t trial_err = 0;
+
+					for (uint32_t p = 0; p < num_pixels; p++)
+					{
+						const int target = pPixels[p];
+
+						// Smallest absolute error over the 8 selectors.
+						uint32_t closest = UINT32_MAX;
+						for (uint32_t s = 0; s < 8; s++)
+						{
+							const int v = clamp255((int)multiplier * tbl[s] + (int)base);
+
+							const uint32_t dist = iabs(target - v);
+							if (dist < closest)
+								closest = dist;
+						}
+
+						trial_err += closest * closest;
+
+						// This candidate can no longer beat the current best.
+						if (trial_err >= best_err)
+							break;
+					}
+
+					if (trial_err >= best_err)
+						continue;
+
+					best_err = trial_err;
+					results.m_base = base;
+					results.m_multiplier = multiplier;
+					results.m_table = table;
+
+					// An exact match cannot be improved on.
+					if (!best_err)
+						return best_err;
+
+				} // multiplier
+
+			} // base
+
+		} // table
+
+		return best_err;
+	}
+
+	// Percentage weight given to the transcoded-BC7 error in encode_uastc() when neither "favor" flag is set.
+	const int32_t DEFAULT_BC7_ERROR_WEIGHT = 50;
+	// encode_uastc() only considers candidates whose sqrt(UASTC error) is at most this many times the best one.
+	const float UASTC_ERROR_THRESH = 1.3f;
+
+	// TODO: This is a quick hack to favor certain modes when we know we'll be followed up with an RDO postprocess.
+	// Returns the factor applied to a candidate's error for the given UASTC mode:
+	// modes 0 and 10 get a factor below 1 (so they win ties and near-ties), every other mode gets 1.
+	static inline float get_uastc_mode_weight(uint32_t mode)
+	{
+		const float FAVORED_MODE_WEIGHT = .8f;
+		const float NEUTRAL_MODE_WEIGHT = 1.0f;
+
+		return ((mode == 0) || (mode == 10)) ? FAVORED_MODE_WEIGHT : NEUTRAL_MODE_WEIGHT;
+	}
+
+	// Encodes one 4x4 block of RGBA pixels into a packed UASTC block.
+	//
+	// pRGBAPixels:  16 pixels, 4 bytes each, read as color_rgba[4][4] and indexed [y][x].
+	// output_block: receives the packed block, including the BC1, ETC1 and ETC2 EAC hints.
+	// flags:        the low 3 bits select the effort level (clamped to cPackUASTCLevelFastest..cPackUASTCLevelVerySlow).
+	//               cPackUASTCFavorUASTCError, cPackUASTCFavorBC7Error and cPackUASTCFavorSimplerModes are tested here,
+	//               and the whole value is also forwarded to compute_etc1_hints().
+	//
+	// Outline: solid-color blocks take a fast path. Otherwise every mode enabled for the level is tried, each candidate
+	// is scored by its UASTC error and by its error after transcoding to BC7, one candidate is selected, and the hints
+	// are computed from the decoded winner before the final pack.
+	void encode_uastc(const uint8_t* pRGBAPixels, uastc_block& output_block, uint32_t flags)
+	{
+//		printf("encode_uastc: \n");
+//		for (int i = 0; i < 16; i++)
+//			printf("[%u %u %u %u] ", pRGBAPixels[i * 4 + 0], pRGBAPixels[i * 4 + 1], pRGBAPixels[i * 4 + 2], pRGBAPixels[i * 4 + 3]);
+//		printf("\n");
+
+		const color_rgba(*block)[4] = reinterpret_cast<const color_rgba(*)[4]>(pRGBAPixels);
+
+		// Classify the block in one pass:
+		//   solid_color - every pixel equals the first one
+		//   has_alpha   - at least one pixel has alpha below 255
+		//   is_la       - every pixel has r == g == b
+		bool solid_color = true, has_alpha = false, is_la = true;
+
+		const color_rgba first_color(block[0][0]);
+		for (uint32_t y = 0; y < 4; y++)
+		{
+			for (uint32_t x = 0; x < 4; x++)
+			{
+				if (block[y][x].a < 255)
+					has_alpha = true;
+
+				if (block[y][x] != first_color)
+					solid_color = false;
+
+				if ((block[y][x].r != block[y][x].g) || (block[y][x].r != block[y][x].b))
+					is_la = false;
+			}
+		}
+
+		if (solid_color)
+		{
+			// Solid color blocks are so common that we handle them specially and as quickly as we can.
+			uastc_encode_results solid_results;
+			solid_results.m_uastc_mode = UASTC_MODE_INDEX_SOLID_COLOR;
+			solid_results.m_astc_err = 0;
+			solid_results.m_common_pattern = 0;
+			solid_results.m_solid_color = first_color;
+			memset(&solid_results.m_astc, 0, sizeof(solid_results.m_astc));
+						
+			etc_block etc1_blk;
+			uint32_t etc1_bias = 0;
+
+			pack_etc1_block_solid_color(etc1_blk, &first_color.m_comps[0]);
+
+			// Only the table and multiplier are set here; the remaining members of eac_a8_blk are left as-is.
+			eac_a8_block eac_a8_blk;
+			eac_a8_blk.m_table = 0;
+			eac_a8_blk.m_multiplier = 1;
+
+			pack_uastc(output_block, solid_results, etc1_blk, etc1_bias, eac_a8_blk, false, false);
+
+//			printf(" Solid\n");
+
+			return;
+		}
+		
+		// The effort level lives in the low 3 bits of flags; it is clamped to the valid range below.
+		int level = flags & 7;
+		const bool favor_uastc_error = (flags & cPackUASTCFavorUASTCError) != 0;
+		// Favoring UASTC error takes precedence if both "favor" flags are set.
+		const bool favor_bc7_error = !favor_uastc_error && ((flags & cPackUASTCFavorBC7Error) != 0);
+		//const bool etc1_perceptual = true;
+		
+		uastc_encode_results results[MAX_ENCODE_RESULTS];
+						
+		level = clampi(level, cPackUASTCLevelFastest, cPackUASTCLevelVerySlow);
+		
+		// Set all options to slowest, then configure from there depending on the selected level.
+		uint32_t mode_mask = UINT32_MAX;
+		uint32_t uber_level = 6;
+		bool estimate_partition = false;
+		bool always_try_alpha_modes = true;
+		uint32_t eac_a8_mul_search_rad = 3;
+		uint32_t eac_a8_table_mask = UINT32_MAX;
+		uint32_t least_squares_passes = 2;
+		bool bc1_hints = true;
+		bool only_use_la_on_transparent_blocks = false;
+		
+		// In mode_mask, bit N enables UASTC mode N. In eac_a8_table_mask, bit N enables EAC table N.
+		switch (level)
+		{
+		case cPackUASTCLevelFastest:
+		{
+			mode_mask = (1 << 0) | (1 << 8) | 
+				(1 << 11) | (1 << 12) |
+				(1 << 15);
+			always_try_alpha_modes = false;
+			eac_a8_mul_search_rad = 0;
+			eac_a8_table_mask = (1 << 2) | (1 << 8) | (1 << 11) | (1 << 13);
+			uber_level = 0;
+			least_squares_passes = 1;
+			bc1_hints = false;
+			estimate_partition = true;
+			only_use_la_on_transparent_blocks = true;
+			break;
+		}
+		case cPackUASTCLevelFaster:
+		{
+			mode_mask = (1 << 0) | (1 << 4) | (1 << 6) | (1 << 8) |
+				(1 << 9) | (1 << 11) | (1 << 12) |
+				(1 << 15) | (1 << 17);
+			always_try_alpha_modes = false;
+			eac_a8_mul_search_rad = 0;
+			eac_a8_table_mask = (1 << 2) | (1 << 8) | (1 << 11) | (1 << 13);
+			uber_level = 0;
+			least_squares_passes = 1;
+			estimate_partition = true;
+			break;
+		}
+		case cPackUASTCLevelDefault: 
+		{
+			mode_mask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 6) | (1 << 8) |
+				(1 << 9) | (1 << 10) | (1 << 11) | (1 << 12) | (1 << 13) |
+				(1 << 15) | (1 << 16) | (1 << 17);
+			always_try_alpha_modes = false;
+			eac_a8_mul_search_rad = 1;
+			eac_a8_table_mask = (1 << 0) | (1 << 2) | (1 << 6) | (1 << 7) | (1 << 8) | (1 << 10) | (1 << 11) | (1 << 13);
+			uber_level = 1;
+			least_squares_passes = 1;
+			estimate_partition = true;
+			break;
+		}
+		case cPackUASTCLevelSlower:
+		{
+			always_try_alpha_modes = false;
+			eac_a8_mul_search_rad = 2;
+			uber_level = 3;
+			estimate_partition = true;
+			break;
+		}
+		case cPackUASTCLevelVerySlow:
+		{
+			// Keeps the slowest defaults set above.
+			break;
+		}
+		}
+
+#if BASISU_SUPPORT_FORCE_MODE
+		// Debug aid: force_mode advances to the next mode on every call, and all modes are enabled so the
+		// candidate list can contain it. The selection loop below picks the candidate with that mode.
+		static int force_mode = -1;
+		force_mode = (force_mode + 1) % TOTAL_UASTC_MODES;
+		mode_mask = UINT32_MAX;
+		always_try_alpha_modes = true;
+		only_use_la_on_transparent_blocks = false;
+#endif
+
+		// HACK HACK
+		//mode_mask &= ~(1 << 18);
+		//mode_mask = (1 << 18)| (1 << 10);
+																				
+		uint32_t total_results = 0;
+				
+		// When requested by the level, an opaque block is not treated as LA even if r == g == b everywhere.
+		if (only_use_la_on_transparent_blocks)
+		{
+			if ((is_la) && (!has_alpha))
+				is_la = false;
+		}
+
+		const bool try_alpha_modes = has_alpha || always_try_alpha_modes;
+		
+		bc7enc_compress_block_params comp_params;
+		memset(&comp_params, 0, sizeof(comp_params));
+		comp_params.m_max_partitions_mode1 = 64;
+		comp_params.m_least_squares_passes = least_squares_passes;
+		comp_params.m_weights[0] = 1;
+		comp_params.m_weights[1] = 1;
+		comp_params.m_weights[2] = 1;
+		comp_params.m_weights[3] = 1;
+		comp_params.m_uber_level = uber_level;
+
+		// Each astc_mode*() call below receives the results array and total_results by reference
+		// and is expected to append its candidate(s) there.
+
+		// Modes 15-17 are only tried on LA blocks.
+		if (is_la)
+		{
+			if (mode_mask & (1U << 15))
+				astc_mode15(block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 16))
+				astc_mode9_or_16(16, block, results, total_results, comp_params, estimate_partition ? 4 : 0);
+
+			if (mode_mask & (1U << 17))
+				astc_mode11_or_17(17, block, results, total_results, comp_params);
+		}
+
+		// Modes 0-7 and 18 are only tried on fully opaque blocks.
+		if (!has_alpha)
+		{
+			if (mode_mask & (1U << 0))
+				astc_mode0_or_18(0, block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 1))
+				astc_mode1(block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 2))
+				astc_mode2(block, results, total_results, comp_params, estimate_partition);
+
+			if (mode_mask & (1U << 3))
+				astc_mode3(block, results, total_results, comp_params, estimate_partition);
+
+			if (mode_mask & (1U << 4))
+				astc_mode4(block, results, total_results, comp_params, estimate_partition);
+
+			if (mode_mask & (1U << 5))
+				astc_mode5(block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 6))
+				astc_mode6(block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 7))
+				astc_mode7(block, results, total_results, comp_params, estimate_partition);
+
+			if (mode_mask & (1U << 18))
+				astc_mode0_or_18(18, block, results, total_results, comp_params);
+		}
+
+		// Modes 9-14 are tried when the block has alpha, or unconditionally when the level asks for it.
+		if (try_alpha_modes)
+		{
+			if (mode_mask & (1U << 9))
+				astc_mode9_or_16(9, block, results, total_results, comp_params, estimate_partition ? 4 : 0);
+
+			if (mode_mask & (1U << 10))
+				astc_mode10(block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 11))
+				astc_mode11_or_17(11, block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 12))
+				astc_mode12(block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 13))
+				astc_mode13(block, results, total_results, comp_params);
+
+			if (mode_mask & (1U << 14))
+				astc_mode14(block, results, total_results, comp_params);
+		}
+
+		// NOTE(review): with asserts disabled and no candidates, best_index below stays -1 and results[-1] is read.
+		assert(total_results);
+		
+		// Fix up the errors so we consistently have LA, RGB, or RGBA error.
+		for (uint32_t i = 0; i < total_results; i++)
+		{
+			uastc_encode_results& r = results[i];
+			if (!is_la)
+			{
+				// Non-LA block encoded with an LA mode: recompute the error with color_distance().
+				if (g_uastc_mode_is_la[r.m_uastc_mode])
+				{
+					color_rgba unpacked_block[16];
+					unpack_uastc(r.m_uastc_mode, r.m_common_pattern, r.m_solid_color.get_color32(), r.m_astc, (basist::color32 *)unpacked_block, false);
+
+					uint64_t total_err = 0;
+					for (uint32_t j = 0; j < 16; j++)
+						total_err += color_distance(unpacked_block[j], ((const color_rgba*)block)[j], true);
+
+					r.m_astc_err = total_err;
+				}
+			}
+			else
+			{
+				// LA block encoded with a non-LA mode: recompute the error with color_distance_la().
+				if (!g_uastc_mode_is_la[r.m_uastc_mode])
+				{
+					color_rgba unpacked_block[16];
+					unpack_uastc(r.m_uastc_mode, r.m_common_pattern, r.m_solid_color.get_color32(), r.m_astc, (basist::color32 *)unpacked_block, false);
+
+					uint64_t total_err = 0;
+					for (uint32_t j = 0; j < 16; j++)
+						total_err += color_distance_la(unpacked_block[j], ((const color_rgba*)block)[j]);
+
+					r.m_astc_err = total_err;
+				}
+			}
+		}
+				
+		unpacked_uastc_block unpacked_ublock;
+		memset(&unpacked_ublock, 0, sizeof(unpacked_ublock));
+
+		// Per-candidate scores. Entries are only written for candidates the loop below actually reaches.
+		uint64_t total_overall_err[MAX_ENCODE_RESULTS];
+		float uastc_err_f[MAX_ENCODE_RESULTS];
+		double best_uastc_err_f = 1e+20f;
+
+		int best_index = -1;
+
+		if (total_results == 1)
+		{
+			// Only one candidate, nothing to compare.
+			best_index = 0;
+		}
+		else
+		{
+			// Percentage weights of the two error terms in the combined score.
+			const uint32_t bc7_err_weight = favor_bc7_error ? 100 : ((favor_uastc_error ? 0 : DEFAULT_BC7_ERROR_WEIGHT));
+			const uint32_t uastc_err_weight = favor_bc7_error ? 0 : 100;
+
+			// Find best overall results, balancing UASTC and UASTC->BC7 error.
+			// We purposely allow UASTC error to increase a little, if doing so lowers the BC7 error.
+			for (uint32_t i = 0; i < total_results; i++)
+			{
+#if BASISU_SUPPORT_FORCE_MODE
+				if (results[i].m_uastc_mode == force_mode)
+				{
+					best_index = i;
+					break;
+				}
+#endif
+
+				unpacked_ublock.m_mode = results[i].m_uastc_mode;
+				unpacked_ublock.m_astc = results[i].m_astc;
+				unpacked_ublock.m_common_pattern = results[i].m_common_pattern;
+				unpacked_ublock.m_solid_color = results[i].m_solid_color.get_color32();
+
+				color_rgba decoded_uastc_block[4][4];
+				bool success = unpack_uastc(results[i].m_uastc_mode, results[i].m_common_pattern, results[i].m_solid_color.get_color32(), results[i].m_astc, (basist::color32 *)&decoded_uastc_block[0][0], false);
+				(void)success;
+				VALIDATE(success);
+
+				uint64_t total_uastc_rgb_err, total_uastc_rgba_err, total_uastc_la_err;
+				compute_block_error(block, decoded_uastc_block, total_uastc_rgb_err, total_uastc_rgba_err, total_uastc_la_err);
+
+				// Validate the computed error, or we'll go mad if it's inaccurate.
+				if (results[i].m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR)
+				{
+					VALIDATE(total_uastc_rgba_err == 0);
+				}
+				else if (is_la)
+				{
+					VALIDATE(total_uastc_la_err == results[i].m_astc_err);
+				}
+				else if (g_uastc_mode_has_alpha[results[i].m_uastc_mode])
+				{
+					VALIDATE(total_uastc_rgba_err == results[i].m_astc_err);
+				}
+				else
+				{
+					VALIDATE(total_uastc_rgb_err == results[i].m_astc_err);
+				}
+
+				// Transcode to BC7
+				bc7_optimization_results bc7_results;
+				transcode_uastc_to_bc7(unpacked_ublock, bc7_results);
+
+				bc7_block bc7_data;
+				encode_bc7_block(&bc7_data, &bc7_results);
+
+				color_rgba decoded_bc7_block[4][4];
+				unpack_block(texture_format::cBC7, &bc7_data, &decoded_bc7_block[0][0]);
+
+				// Compute BC7 error
+				uint64_t total_bc7_la_err, total_bc7_rgb_err, total_bc7_rgba_err;
+				compute_block_error(block, decoded_bc7_block, total_bc7_rgb_err, total_bc7_rgba_err, total_bc7_la_err);
+
+				if (results[i].m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR)
+				{
+					VALIDATE(total_bc7_rgba_err == 0);
+
+					best_index = i;
+					break;
+				}
+
+				// Pick the error metric matching the block class (LA, RGBA, or RGB).
+				uint64_t total_uastc_err = 0, total_bc7_err = 0;
+				if (is_la)
+				{
+					total_bc7_err = total_bc7_la_err;
+					total_uastc_err = total_uastc_la_err;
+				}
+				else if (has_alpha)
+				{
+					total_bc7_err = total_bc7_rgba_err;
+					total_uastc_err = total_uastc_rgba_err;
+				}
+				else
+				{
+					total_bc7_err = total_bc7_rgb_err;
+					total_uastc_err = total_uastc_rgb_err;
+				}
+
+				total_overall_err[i] = ((total_bc7_err * bc7_err_weight) / 100) + ((total_uastc_err * uastc_err_weight) / 100);
+				if (!total_overall_err[i])
+				{
+					// A zero combined score cannot be beaten; stop scanning.
+					best_index = i;
+					break;
+				}
+
+				uastc_err_f[i] = sqrtf((float)total_uastc_err);
+
+				if (uastc_err_f[i] < best_uastc_err_f)
+				{
+					best_uastc_err_f = uastc_err_f[i];
+				}
+
+			} // total_results
+
+			if (best_index < 0)
+			{
+				uint64_t best_err = UINT64_MAX;
+
+				if ((best_uastc_err_f == 0.0f) || (favor_bc7_error))
+				{
+					// No UASTC error window applies here: take the lowest weighted combined score over all candidates.
+					for (uint32_t i = 0; i < total_results; i++)
+					{
+						// TODO: This is a quick hack to favor modes 0 or 10 for better RDO compression.
+						const float err_weight = (flags & cPackUASTCFavorSimplerModes) ? get_uastc_mode_weight(results[i].m_uastc_mode) : 1.0f;
+
+						const uint64_t w = (uint64_t)(total_overall_err[i] * err_weight);
+						if (w  < best_err)
+						{
+							best_err = w;
+							best_index = i;
+							if (!best_err)
+								break;
+						}
+					} // i
+				}
+				else
+				{
+					// Scan the UASTC results, and consider all results within a window that has the best UASTC+BC7 error.
+					for (uint32_t i = 0; i < total_results; i++)
+					{
+						// Ratio of this candidate's sqrt(UASTC error) to the smallest one seen.
+						double err_delta = uastc_err_f[i] / best_uastc_err_f;
+
+						if (err_delta <= UASTC_ERROR_THRESH)
+						{
+							// TODO: This is a quick hack to favor modes 0 or 10 for better RDO compression.
+							const float err_weight = (flags & cPackUASTCFavorSimplerModes) ? get_uastc_mode_weight(results[i].m_uastc_mode) : 1.0f;
+
+							const uint64_t w = (uint64_t)(total_overall_err[i] * err_weight);
+							if (w < best_err)
+							{
+								best_err = w;
+								best_index = i;
+								if (!best_err)
+									break;
+							}
+						}
+					} // i
+				}
+			}
+		}
+
+		const uastc_encode_results& best_results = results[best_index];
+		const uint32_t best_mode = best_results.m_uastc_mode;
+		const astc_block_desc& best_astc_results = best_results.m_astc;
+				
+		// Decode the winner; the hints below are computed against these decoded pixels.
+		color_rgba decoded_uastc_block[4][4];
+		bool success = unpack_uastc(best_mode, best_results.m_common_pattern, best_results.m_solid_color.get_color32(), best_astc_results, (basist::color32 *)&decoded_uastc_block[0][0], false);
+		(void)success;
+		VALIDATE(success);
+
+#if BASISU_VALIDATE_UASTC_ENC
+		// Make sure that the UASTC block unpacks to the same exact pixels as the ASTC block does, using two different decoders.
+		{
+			// Round trip to packed UASTC and back, then decode to pixels.
+			etc_block etc1_blk;
+			memset(&etc1_blk, 0, sizeof(etc1_blk));
+			eac_a8_block etc_eac_a8_blk;
+			memset(&etc_eac_a8_blk, 0, sizeof(etc_eac_a8_blk));
+			etc_eac_a8_blk.m_multiplier = 1;
+
+			basist::uastc_block temp_block;
+			pack_uastc(temp_block, best_results, etc1_blk, 0, etc_eac_a8_blk, false, false);
+			
+			basist::color32 temp_block_unpacked[4][4];
+			success = basist::unpack_uastc(temp_block, (basist::color32 *)temp_block_unpacked, false);
+			VALIDATE(success);
+				
+			// Now round trip to packed ASTC and back, then decode to pixels.
+			uint32_t astc_data[4];
+			
+			if (best_results.m_uastc_mode == UASTC_MODE_INDEX_SOLID_COLOR)
+				pack_astc_solid_block(astc_data, (color32 &)best_results.m_solid_color);
+			else
+			{
+				success = pack_astc_block(astc_data, &best_astc_results, best_results.m_uastc_mode);
+				VALIDATE(success);
+			}
+
+			color_rgba decoded_astc_block[4][4];
+			success = basisu_astc::astc::decompress((uint8_t*)decoded_astc_block, (uint8_t*)&astc_data, false, 4, 4);
+			VALIDATE(success);
+
+			for (uint32_t y = 0; y < 4; y++)
+			{
+				for (uint32_t x = 0; x < 4; x++)
+				{
+					VALIDATE(decoded_astc_block[y][x] == decoded_uastc_block[y][x]);
+					
+					VALIDATE(temp_block_unpacked[y][x].c[0] == decoded_uastc_block[y][x].r);
+					VALIDATE(temp_block_unpacked[y][x].c[1] == decoded_uastc_block[y][x].g);
+					VALIDATE(temp_block_unpacked[y][x].c[2] == decoded_uastc_block[y][x].b);
+					VALIDATE(temp_block_unpacked[y][x].c[3] == decoded_uastc_block[y][x].a);
+				}
+			}
+		}
+#endif
+
+		// Compute BC1 hints
+		bool bc1_hint0 = false, bc1_hint1 = false;
+		if (bc1_hints)
+			compute_bc1_hints(bc1_hint0, bc1_hint1, best_results, block, decoded_uastc_block);
+		
+		eac_a8_block eac_a8_blk;
+		if ((g_uastc_mode_has_alpha[best_mode]) && (best_mode != UASTC_MODE_INDEX_SOLID_COLOR))
+		{
+			// Compute ETC2 hints
+			// Gather the 16 decoded alpha values, row by row.
+			uint8_t decoded_uastc_block_alpha[16];
+			for (uint32_t i = 0; i < 16; i++)
+				decoded_uastc_block_alpha[i] = decoded_uastc_block[i >> 2][i & 3].a;
+
+			uastc_pack_eac_a8_results eac8_a8_results;
+			memset(&eac8_a8_results, 0, sizeof(eac8_a8_results));
+			// The base search radius is 0: only the estimated base is tried for each table.
+			uastc_pack_eac_a8(eac8_a8_results, decoded_uastc_block_alpha, 16, 0, eac_a8_mul_search_rad, eac_a8_table_mask);
+						
+			// All we care about for hinting is the table and multiplier.
+			eac_a8_blk.m_table = eac8_a8_results.m_table;
+			eac_a8_blk.m_multiplier = eac8_a8_results.m_multiplier;
+		}
+		else
+		{
+			memset(&eac_a8_blk, 0, sizeof(eac_a8_blk));
+		}
+
+		// Compute ETC1 hints
+		etc_block etc1_blk;
+		uint32_t etc1_bias = 0;
+		compute_etc1_hints(etc1_blk, etc1_bias, best_results, block, decoded_uastc_block, level, flags);
+
+		// Finally, pack the UASTC block with its hints and we're done.
+		pack_uastc(output_block, best_results, etc1_blk, etc1_bias, eac_a8_blk, bc1_hint0, bc1_hint1);
+
+//		printf(" Packed: ");
+//		for (int i = 0; i < 16; i++)
+//			printf("%X ", output_block.m_bytes[i]);
+//		printf("\n");
+	}
+
+	// Recomputes the BC1, ETC1 and ETC2 EAC hints of an already-encoded UASTC block and repacks it in place.
+	//
+	// pBlock:        the packed block; read when pUnpacked_blk is null, and always overwritten on success.
+	// pBlock_pixels: the 16 source pixels of the block, viewed as color_rgba[4][4].
+	// flags:         the same flag word encode_uastc() takes; the effort level is read from its low 3 bits.
+	// pUnpacked_blk: optional already-unpacked form of *pBlock; when null the block is unpacked here.
+	//
+	// Returns false if the block cannot be unpacked or decoded, in which case *pBlock is left unmodified.
+	static bool uastc_recompute_hints(basist::uastc_block* pBlock, const color_rgba* pBlock_pixels, uint32_t flags, const unpacked_uastc_block *pUnpacked_blk)
+	{
+		unpacked_uastc_block unpacked_blk;
+
+		if (pUnpacked_blk)
+			unpacked_blk = *pUnpacked_blk;
+		else
+		{
+			if (!unpack_uastc(*pBlock, unpacked_blk, false, true))
+				return false;
+		}
+
+		// Decode to pixels; the hints are computed against the decoded block.
+		color_rgba decoded_uastc_block[4][4];
+		if (!unpack_uastc(unpacked_blk, (basist::color32 *)decoded_uastc_block, false))
+			return false;
+
+		// Rebuild the encoder-side result record that the hint helpers and pack_uastc() take.
+		uastc_encode_results results;
+		results.m_uastc_mode = unpacked_blk.m_mode;
+		results.m_common_pattern = unpacked_blk.m_common_pattern;
+		results.m_astc = unpacked_blk.m_astc;
+		results.m_solid_color = unpacked_blk.m_solid_color;
+		results.m_astc_err = 0;
+
+		// Start from the slowest settings, then relax them per level (mirrors encode_uastc()).
+		bool bc1_hints = true;
+		uint32_t eac_a8_mul_search_rad = 3;
+		uint32_t eac_a8_table_mask = UINT32_MAX;
+
+		// Derive the level exactly as encode_uastc() does: low 3 bits, then clamp to the valid range.
+		// This keeps other flag bits from leaking into the level and guarantees that the switch below
+		// and compute_etc1_hints() always see one of the defined levels.
+		const uint32_t level = (uint32_t)clampi((int)(flags & 7), cPackUASTCLevelFastest, cPackUASTCLevelVerySlow);
+
+		switch (level)
+		{
+		case cPackUASTCLevelFastest:
+		{
+			eac_a8_mul_search_rad = 0;
+			eac_a8_table_mask = (1 << 2) | (1 << 8) | (1 << 11) | (1 << 13);
+			bc1_hints = false;
+			break;
+		}
+		case cPackUASTCLevelFaster:
+		{
+			eac_a8_mul_search_rad = 0;
+			eac_a8_table_mask = (1 << 2) | (1 << 8) | (1 << 11) | (1 << 13);
+			break;
+		}
+		case cPackUASTCLevelDefault:
+		{
+			eac_a8_mul_search_rad = 1;
+			eac_a8_table_mask = (1 << 0) | (1 << 2) | (1 << 6) | (1 << 7) | (1 << 8) | (1 << 10) | (1 << 11) | (1 << 13);
+			break;
+		}
+		case cPackUASTCLevelSlower:
+		{
+			eac_a8_mul_search_rad = 2;
+			break;
+		}
+		case cPackUASTCLevelVerySlow:
+		default:
+		{
+			// Keeps the slowest settings.
+			break;
+		}
+		}
+
+		// BC1 hints
+		bool bc1_hint0 = false, bc1_hint1 = false;
+		if (bc1_hints)
+			compute_bc1_hints(bc1_hint0, bc1_hint1, results, (color_rgba (*)[4])pBlock_pixels, decoded_uastc_block);
+
+		// ETC2 EAC hints: only the table and multiplier are needed, and only for non-solid modes with alpha.
+		const uint32_t best_mode = unpacked_blk.m_mode;
+		eac_a8_block eac_a8_blk;
+		if ((g_uastc_mode_has_alpha[best_mode]) && (best_mode != UASTC_MODE_INDEX_SOLID_COLOR))
+		{
+			uint8_t decoded_uastc_block_alpha[16];
+			for (uint32_t i = 0; i < 16; i++)
+				decoded_uastc_block_alpha[i] = decoded_uastc_block[i >> 2][i & 3].a;
+
+			uastc_pack_eac_a8_results eac8_a8_results;
+			memset(&eac8_a8_results, 0, sizeof(eac8_a8_results));
+			uastc_pack_eac_a8(eac8_a8_results, decoded_uastc_block_alpha, 16, 0, eac_a8_mul_search_rad, eac_a8_table_mask);
+
+			eac_a8_blk.m_table = eac8_a8_results.m_table;
+			eac_a8_blk.m_multiplier = eac8_a8_results.m_multiplier;
+		}
+		else
+		{
+			memset(&eac_a8_blk, 0, sizeof(eac_a8_blk));
+		}
+
+		// ETC1 hints
+		etc_block etc1_blk;
+		uint32_t etc1_bias = 0;
+		compute_etc1_hints(etc1_blk, etc1_bias, results, (color_rgba (*)[4])pBlock_pixels, decoded_uastc_block, level, flags);
+
+		// Repack over the input block with the fresh hints.
+		pack_uastc(*pBlock, results, etc1_blk, etc1_bias, eac_a8_blk, bc1_hint0, bc1_hint1);
+		return true;
+	}
+
+	// Two bytes per UASTC mode, indexed by mode number (TOTAL_UASTC_MODES rows).
+	// NOTE(review): presumably { bit offset of the first selector bit, number of selector bits } in the packed
+	// 128-bit block - every non-zero row sums to 128 - but confirm against the code that reads this table.
+	// Row 8 is { 0, 0 }, presumably because that mode carries no selectors.
+	static const uint8_t g_uastc_mode_selector_bits[TOTAL_UASTC_MODES][2] =
+	{
+		{ 65, 63 }, { 69, 31 }, { 73, 46 }, { 89, 29 },
+		{ 89, 30 }, { 68, 47 }, { 66, 62 }, { 89, 30 },
+		{ 0, 0 }, { 97, 30 }, { 65, 63 }, { 66, 62 },
+		{ 81, 47 }, { 94, 30 }, { 92, 31 }, { 62, 63 },
+		{ 98, 30 }, { 61, 62 }, { 49, 79 }
+	};
+
+	// Writes the low num_bits bits of val into pBytes starting at bit offset cur_ofs, least significant bit first.
+	// Bits of the destination outside the written range are preserved.
+	// num_bits must be <= 64, and val must fit in num_bits bits (both are asserted).
+	// Returns the bit offset immediately after the written field.
+	static inline uint32_t set_block_bits(uint8_t* pBytes, uint64_t val, uint32_t num_bits, uint32_t cur_ofs)
+	{
+		assert(num_bits <= 64);
+		assert((num_bits == 64) || (val < (1ULL << num_bits)));
+
+		// One bit set for every field bit that still has to be written.
+		uint64_t mask = UINT64_MAX;
+		if (num_bits != 64)
+			mask = (1ULL << num_bits) - 1;
+
+		while (num_bits != 0)
+		{
+			// Write as many bits as fit in the remainder of the current byte.
+			const uint32_t bit_in_byte = cur_ofs & 7U;
+			const uint32_t n = basisu::minimum<uint32_t>(8U - bit_in_byte, num_bits);
+
+			uint8_t& dst = pBytes[cur_ofs >> 3];
+			dst &= ~static_cast<uint8_t>(mask << bit_in_byte);
+			dst |= static_cast<uint8_t>(val << bit_in_byte);
+
+			// Drop the bits just written and advance.
+			val >>= n;
+			mask >>= n;
+			num_bits -= n;
+			cur_ofs += n;
+		}
+
+		return cur_ofs;
+	}
+
+	// Extra distance cost added by compute_match_cost_estimate() for distances below 512, indexed directly by the distance.
+	// NOTE(review): the name suggests these are deflate extra-bit counts as used by a "tdefl" compressor - confirm.
+	static const uint8_t g_tdefl_small_dist_extra[512] =
+	{
+		0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+		6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+		6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+		7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+		7, 7, 7, 7, 7, 7, 7, 7
+	};
+
+	// Extra distance cost added by compute_match_cost_estimate() for distances of 512 and above,
+	// indexed by (distance capped at 32767) >> 8.
+	// NOTE(review): the name suggests these are deflate extra-bit counts as used by a "tdefl" compressor - confirm.
+	static const uint8_t g_tdefl_large_dist_extra[128] =
+	{
+		0, 0, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+		12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+		13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13
+	};
+
+	// Returns a rough estimate, in bits, of the cost of coding an LZ match at byte distance `dist`.
+	// The estimate is a fixed 7 bits for the length plus 5 bits for the distance, plus a table-driven
+	// extra cost that grows with the distance.
+	static inline uint32_t compute_match_cost_estimate(uint32_t dist)
+	{
+		// Fixed part: 7 (length) + 5 (distance).
+		uint32_t total_cost = 7 + 5;
+
+		if (dist >= 512)
+		{
+			// The large table is indexed by the distance's high bits, with the distance clamped to 32767.
+			const uint32_t clamped_dist = basisu::minimum<uint32_t>(dist, 32767);
+			total_cost += g_tdefl_large_dist_extra[clamped_dist >> 8];
+
+			// Distances past the table's range pay one more bit per halving needed to get below 32768.
+			for (; dist >= 32768; dist >>= 1)
+				total_cost++;
+		}
+		else
+		{
+			total_cost += g_tdefl_small_dist_extra[dist & 511];
+		}
+
+		return total_cost;
+	}
+
+	// Key identifying a run of selector bits: the bit offset at which the selectors start (m_ofs)
+	// and up to 64 bits of selector data (m_sel). Used as the key of the RDO selector history map.
+	struct selector_bitsequence
+	{
+		uint64_t m_sel;
+		uint32_t m_ofs;
+
+		// Leaves both members uninitialized.
+		selector_bitsequence() { }
+
+		selector_bitsequence(uint32_t bit_ofs, uint64_t sel) : m_sel(sel), m_ofs(bit_ofs) { }
+
+		bool operator== (const selector_bitsequence& other) const
+		{
+			if (m_ofs != other.m_ofs)
+				return false;
+
+			return m_sel == other.m_sel;
+		}
+
+		// Orders by bit offset first, then by selector bits.
+		bool operator< (const selector_bitsequence& other) const
+		{
+			if (m_ofs != other.m_ofs)
+				return m_ofs < other.m_ofs;
+
+			return m_sel < other.m_sel;
+		}
+	};
+
+	// Hash functor that lets selector_bitsequence be used as a std::unordered_map key.
+	struct selector_bitsequence_hash
+	{
+		std::size_t operator()(selector_bitsequence const& s) const noexcept
+		{
+			// Hash only the two data members, copied into a tightly packed buffer. Hashing the raw object
+			// (sizeof(s) bytes) would also read the tail padding that follows m_ofs; those bytes are
+			// indeterminate, so two keys that compare equal could produce different hashes.
+			uint8_t key_bytes[sizeof(s.m_sel) + sizeof(s.m_ofs)];
+			memcpy(key_bytes, &s.m_sel, sizeof(s.m_sel));
+			memcpy(key_bytes + sizeof(s.m_sel), &s.m_ofs, sizeof(s.m_ofs));
+
+			return static_cast<std::size_t>(hash_hsieh(key_bytes, sizeof(key_bytes)) ^ s.m_sel);
+		}
+	};
+
+	// Accumulates a running count, sum and sum of squares of uint32_t samples, from which the
+	// average, standard deviation and variance can be queried.
+	class tracked_stat
+	{
+	public:
+		tracked_stat() { clear(); }
+
+		// Resets the accumulator to the empty state.
+		void clear() { m_num = 0; m_total = 0; m_total2 = 0; }
+
+		// Adds one sample. The square is computed in 64 bits so samples above 65535 don't wrap before being accumulated.
+		void update(uint32_t val) { m_num++; m_total += val; m_total2 += static_cast<uint64_t>(val) * val; }
+
+		tracked_stat& operator += (uint32_t val) { update(val); return *this; }
+
+		uint32_t get_number_of_values() const { return m_num; }
+		uint64_t get_total() const { return m_total; }
+		uint64_t get_total2() const { return m_total2; }
+
+		// All three return 0 when no samples have been added.
+		float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }
+		// sqrt(n * sum(x^2) - sum(x)^2) / n
+		float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; }
+		float get_variance() const { float s = get_std_dev(); return s * s; }
+
+	private:
+		uint32_t m_num;
+		uint64_t m_total;
+		uint64_t m_total2;
+	};
+		
+	// Applies selector RDO, in place, to blocks [first_index, last_index) of pBlocks.
+	// For each non-solid-color block whose averaged UASTC/BC7 RMS error is below params.m_skip_block_rms_thresh, this tries
+	// substituting the selector bits of up to (params.m_lz_dict_size / sizeof(uastc_block)) preceding blocks of the same range,
+	// and keeps the candidate minimizing (MS error * smooth block error scale) + (estimated LZ bits * params.m_lambda).
+	// pBlock_pixels holds 16 source pixels per block, indexed by block.
+	// total_skipped/total_refined/total_modified/total_smooth are incremented (never reset) as blocks are classified.
+	// Returns false if a block fails to unpack, to transcode to BC7, or to have its hints recomputed.
+	static bool uastc_rdo_blocks(uint32_t first_index, uint32_t last_index, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params& params, uint32_t flags, 
+		uint32_t &total_skipped, uint32_t &total_refined, uint32_t &total_modified, uint32_t &total_smooth)
+	{
+		debug_printf("uastc_rdo_blocks: Processing blocks %u to %u\n", first_index, last_index);
+
+		// Size of the look-back window, in blocks (always at least 1).
+		const int total_blocks_to_check = basisu::maximum<uint32_t>(1U, params.m_lz_dict_size / sizeof(basist::uastc_block));
+		const bool perceptual = false;
+
+		// Maps a selector bit pattern to the index of the most recent block in which it was seen.
+		std::unordered_map<selector_bitsequence, uint32_t, selector_bitsequence_hash> selector_history;
+						
+		for (uint32_t block_index = first_index; block_index < last_index; block_index++)
+		{
+			const basist::uastc_block& blk = pBlocks[block_index];
+			const color_rgba* pPixels = &pBlock_pixels[16 * block_index];
+
+			unpacked_uastc_block unpacked_blk;
+			if (!unpack_uastc(blk, unpacked_blk, false, true))
+				return false;
+
+			// Solid color blocks are left untouched.
+			const uint32_t block_mode = unpacked_blk.m_mode;
+			if (block_mode == UASTC_MODE_INDEX_SOLID_COLOR)
+				continue;
+
+			// Per-channel statistics over the block's 16 source pixels.
+			tracked_stat r_stats, g_stats, b_stats, a_stats;
+
+			for (uint32_t i = 0; i < 16; i++)
+			{
+				r_stats.update(pPixels[i].r);
+				g_stats.update(pPixels[i].g);
+				b_stats.update(pPixels[i].b);
+				a_stats.update(pPixels[i].a);
+			}
+
+			const float max_std_dev = basisu::maximum<float>(basisu::maximum<float>(basisu::maximum(r_stats.get_std_dev(), g_stats.get_std_dev()), b_stats.get_std_dev()), a_stats.get_std_dev());
+
+			// Smoothness factor in [0,1], squared, then used to blend between the maximum error scale and 1.0.
+			// (Assumes lerp(a, b, t) returns a at t=0 and b at t=1, so low-deviation blocks get the larger scale -- confirm.)
+			float yl = clamp<float>(max_std_dev / params.m_max_smooth_block_std_dev, 0.0f, 1.0f);
+			yl = yl * yl;
+			const float smooth_block_error_scale = lerp<float>(params.m_smooth_block_max_error_scale, 1.0f, yl);
+			if (smooth_block_error_scale > 1.0f)
+				total_smooth++;
+
+			color_rgba decoded_uastc_block[4][4];
+			if (!unpack_uastc(unpacked_blk, (basist::color32*)decoded_uastc_block, false))
+				return false;
+
+			uint64_t uastc_err = 0;
+			for (uint32_t i = 0; i < 16; i++)
+				uastc_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_uastc_block)[i], true);
+
+			// Transcode to BC7
+			bc7_optimization_results b7_results;
+			if (!transcode_uastc_to_bc7(unpacked_blk, b7_results))
+				return false;
+
+			basist::bc7_block b7_block;
+			basist::encode_bc7_block(&b7_block, &b7_results);
+
+			color_rgba decoded_b7_blk[4][4];
+			unpack_block(texture_format::cBC7, &b7_block, &decoded_b7_blk[0][0]);
+						
+			uint64_t bc7_err = 0;
+			for (uint32_t i = 0; i < 16; i++)
+				bc7_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_b7_blk)[i], true);
+
+			// The block's error is the average of its UASTC and transcoded BC7 errors.
+			uint64_t cur_err = (uastc_err + bc7_err) / 2;
+
+			// Divide by 16*4 to get the mean squared error; its square root is the RMS error.
+			const float cur_ms_err = (float)cur_err * (1.0f / 64.0f);
+			const float cur_rms_err = sqrt(cur_ms_err);
+
+			const uint32_t first_sel_bit = g_uastc_mode_selector_bits[block_mode][0];
+			const uint32_t total_sel_bits = g_uastc_mode_selector_bits[block_mode][1];
+			assert(first_sel_bit + total_sel_bits <= 128);
+			assert(total_sel_bits > 0);
+
+			// Only the first min(64, total_sel_bits) selector bits are used as the history key.
+			uint32_t cur_bit_offset = first_sel_bit;
+			uint64_t cur_sel_bits = read_bits((const uint8_t*)&blk, cur_bit_offset, basisu::minimum(64U, total_sel_bits));
+
+			if (cur_rms_err >= params.m_skip_block_rms_thresh)
+			{
+				auto cur_search_res = selector_history.insert(std::make_pair(selector_bitsequence(first_sel_bit, cur_sel_bits), block_index));
+
+				// Block already has too much error, so don't mess with it.
+				if (!cur_search_res.second)
+					(*cur_search_res.first).second = block_index;
+
+				total_skipped++;
+				continue;
+			}
+
+			int cur_bits;
+			auto cur_find_res = selector_history.find(selector_bitsequence(first_sel_bit, cur_sel_bits));
+			if (cur_find_res == selector_history.end())
+			{
+				// Wasn't found - wildly estimate literal cost
+				//cur_bits = (total_sel_bits * 5) / 4;
+				cur_bits = (total_sel_bits * params.m_lz_literal_cost) / 100;
+			}
+			else
+			{
+				// Was found - wildly estimate match cost
+				uint32_t match_block_index = cur_find_res->second;
+				const int block_dist_in_bytes = (block_index - match_block_index) * 16;
+				cur_bits = compute_match_cost_estimate(block_dist_in_bytes);
+			}
+
+			// Look-back window, limited to blocks of this call's range.
+			int first_block_to_check = basisu::maximum<int>(first_index, block_index - total_blocks_to_check);
+			int last_block_to_check = block_index - 1;
+
+			basist::uastc_block best_block(blk);
+			uint32_t best_block_index = block_index;
+
+			// Rate-distortion score of the unmodified block; a candidate must beat this to be accepted.
+			float best_t = cur_ms_err * smooth_block_error_scale + cur_bits * params.m_lambda;
+
+			// Now scan through previous blocks, insert their selector bit patterns into the current block, and find 
+			// selector bit patterns which don't increase the overall block error too much.
+			for (int prev_block_index = last_block_to_check; prev_block_index >= first_block_to_check; --prev_block_index)
+			{
+				const basist::uastc_block& prev_blk = pBlocks[prev_block_index];
+
+				uint32_t bit_offset = first_sel_bit;
+				uint64_t sel_bits = read_bits((const uint8_t*)&prev_blk, bit_offset, basisu::minimum(64U, total_sel_bits));
+
+				int match_block_index = prev_block_index;
+				auto res = selector_history.find(selector_bitsequence(first_sel_bit, sel_bits));
+				if (res != selector_history.end())
+					match_block_index = res->second;
+				// Have we already checked this bit pattern? If so then skip this block.
+				if (match_block_index > prev_block_index)
+					continue;
+
+				unpacked_uastc_block unpacked_prev_blk;
+				if (!unpack_uastc(prev_blk, unpacked_prev_blk, false, true))
+					return false;
+
+				// Build the trial block: the current block with the previous block's selector bits copied in.
+				basist::uastc_block trial_blk(blk);
+
+				set_block_bits((uint8_t*)&trial_blk, sel_bits, basisu::minimum(64U, total_sel_bits), first_sel_bit);
+
+				// Selector fields longer than 64 bits are copied in two pieces.
+				if (total_sel_bits > 64)
+				{
+					sel_bits = read_bits((const uint8_t*)&prev_blk, bit_offset, total_sel_bits - 64U);
+
+					set_block_bits((uint8_t*)&trial_blk, sel_bits, total_sel_bits - 64U, first_sel_bit + basisu::minimum(64U, total_sel_bits));
+				}
+
+				// A trial block that doesn't unpack is simply not a usable candidate.
+				unpacked_uastc_block unpacked_trial_blk;
+				if (!unpack_uastc(trial_blk, unpacked_trial_blk, false, true))
+					continue;
+
+				color_rgba decoded_trial_uastc_block[4][4];
+				if (!unpack_uastc(unpacked_trial_blk, (basist::color32*)decoded_trial_uastc_block, false))
+					continue;
+
+				uint64_t trial_uastc_err = 0;
+				for (uint32_t i = 0; i < 16; i++)
+					trial_uastc_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_trial_uastc_block)[i], true);
+
+				// Transcode trial to BC7, compute error
+				bc7_optimization_results trial_b7_results;
+				if (!transcode_uastc_to_bc7(unpacked_trial_blk, trial_b7_results))
+					return false;
+
+				basist::bc7_block trial_b7_block;
+				basist::encode_bc7_block(&trial_b7_block, &trial_b7_results);
+
+				color_rgba decoded_trial_b7_blk[4][4];
+				unpack_block(texture_format::cBC7, &trial_b7_block, &decoded_trial_b7_blk[0][0]);
+
+				uint64_t trial_bc7_err = 0;
+				for (uint32_t i = 0; i < 16; i++)
+					trial_bc7_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_trial_b7_blk)[i], true);
+
+				uint64_t trial_err = (trial_uastc_err + trial_bc7_err) / 2;
+
+				const float trial_ms_err = (float)trial_err * (1.0f / 64.0f);
+				const float trial_rms_err = sqrtf(trial_ms_err);
+
+				// Reject candidates that raise the RMS error beyond the allowed ratio.
+				if (trial_rms_err > cur_rms_err * params.m_max_allowed_rms_increase_ratio)
+					continue;
+
+				const int block_dist_in_bytes = (block_index - match_block_index) * 16;
+				const int match_bits = compute_match_cost_estimate(block_dist_in_bytes);
+
+				float t = trial_ms_err * smooth_block_error_scale + match_bits * params.m_lambda;
+				if (t < best_t)
+				{
+					best_t = t;
+					best_block_index = prev_block_index;
+
+					best_block = trial_blk;
+				}
+
+			} // prev_block_index
+
+			if (best_block_index != block_index)
+			{
+				total_modified++;
+
+				unpacked_uastc_block unpacked_best_blk;
+				if (!unpack_uastc(best_block, unpacked_best_blk, false, false))
+					return false;
+
+				if ((params.m_endpoint_refinement) && (block_mode == 0))
+				{
+					// Attempt to refine mode 0 block's endpoints, using the new selectors. This doesn't help much, but it does help.
+					// TODO: We could do this with the other modes too.
+					color_rgba decoded_best_uastc_block[4][4];
+					if (!unpack_uastc(unpacked_best_blk, (basist::color32*)decoded_best_uastc_block, false))
+						return false;
+
+					// Compute the block's current error (with the modified selectors).
+					uint64_t best_uastc_err = 0;
+					for (uint32_t i = 0; i < 16; i++)
+						best_uastc_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_best_uastc_block)[i], true);
+
+					bc7enc_compress_block_params comp_params;
+					memset(&comp_params, 0, sizeof(comp_params));
+					comp_params.m_max_partitions_mode1 = 64;
+					comp_params.m_least_squares_passes = 1;
+					comp_params.m_weights[0] = 1;
+					comp_params.m_weights[1] = 1;
+					comp_params.m_weights[2] = 1;
+					comp_params.m_weights[3] = 1;
+					comp_params.m_uber_level = 0;
+
+					uastc_encode_results results;
+					uint32_t total_results = 0;
+					astc_mode0_or_18(0, (color_rgba(*)[4])pPixels, &results, total_results, comp_params, unpacked_best_blk.m_astc.m_weights);
+					assert(total_results == 1);
+
+					// See if the overall error has actually gone down.
+
+					color_rgba decoded_trial_uastc_block[4][4];
+					bool success = unpack_uastc(results.m_uastc_mode, results.m_common_pattern, results.m_solid_color.get_color32(), results.m_astc, (basist::color32*) & decoded_trial_uastc_block[0][0], false);
+					assert(success);
+					
+					BASISU_NOTE_UNUSED(success);
+
+					uint64_t trial_uastc_err = 0;
+					for (uint32_t i = 0; i < 16; i++)
+						trial_uastc_err += color_distance(perceptual, pPixels[i], ((color_rgba*)decoded_trial_uastc_block)[i], true);
+
+					if (trial_uastc_err < best_uastc_err)
+					{
+						// The error went down, so accept the new endpoints.
+
+						// Ensure the selectors haven't changed, otherwise we'll invalidate the LZ matches.
+						for (uint32_t i = 0; i < 16; i++)
+							assert(unpacked_best_blk.m_astc.m_weights[i] == results.m_astc.m_weights[i]);
+
+						unpacked_best_blk.m_astc = results.m_astc;
+
+						total_refined++;
+					}
+				} // if ((params.m_endpoint_refinement) && (block_mode == 0))
+
+				// The selectors have changed, so go recompute the block hints.
+				if (!uastc_recompute_hints(&best_block, pPixels, flags, &unpacked_best_blk))
+					return false;
+
+				// Write the modified block
+				pBlocks[block_index] = best_block;
+			
+			} // if (best_block_index != block_index)
+
+			// Record this block's final selector bits as most recently seen at block_index.
+			{
+				uint32_t bit_offset = first_sel_bit;
+				uint64_t sel_bits = read_bits((const uint8_t*)&best_block, bit_offset, basisu::minimum(64U, total_sel_bits));
+
+				auto res = selector_history.insert(std::make_pair(selector_bitsequence(first_sel_bit, sel_bits), block_index));
+				if (!res.second)
+					(*res.first).second = block_index;
+			}
+
+		} // block_index
+
+		return true;
+	}
+				
+	// This function implements a basic form of rate distortion optimization (RDO) for UASTC. 
+	// It only changes selectors and then updates the hints. It uses very approximate LZ bitprice estimation.
+	// There's A LOT that can be done better in here, but it's a start.
+	// One nice advantage of the method used here is that it works for any input, no matter which or how many modes it uses.
+	//
+	// num_blocks, pBlocks: The UASTC blocks to process; they are modified in place.
+	// pBlock_pixels: Source pixels, 16 per block.
+	// params: RDO settings. m_max_allowed_rms_increase_ratio must be > 1, m_lz_dict_size > 0 and m_lambda > 0 (asserted).
+	// flags: Passed through to the hint recomputation of modified blocks.
+	// pJob_pool, total_jobs: The blocks are split into jobs of (num_blocks / total_jobs) blocks only when pJob_pool is non-null,
+	// total_jobs > 1 and each job would get more than 8 blocks; otherwise all blocks are processed by a single direct call.
+	// Each job only looks back at blocks within its own range.
+	// Returns true only if every block range was processed successfully.
+	bool uastc_rdo(uint32_t num_blocks, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params& params, uint32_t flags, job_pool* pJob_pool, uint32_t total_jobs)
+	{
+		assert(params.m_max_allowed_rms_increase_ratio > 1.0f);
+		assert(params.m_lz_dict_size > 0);
+		assert(params.m_lambda > 0.0f);
+
+		uint32_t total_skipped = 0, total_modified = 0, total_refined = 0, total_smooth = 0;
+
+		uint32_t blocks_per_job = total_jobs ? (num_blocks / total_jobs) : 0;
+
+		// Guards all_succeeded and the total_* counters, which are shared by the jobs.
+		std::mutex stat_mutex;
+
+		bool status = false;
+
+		if ((!pJob_pool) || (total_jobs <= 1) || (blocks_per_job <= 8))
+		{
+			status = uastc_rdo_blocks(0, num_blocks, pBlocks, pBlock_pixels, params, flags, total_skipped, total_refined, total_modified, total_smooth);
+		}
+		else
+		{
+			bool all_succeeded = true;
+
+			for (uint32_t block_index_iter = 0; block_index_iter < num_blocks; block_index_iter += blocks_per_job)
+			{
+				const uint32_t first_index = block_index_iter;
+				const uint32_t last_index = minimum<uint32_t>(num_blocks, block_index_iter + blocks_per_job);
+
+				// When building for Emscripten the job wrapper below is compiled out, so the job body runs inline in this loop.
+#ifndef __EMSCRIPTEN__
+				pJob_pool->add_job([first_index, last_index, pBlocks, pBlock_pixels, &params, flags, &total_skipped, &total_modified, &total_refined, &total_smooth, &all_succeeded, &stat_mutex] {
+#endif
+
+					// Accumulate into job-local counters, then merge them into the shared totals under the mutex.
+					uint32_t job_skipped = 0, job_modified = 0, job_refined = 0, job_smooth = 0;
+
+					bool status = uastc_rdo_blocks(first_index, last_index, pBlocks, pBlock_pixels, params, flags, job_skipped, job_refined, job_modified, job_smooth);
+
+					{
+						std::lock_guard<std::mutex> lck(stat_mutex);
+						
+						all_succeeded = all_succeeded && status;
+						total_skipped += job_skipped;
+						total_modified += job_modified;
+						total_refined += job_refined;
+						total_smooth += job_smooth;
+					}
+
+#ifndef __EMSCRIPTEN__
+					}
+				);
+#endif
+
+			} // block_index_iter
+
+			// The jobs capture locals of this function by reference, so they must all finish before it returns.
+#ifndef __EMSCRIPTEN__
+			pJob_pool->wait_for_all();
+#endif
+
+			status = all_succeeded;
+		}
+
+		debug_printf("uastc_rdo: Total modified: %3.2f%%, total skipped: %3.2f%%, total refined: %3.2f%%, total smooth: %3.2f%%\n", total_modified * 100.0f / num_blocks, total_skipped * 100.0f / num_blocks, total_refined * 100.0f / num_blocks, total_smooth * 100.0f / num_blocks);
+				
+		return status;
+	}
+} // namespace basisu
+
+
+
+
+
diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.h b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
new file mode 100644
index 0000000000..ba39a558b3
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.h
@@ -0,0 +1,140 @@
+// basisu_uastc_enc.h
+// Copyright (C) 2019-2021 Binomial LLC. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "basisu_etc.h"
+
+#include "../transcoder/basisu_transcoder_uastc.h"
+
+namespace basisu
+{
+	const uint32_t TOTAL_PACK_UASTC_LEVELS = 5;
+
+	// Flags accepted by encode_uastc() and uastc_rdo(): a compression level (one of the cPackUASTCLevel* values) plus optional flag bits.
+	enum
+	{
+		// Fastest is the lowest quality, although it's still substantially higher quality vs. BC1/ETC1. It supports 5 modes.
+		// The output may be somewhat blocky because this setting doesn't support 2/3-subset UASTC modes, but it should be less blocky vs. BC1/ETC1.
+		// This setting doesn't write BC1 hints, so BC1 transcoding will be slower. 
+		// Transcoded ETC1 quality will be lower because it only considers 2 hints out of 32.
+		// Avg. 43.45 dB
+		cPackUASTCLevelFastest = 0,
+		
+		// Faster is ~3x slower than fastest. It supports 9 modes.
+		// Avg. 46.49 dB
+		cPackUASTCLevelFaster = 1,
+		
+		// Default is ~5.5x slower than fastest. It supports 14 modes.
+		// Avg. 47.47 dB
+		cPackUASTCLevelDefault = 2,
+
+		// Slower is ~14.5x slower than fastest. It supports all 18 modes.
+		// Avg. 48.01 dB
+		cPackUASTCLevelSlower = 3,
+
+		// VerySlow is ~200x slower than fastest. 
+		// The best quality the codec is capable of, but you'll need to be patient or have a lot of cores.
+		// Avg. 48.24 dB
+		cPackUASTCLevelVerySlow = 4,
+
+		// NOTE(review): this mask (0xF) also covers bit 3, which is the value of cPackUASTCFavorUASTCError (8) below,
+		// so (flags & cPackUASTCLevelMask) is not a pure level when that flag is set -- confirm how callers extract the level.
+		cPackUASTCLevelMask = 0xF,
+
+		// By default the encoder tries to strike a balance between UASTC and transcoded BC7 quality.
+		// These flags allow you to favor only optimizing for lowest UASTC error, or lowest BC7 error.
+		cPackUASTCFavorUASTCError = 8,
+		cPackUASTCFavorBC7Error = 16,
+						
+		cPackUASTCETC1FasterHints = 64,
+		cPackUASTCETC1FastestHints = 128,
+		cPackUASTCETC1DisableFlipAndIndividual = 256,
+		
+		// Favor UASTC modes 0 and 10 more than the others (this is experimental, it's useful for RDO compression)
+		cPackUASTCFavorSimplerModes = 512, 
+	};
+
+	// Encodes one 4x4 block of pixels to a UASTC block.
+	// pRGBAPixels: Pointer to source 4x4 block of RGBA pixels (R first in memory).
+	// output_block: Reference to destination UASTC block.
+	// flags: One of the cPackUASTCLevel* values, controlling the compression speed vs. quality tradeoff.
+	// Presumably may be ORed with the other cPackUASTC* flag bits -- confirm against the definition.
+	void encode_uastc(const uint8_t* pRGBAPixels, basist::uastc_block& output_block, uint32_t flags = cPackUASTCLevelDefault);
+
+	// One candidate encoding of a block, as produced by the encoder's per-mode routines.
+	// The mode, common pattern, solid color and ASTC description together can be unpacked to pixels with unpack_uastc().
+	// None of the members are initialized by default.
+	struct uastc_encode_results
+	{
+		uint32_t m_uastc_mode;
+		uint32_t m_common_pattern;
+		basist::astc_block_desc m_astc;
+		color_rgba m_solid_color;
+		// NOTE(review): presumably the error of the ASTC encoding in m_astc -- confirm against the encoder.
+		uint64_t m_astc_err;
+	};
+			  
+	// Declaration only (the definition is not in this header). Presumably packs `result`, together with the supplied
+	// ETC1, EAC A8 and BC1 hint inputs, into the UASTC block `blk` -- confirm against the definition.
+	void pack_uastc(basist::uastc_block& blk, const uastc_encode_results& result, const etc_block& etc1_blk, uint32_t etc1_bias, const eac_a8_block& etc_eac_a8_blk, bool bc1_hint0, bool bc1_hint1);
+
+	const uint32_t UASCT_RDO_DEFAULT_LZ_DICT_SIZE = 4096;
+
+	const float UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO = 10.0f;
+	const float UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH = 8.0f;
+	
+	// The RDO encoder computes a smoothness factor, from [0,1], for each block. To do this it computes each block's maximum component standard deviation, then it divides this by this factor and clamps the result.
+	// Larger values will result in more blocks being protected from too much distortion.
+	const float UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV = 18.0f;
+	
+	// The RDO encoder can artificially boost the error of smooth blocks, in order to suppress distortions on smooth areas of the texture.
+	// The encoder will use this value as the maximum error scale to use on smooth blocks. The larger this value, the better smooth blocks will look. Set to 1.0 to disable this completely.
+	const float UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE = 10.0f;
+
+	// Settings controlling the UASTC RDO post-process (see uastc_rdo()). A default-constructed object holds the defaults.
+	struct uastc_rdo_params
+	{
+		uastc_rdo_params() { clear(); }
+
+		// Resets every setting to its default value.
+		void clear()
+		{
+			m_lz_dict_size = UASCT_RDO_DEFAULT_LZ_DICT_SIZE;
+			m_lz_literal_cost = 100;
+
+			m_lambda = 0.5f;
+			m_endpoint_refinement = true;
+
+			m_skip_block_rms_thresh = UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH;
+			m_max_allowed_rms_increase_ratio = UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO;
+
+			m_smooth_block_max_error_scale = UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE;
+			m_max_smooth_block_std_dev = UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV;
+		}
+
+		// Size, in bytes, of the LZ dictionary to simulate. Larger values make the encoder slower, but give higher quality per LZ compressed bit.
+		uint32_t m_lz_dict_size;
+
+		// The post-processor tries to minimize distortion + rate * m_lambda, where rate is the approximate number of LZ bits
+		// and distortion is the scaled MS error. Larger values favor lower rate, smaller values favor lower distortion. 0=minimal distortion.
+		float m_lambda;
+
+		// How much a block's RMS error may increase before a trial is rejected. 1.0=no increase allowed, 1.05=5% increase allowed, etc.
+		float m_max_allowed_rms_increase_ratio;
+
+		// Blocks with this much RMS error or more are completely skipped by the RDO encoder.
+		float m_skip_block_rms_thresh;
+
+		// If true, the post-process will attempt to refine the endpoints of blocks with modified selectors.
+		bool m_endpoint_refinement;
+
+		// A block's largest per-channel standard deviation is divided by this value and clamped to [0,1] to get its smoothness factor.
+		float m_max_smooth_block_std_dev;
+
+		// Maximum error scale applied to smooth blocks. Set to 1.0 to disable the boost.
+		float m_smooth_block_max_error_scale;
+
+		// Estimated cost of selector bits that have no earlier match, as a percentage of the selector bit count (100 = one bit each).
+		uint32_t m_lz_literal_cost;
+	};
+
+	// Runs the RDO post-process over an array of already-encoded UASTC blocks, modifying them in place.
+	// num_blocks, pBlocks: Number of blocks and pointer to UASTC blocks to process.
+	// pBlock_pixels: Pointer to an array of 4x4 blocks containing the original texture pixels. This is NOT a raster image, but a pointer to individual 4x4 blocks.
+	// params: RDO settings, see uastc_rdo_params.
+	// flags: Pass in the same flags used to encode the UASTC blocks. The flags are used to reencode the transcode hints in the same way.
+	// pJob_pool, total_jobs: Optional job pool and job count used to split the work; with the defaults all blocks are processed by a single call.
+	// Returns false if processing failed.
+	bool uastc_rdo(uint32_t num_blocks, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params &params, uint32_t flags = cPackUASTCLevelDefault, job_pool* pJob_pool = nullptr, uint32_t total_jobs = 0);
+} // namespace basisu
diff --git a/thirdparty/basis_universal/encoder/cppspmd_flow.h b/thirdparty/basis_universal/encoder/cppspmd_flow.h
new file mode 100644
index 0000000000..f6930476aa
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/cppspmd_flow.h
@@ -0,0 +1,590 @@
+// Do not include this header directly.
+// Control flow functionality in common between all the headers.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef _DEBUG
+// Debug-only sanity check on the execution masks: asserts that andnot(m_kernel_exec, m_exec) has no lanes set.
+// NOTE(review): presumably andnot(a, b) is (~a & b), i.e. no lane may be active in m_exec unless it is also active in m_kernel_exec -- confirm.
+CPPSPMD_FORCE_INLINE void spmd_kernel::check_masks()
+{
+	assert(!any(andnot(m_kernel_exec, m_exec)));
+}
+#endif
+
+// SPMD "break": turns off every lane of the current execution mask. Must be used inside a loop (asserted in debug builds).
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_break()
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+
+	m_exec = exec_mask::all_off();
+}
+
+// SPMD "continue": records the currently active lanes in m_continue_mask, then turns them all off.
+// Must be used inside a loop (asserted in debug builds).
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_continue()
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+
+	// Kill any active lanes, and remember which lanes were active so we can re-enable them at the end of the loop body.
+	m_continue_mask = m_continue_mask | m_exec;
+	m_exec = exec_mask::all_off();
+}
+
+// SPMD "return": removes the currently active lanes from the kernel-wide mask as well as from the current mask.
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_return()
+{
+	// Permanently kill all active lanes
+	m_kernel_exec = andnot(m_exec, m_kernel_exec);
+	m_exec = exec_mask::all_off();
+}
+			
+// Runs unmaskedBody with every lane enabled in both execution masks. Afterwards each mask is restored to its
+// previous value, minus any lanes the body turned off.
+template<typename UnmaskedBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_unmasked(const UnmaskedBody& unmaskedBody)
+{
+	exec_mask saved_exec = m_exec;
+	exec_mask saved_kernel_exec = m_kernel_exec;
+
+	// Enable all lanes for the duration of the body.
+	m_kernel_exec = exec_mask::all_on();
+	m_exec = exec_mask::all_on();
+
+	unmaskedBody();
+
+	// Keep only the lanes that were active before and are still active after the body.
+	m_kernel_exec = m_kernel_exec & saved_kernel_exec;
+	m_exec = m_exec & saved_exec;
+
+	check_masks();
+}
+
+// Scoped helper behind SPMD_UNMASKED_BEGIN/END: the constructor saves the kernel's two execution masks and enables all lanes;
+// the destructor ANDs each mask with its saved value, so lanes disabled inside the scope stay disabled.
+struct scoped_unmasked_restorer
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_orig_exec, m_orig_kernel_exec;
+				
+	CPPSPMD_FORCE_INLINE scoped_unmasked_restorer(spmd_kernel *pKernel) : 
+		m_pKernel(pKernel), 
+		m_orig_exec(pKernel->m_exec),
+		m_orig_kernel_exec(pKernel->m_kernel_exec)
+	{
+		pKernel->m_kernel_exec = exec_mask::all_on();
+		pKernel->m_exec = exec_mask::all_on();
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_unmasked_restorer() 
+	{ 
+		m_pKernel->m_kernel_exec = m_pKernel->m_kernel_exec & m_orig_kernel_exec;
+		m_pKernel->m_exec = m_pKernel->m_exec & m_orig_exec;
+		m_pKernel->check_masks();
+	}
+};
+
+// Brackets a region that runs with all lanes enabled. BEGIN opens a scope holding a scoped_unmasked_restorer built from `this`
+// (so it is only usable inside spmd_kernel member functions); END closes that scope.
+#define SPMD_UNMASKED_BEGIN { scoped_unmasked_restorer _unmasked_restorer(this); 
+#define SPMD_UNMASKED_END }
+
+// Constructs a kernel of type SPMDKernel, initializes it with the caller's current execution mask and invokes its _call()
+// with the forwarded arguments. The disabled (#if 0) variant returns _call()'s result; the active one discards it.
+#if 0
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE decltype(auto) spmd_kernel::spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(m_exec);
+	return kernel._call(std::forward<Args>(args)...);
+}
+#else
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(m_exec);
+	kernel._call(std::forward<Args>(args)...);
+}
+#endif
+
+// Conditional SPMD "break": turns off the active lanes for which cond is set.
+// Must be used inside a loop (asserted in debug builds).
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if_break(const vbool& cond)
+{
+#ifdef _DEBUG
+	assert(m_in_loop);
+#endif
+
+	exec_mask cond_lanes(cond);
+
+	// Lanes that are both active and satisfy the condition are the ones leaving the loop.
+	exec_mask breaking_lanes = m_exec & cond_lanes;
+	m_exec = andnot(breaking_lanes, m_exec);
+
+	check_masks();
+}
+
+// "Simple" if: runs ifBody on the active lanes where cond is set, then restores the execution mask exactly as it was.
+// The body is skipped when no lane qualifies. No SPMD breaks, continues, etc. allowed inside the body.
+template<typename IfBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sif(const vbool& cond, const IfBody& ifBody)
+{
+	exec_mask taken_lanes = m_exec & exec_mask(cond);
+
+	if (!any(taken_lanes))
+		return;
+
+	const exec_mask saved_exec = m_exec;
+
+	m_exec = taken_lanes;
+	ifBody();
+
+	m_exec = saved_exec;
+}
+
+// "Simple" if/else: runs ifBody on the active lanes where cond is set and elseBody on the active lanes where it is not,
+// skipping a body that has no lanes, then restores the execution mask exactly as it was.
+// No SPMD breaks, continues, etc. allowed inside either body.
+template<typename IfBody, typename ElseBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody)
+{
+	const exec_mask saved_exec = m_exec;
+
+	exec_mask if_lanes = m_exec & exec_mask(cond);
+	if (any(if_lanes))
+	{
+		m_exec = if_lanes;
+		ifBody();
+	}
+
+	// The else lanes are derived from the mask saved on entry, not from whatever the if body left behind.
+	exec_mask else_lanes = saved_exec & exec_mask(!cond);
+	if (any(else_lanes))
+	{
+		m_exec = else_lanes;
+		elseBody();
+	}
+
+	m_exec = saved_exec;
+}
+
+// SPMD if: runs ifBody on the active lanes where cond is set; the body is skipped when there are none.
+// Unlike the "simple" variant, lanes that the body disables stay disabled afterwards.
+template<typename IfBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_if(const vbool& cond, const IfBody& ifBody)
+{
+	exec_mask cond_lanes(cond);
+
+	exec_mask taken_lanes = cond_lanes & m_exec;
+	if (!any(taken_lanes))
+		return;
+
+	// Active lanes that fail the condition sit out the body and are switched back on afterwards.
+	exec_mask idle_lanes = andnot(cond_lanes, m_exec);
+
+	m_exec = taken_lanes;
+	ifBody();
+
+	// Whatever survived the body, plus the lanes that never entered it.
+	m_exec = m_exec | idle_lanes;
+
+	check_masks();
+}
+
+// SPMD if/else: runs ifBody on the active lanes where cond is set, then elseBody on the active lanes where it is not.
+// A body with no lanes is skipped. Lanes disabled inside a body stay disabled afterwards; lanes that sat out a body
+// are re-enabled once it finishes.
+template<typename IfBody, typename ElseBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody)
+{
+	// Set when every active lane took the if branch, in which case the else branch is skipped entirely.
+	bool all_flag = false;
+
+	exec_mask cond_exec(cond);
+		
+	{
+		exec_mask pre_if_exec = cond_exec & m_exec;
+
+		int mask = pre_if_exec.get_movemask();
+		if (mask != 0)
+		{
+			all_flag = ((uint32_t)mask == m_exec.get_movemask());
+
+			exec_mask unexecuted_lanes = andnot(cond_exec, m_exec);
+			m_exec = pre_if_exec;
+
+			ifBody();
+
+			// Propagate any lanes that got disabled inside the if body into the exec mask outside the if body, but turn on any lanes that didn't execute inside the if body.
+			m_exec = m_exec | unexecuted_lanes;
+
+			check_masks();
+		}
+	}
+
+	if (!all_flag)
+	{
+		exec_mask pre_else_exec = andnot(cond_exec, m_exec);
+
+		if (any(pre_else_exec))
+		{
+			exec_mask unexecuted_lanes = cond_exec & m_exec;
+			m_exec = pre_else_exec;
+
+			// The lanes that failed the condition must run the else body, not the if body.
+			elseBody();
+
+			// Propagate any lanes that got disabled inside the else body into the exec mask outside the else body, but turn on any lanes that didn't execute inside the else body.
+			m_exec = m_exec | unexecuted_lanes;
+
+			check_masks();
+		}
+	}
+}
+
+// Scoped helper that copies the mask pointed to by pExec_mask on construction and writes that copy back on destruction.
+struct scoped_exec_restorer
+{
+	exec_mask *m_pMask;
+	exec_mask m_prev_mask;
+	CPPSPMD_FORCE_INLINE scoped_exec_restorer(exec_mask *pExec_mask) : m_pMask(pExec_mask), m_prev_mask(*pExec_mask) { }
+	CPPSPMD_FORCE_INLINE ~scoped_exec_restorer() { *m_pMask = m_prev_mask; }
+};
+
+// Cannot use SPMD break, continue, or return inside "simple" if/else
+// SPMD_SIF computes (m_exec & cond) into a temporary declared in the enclosing scope and, if any lane is set, opens a block
+// in which m_exec is narrowed to those lanes; a scoped_exec_restorer puts m_exec back when the block closes.
+// SPMD_SELSE closes the previous block and opens the same kind of block for !cond; SPMD_SENDIF closes the last block.
+#define SPMD_SIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SENDIF }
+
+// Same as SPMD_SIF, except doesn't use a scoped object
+// The previous mask is kept in a block-local variable named _orig_exec and is written back explicitly by
+// SPMD_SELSE2 / SPMD_SEND_IF2, so m_exec is only restored if control reaches one of those macros.
+#define SPMD_SIF2(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SELSE2(cond) m_exec = _orig_exec; } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ exec_mask _orig_exec = m_exec; m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SEND_IF2 m_exec = _orig_exec; }
+
+// Same as SPMD_SIF(), except the if/else blocks are always executed
+// (there is no any() test, so a block runs even when its narrowed mask has no lanes set).
+#define SPMD_SAIF(cond) exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \
+	m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SAELSE(cond) } exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(m_exec & exec_mask(!vbool(cond))); { CPPSPMD::scoped_exec_restorer CPPSPMD_GLUER2(_exec_restore_, __LINE__)(&m_exec); \
+	m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__);
+
+#define SPMD_SAENDIF }
+
+// Cannot use SPMD break, continue, or return inside sselect
+// SPMD_SSELECT opens a do { ... } while(0) scope that saves m_exec (restored when the scope ends) and tracks which lanes have
+// been handled so far. Each SPMD_SCASE runs its block on the originally active lanes whose value equals `value`.
+// SPMD_SCASE_END leaves the select early (via break) once every originally active lane has been handled.
+// SPMD_SDEFAULT runs its block on the originally active lanes that no case handled.
+#define SPMD_SSELECT(var)		do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off());
+#define SPMD_SCASE(value)		exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); if (any(CPPSPMD_GLUER2(_exec_temp, __LINE__))) \
+	{ m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); _select_executed = _select_executed | m_exec;
+
+//#define SPMD_SCASE_END			if (_select_executed.get_movemask() == _orig_exec.m_prev_mask.get_movemask()) break; }
+#define SPMD_SCASE_END			if (!any(_select_executed ^ _orig_exec.m_prev_mask)) break; }
+#define SPMD_SDEFAULT			exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); if (any(_all_other_lanes)) { m_exec = _all_other_lanes;
+#define SPMD_SDEFAULT_END		}
+#define SPMD_SSELECT_END		} while(0);
+
+// Same as SPMD_SSELECT, except all cases are executed.
+// (The case/default blocks have no any() test and SPMD_SACASE_END has no early exit.)
+// Cannot use SPMD break, continue, or return inside sselect
+#define SPMD_SASELECT(var)		do { vint_t _select_var = var; scoped_exec_restorer _orig_exec(&m_exec); exec_mask _select_executed(exec_mask::all_off());
+
+#define SPMD_SACASE(value)		exec_mask CPPSPMD_GLUER2(_exec_temp, __LINE__)(_orig_exec.m_prev_mask & exec_mask(vbool(_select_var == (value)))); { m_exec = CPPSPMD_GLUER2(_exec_temp, __LINE__); \
+	_select_executed = _select_executed | m_exec;
+
+#define SPMD_SACASE_END			}
+#define SPMD_SADEFAULT			exec_mask _all_other_lanes(andnot(_select_executed, _orig_exec.m_prev_mask)); { m_exec = _all_other_lanes;
+#define SPMD_SADEFAULT_END		}
+#define SPMD_SASELECT_END		} while(0);
+
+// Scope guard used by SPMD_IF/SPMD_ELSE/SPMD_AIF/SPMD_AELSE.
+// Construction narrows the kernel's exec mask by a condition and remembers andnot(condition, exec mask);
+// destruction ORs the remembered lanes back into whatever exec mask is current at that point.
+struct scoped_exec_restorer2
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_unexecuted_lanes;
+
+	CPPSPMD_FORCE_INLINE scoped_exec_restorer2(spmd_kernel *pKernel, const vbool &cond) :
+		m_pKernel(pKernel)
+	{
+		exec_mask lanes_passing(cond);
+		exec_mask &live = pKernel->m_exec;
+
+		// Remember the lanes being switched off before narrowing the live mask.
+		m_unexecuted_lanes = andnot(lanes_passing, live);
+		live = lanes_passing & live;
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_exec_restorer2()
+	{
+		exec_mask &live = m_pKernel->m_exec;
+
+		// Merge rather than overwrite, so changes made to the mask inside the block are kept.
+		live = live | m_unexecuted_lanes;
+		m_pKernel->check_masks();
+	}
+};
+
+// SPMD_IF opens an outer scope holding a scoped_exec_restorer2 (which narrows m_exec by cond), then runs the
+// body only if any lane is still active.
+#define SPMD_IF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); if (any(m_exec)) {
+// SPMD_ELSE closes both scopes of the preceding SPMD_IF (destroying its restorer) and repeats the pattern with !cond.
+#define SPMD_ELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); if (any(m_exec)) {
+// Closes the body and the outer restorer scope.
+#define SPMD_END_IF } }
+
+// Same as SPMD_IF, except the conditional block is always executed.
+// (The inner scope is a plain block rather than an if (any(m_exec)) test.)
+#define SPMD_AIF(cond) { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, vbool(cond)); {
+#define SPMD_AELSE(cond) } } { CPPSPMD::scoped_exec_restorer2 CPPSPMD_GLUER2(_exec_restore2_, __LINE__)(this, !vbool(cond)); {
+#define SPMD_AEND_IF } }
+
+// RAII snapshot of a kernel's exec, kernel-exec and continue masks (plus the debug-only m_in_loop flag).
+// Everything captured by the constructor is written back to the kernel by the destructor.
+// Used by SPMD_BEGIN_CALL / SPMD_BEGIN_CALL_ALL_LANES.
+class scoped_exec_saver
+{
+	// Saved copies of the kernel's masks at construction time.
+	exec_mask m_exec, m_kernel_exec, m_continue_mask;
+	spmd_kernel *m_pKernel;
+#ifdef _DEBUG
+	bool m_in_loop;
+#endif
+
+public:
+	// Captures the current mask state of pKernel; the kernel itself is not modified here.
+	inline scoped_exec_saver(spmd_kernel *pKernel) :
+		m_exec(pKernel->m_exec), m_kernel_exec(pKernel->m_kernel_exec), m_continue_mask(pKernel->m_continue_mask),
+		m_pKernel(pKernel)
+	{ 
+#ifdef _DEBUG
+		m_in_loop = pKernel->m_in_loop;
+#endif
+	}
+		
+	// Restores every saved mask unconditionally, discarding any changes made while this object was alive.
+	inline ~scoped_exec_saver()
+	{ 
+		m_pKernel->m_exec = m_exec; 
+		m_pKernel->m_continue_mask = m_continue_mask; 
+		m_pKernel->m_kernel_exec = m_kernel_exec; 
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+// Place at the top of a kernel member function: saves the caller's masks in a scoped_exec_saver (restored when
+// the function's scope ends) and starts the callee with an empty continue mask.
+#define SPMD_BEGIN_CALL scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_continue_mask = exec_mask::all_off();
+// Same as SPMD_BEGIN_CALL, but additionally forces m_exec to all lanes on for the duration of the call.
+#define SPMD_BEGIN_CALL_ALL_LANES scoped_exec_saver CPPSPMD_GLUER2(_begin_call_scoped_exec_saver, __LINE__)(this); m_exec = exec_mask::all_on(); m_continue_mask = exec_mask::all_off();
+
+// Invokes foreachBody(loop_index, n) over the integer range between begin and end in chunks of PROGRAM_COUNT.
+// loop_index starts at begin + program_index and advances by PROGRAM_COUNT per pass; n is the number of range
+// elements covered by the pass (PROGRAM_COUNT, or the remainder on a final partial pass, where m_exec is also
+// narrowed to the first n lanes). On exit m_exec is restored (ANDed with m_kernel_exec) along with the
+// caller's continue mask.
+template<typename ForeachBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_foreach(int begin, int end, const ForeachBody& foreachBody)
+{
+	// Nothing to do for an empty range, or when no lane is active on entry.
+	if ((begin == end) || !any(m_exec))
+		return;
+
+	// We don't support iterating backwards.
+	if (begin > end)
+		std::swap(begin, end);
+
+	exec_mask saved_continue_mask = m_continue_mask;
+	exec_mask saved_exec = m_exec;
+
+	const int span = end - begin;
+	const int num_full_passes = span / PROGRAM_COUNT;
+	const int tail_count = span % PROGRAM_COUNT;
+	const int num_passes = num_full_passes + (tail_count ? 1 : 0);
+
+	lint_t loop_index = begin + program_index;
+
+	m_continue_mask = exec_mask::all_off();
+
+	for (int pass = 0; pass < num_passes; pass++)
+	{
+		const bool is_tail_pass = (pass == (num_passes - 1)) && (tail_count);
+
+		int n = PROGRAM_COUNT;
+		if (is_tail_pass)
+		{
+			// Only the first tail_count lanes map to valid range elements.
+			exec_mask tail_mask = exec_mask(vint_t(tail_count) > vint_t(program_index));
+			m_exec = m_exec & tail_mask;
+			n = tail_count;
+		}
+
+		foreachBody(loop_index, n);
+
+		// Lanes that hit spmd_continue rejoin for the next pass; stop once every lane is gone.
+		m_exec = m_exec | m_continue_mask;
+		if (!any(m_exec))
+			break;
+
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+
+		store_all(loop_index, loop_index + PROGRAM_COUNT);
+	}
+
+	m_exec = saved_exec & m_kernel_exec;
+	m_continue_mask = saved_continue_mask;
+	check_masks();
+}
+
+// Lambda-based SPMD while loop. Each iteration evaluates whileCondBody(), narrows m_exec by the result and stops
+// when no lane remains; otherwise whileBody() runs and lanes parked in m_continue_mask are re-enabled.
+// On exit the entry exec mask (ANDed with m_kernel_exec) and the caller's continue mask are restored.
+template<typename WhileCondBody, typename WhileBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody)
+{
+	exec_mask entry_exec = m_exec;
+
+	exec_mask entry_continue_mask = m_continue_mask;
+	m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+	const bool was_in_loop = m_in_loop;
+	m_in_loop = true;
+#endif
+
+	for (;;)
+	{
+		// Lanes failing the condition drop out for the remainder of the loop.
+		exec_mask still_looping = exec_mask(whileCondBody());
+		m_exec = m_exec & still_looping;
+
+		if (!any(m_exec))
+		{
+			break;
+		}
+
+		whileBody();
+
+		// Continued lanes rejoin before the next condition test.
+		m_exec = m_exec | m_continue_mask;
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+	}
+
+#ifdef _DEBUG
+	m_in_loop = was_in_loop;
+#endif
+
+	m_exec = entry_exec & m_kernel_exec;
+	m_continue_mask = entry_continue_mask;
+	check_masks();
+}
+
+// RAII guard used by the macro loop constructs (SPMD_WHILE, SPMD_FOR, SPMD_FOREACH, SPMD_FOREACH_ACTIVE,
+// SPMD_FOREACH_UNIQUE_INT). On construction it snapshots the kernel's exec and continue masks, clears the
+// continue mask for the new loop and (debug builds only) marks the kernel as being inside a loop.
+// On destruction it restores the exec mask ANDed with m_kernel_exec, and the saved continue mask.
+struct scoped_while_restorer
+{
+	spmd_kernel *m_pKernel;
+	exec_mask m_orig_exec, m_orig_continue_mask;
+#ifdef _DEBUG
+	bool m_prev_in_loop;
+#endif
+
+	CPPSPMD_FORCE_INLINE scoped_while_restorer(spmd_kernel *pKernel) :
+		m_pKernel(pKernel),
+		m_orig_exec(pKernel->m_exec),
+		m_orig_continue_mask(pKernel->m_continue_mask)
+	{
+		// exec_mask::all_off() returns a cleared mask instead of modifying the object it is called on, so the
+		// result must be assigned. Without the assignment, an enclosing loop's continue lanes would leak into
+		// this loop and be re-enabled by the first "m_exec | m_continue_mask" at the loop's end.
+		// spmd_while() and spmd_for() reset the mask the same way.
+		pKernel->m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+		m_prev_in_loop = pKernel->m_in_loop;
+		pKernel->m_in_loop = true;
+#endif
+	}
+
+	CPPSPMD_FORCE_INLINE ~scoped_while_restorer()
+	{
+		// AND with m_kernel_exec so lanes removed from the kernel inside the loop stay disabled.
+		m_pKernel->m_exec = m_orig_exec & m_pKernel->m_kernel_exec;
+		m_pKernel->m_continue_mask = m_orig_continue_mask;
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_prev_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+// Macro form of spmd_while(). Any earlier definitions of these names are replaced.
+#undef SPMD_WHILE
+#undef SPMD_WEND
+// SPMD_WHILE opens a scope holding a scoped_while_restorer, then an infinite loop whose first statements
+// narrow m_exec by cond and break out once no lane is active.
+#define SPMD_WHILE(cond) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); \
+	m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+
+// SPMD_WEND re-enables lanes parked in m_continue_mask, clears that mask, and closes the loop and the restorer scope.
+#define SPMD_WEND m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); } }
+
+// Nesting is not supported (although it will compile, but the results won't make much sense).
+// Macro form of spmd_foreach(). Skips everything when bi == ei or no lane is active. Otherwise it orders the
+// bounds, splits the range into full PROGRAM_COUNT-wide passes plus an optional partial pass, declares loop_var
+// as program_index + b, and on the final partial pass narrows m_exec to the first total_partial lanes.
+// Note that it declares fixed-name locals (b, e, total_full, total_partial, total_loops, partial_mask) in the
+// caller's scope.
+#define SPMD_FOREACH(loop_var, bi, ei) if (((bi) != (ei)) && (any(m_exec))) { \
+	scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	uint32_t b = (uint32_t)(bi), e = (uint32_t)(ei); if ((b) > (e)) { std::swap(b, e); } const uint32_t total_full = ((e) - (b)) >> PROGRAM_COUNT_SHIFT, total_partial = ((e) - (b)) & (PROGRAM_COUNT - 1); \
+	lint_t loop_var = program_index + (int)b; const uint32_t total_loops = total_full + (total_partial ? 1U : 0U); \
+	for (uint32_t CPPSPMD_GLUER2(_foreach_counter, __LINE__) = 0; CPPSPMD_GLUER2(_foreach_counter, __LINE__) < total_loops; ++CPPSPMD_GLUER2(_foreach_counter, __LINE__)) { \
+		if ((CPPSPMD_GLUER2(_foreach_counter, __LINE__) == (total_loops - 1)) && (total_partial)) { exec_mask partial_mask = exec_mask(vint_t((int)total_partial) > vint_t(program_index)); m_exec = m_exec & partial_mask; }
+
+// Ends a pass: re-enables continued lanes, leaves the loop if none remain, otherwise clears the continue mask and
+// advances loop_var by PROGRAM_COUNT. Closes the for loop and the outer if scope.
+#define SPMD_FOREACH_END(loop_var) m_exec = m_exec | m_continue_mask; if (!any(m_exec)) break; m_continue_mask = exec_mask::all_off(); check_masks(); store_all(loop_var, loop_var + PROGRAM_COUNT); } }
+
+// Okay to use spmd_continue or spmd_return, but not spmd_break
+// Runs the body once per lane that was active on entry. index_var is declared in the caller's scope as an
+// int64_t and receives the lane number for each pass. The set of lanes to visit is taken from a movemask
+// snapshot made before the loop starts.
+// NOTE(review): enable_lane(_i) presumably makes lane _i the only active lane - confirm against exec_mask.
+#define SPMD_FOREACH_ACTIVE(index_var) int64_t index_var; { uint64_t _movemask = m_exec.get_movemask(); if (_movemask) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	for (uint32_t _i = 0; _i < PROGRAM_COUNT; ++_i) { \
+		if (_movemask & (1U << _i)) { \
+			m_exec.enable_lane(_i); m_exec = m_exec & m_kernel_exec; \
+			(index_var) = _i; \
+
+// Closes, in order: the per-lane if, the for loop, the if (_movemask) block and the outer scope.
+#define SPMD_FOREACH_ACTIVE_END } } } }
+
+// Okay to use spmd_continue, but not spmd_break/spmd_return
+// Runs the body once per distinct integer value found in var. All PROGRAM_COUNT lanes of var are stored to a
+// scratch array (store_linear_all), which is sorted and de-duplicated; for each distinct value, index_var is a
+// scalar int holding that value and m_exec is set to the lanes where var equals it.
+// NOTE(review): m_exec is assigned exec_mask(cond) directly, without ANDing the mask on entry or m_kernel_exec,
+// so lanes that were inactive on entry look like they can be enabled inside the body - confirm this is intended.
+#define SPMD_FOREACH_UNIQUE_INT(index_var, var) { scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	CPPSPMD_DECL(int_t, _vals[PROGRAM_COUNT]); store_linear_all(_vals, var); std::sort(_vals, _vals + PROGRAM_COUNT); \
+	const int _n = (int)(std::unique(_vals, _vals + PROGRAM_COUNT) - _vals); \
+	for (int _i = 0; _i < _n; ++_i) { int index_var = _vals[_i]; vbool cond = (vint_t(var) == vint_t(index_var)); m_exec = exec_mask(cond);
+
+// Closes the for loop and the scope holding the scoped_while_restorer.
+#define SPMD_FOREACH_UNIQUE_INT_END } }
+
+// Lightweight RAII guard for the "simple" loop macros (SPMD_SWHILE, SPMD_SDO).
+// It only saves and restores m_exec (plus the debug-only m_in_loop flag). Unlike scoped_while_restorer it does
+// not touch m_continue_mask and does not AND the restored mask with m_kernel_exec.
+struct scoped_simple_while_restorer
+{
+	spmd_kernel* m_pKernel;
+	exec_mask m_orig_exec;
+#ifdef _DEBUG
+	bool m_prev_in_loop;
+#endif
+
+	// Snapshots pKernel's exec mask; in debug builds also flags the kernel as being inside a loop.
+	CPPSPMD_FORCE_INLINE scoped_simple_while_restorer(spmd_kernel* pKernel) :
+		m_pKernel(pKernel),
+		m_orig_exec(pKernel->m_exec)
+	{
+			
+#ifdef _DEBUG
+		m_prev_in_loop = pKernel->m_in_loop;
+		pKernel->m_in_loop = true;
+#endif
+	}
+
+	// Writes the saved exec mask back unchanged.
+	CPPSPMD_FORCE_INLINE ~scoped_simple_while_restorer()
+	{
+		m_pKernel->m_exec = m_orig_exec;
+#ifdef _DEBUG
+		m_pKernel->m_in_loop = m_prev_in_loop;
+		m_pKernel->check_masks();
+#endif
+	}
+};
+
+// Cannot use SPMD break, continue, or return inside simple while
+
+// SPMD_SWHILE opens a scope holding a scoped_simple_while_restorer and an infinite loop that narrows m_exec by
+// cond at the top of each iteration, breaking once no lane is active. There is no continue-mask handling.
+#define SPMD_SWHILE(cond) { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); \
+	while(true) { \
+		exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+// Closes the loop and the restorer scope.
+#define SPMD_SWEND } }	
+
+// Cannot use SPMD break, continue, or return inside simple do
+// SPMD_SDO is the do/while counterpart: the body runs first, and SPMD_SEND_DO(cond) applies the condition at the
+// bottom of each iteration.
+#define SPMD_SDO { scoped_simple_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) {
+#define SPMD_SEND_DO(cond) exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(cond)); m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break; } }	
+
+// Macro form of spmd_for(). Any earlier definitions of these names are replaced.
+#undef SPMD_FOR
+#undef SPMD_END_FOR
+// SPMD_FOR runs for_init once (before the scoped_while_restorer is constructed), then loops: m_exec is narrowed
+// by for_cond at the top of each iteration and the loop exits once no lane is active.
+#define SPMD_FOR(for_init, for_cond) { for_init; scoped_while_restorer CPPSPMD_GLUER2(_while_restore_, __LINE__)(this); while(true) { exec_mask CPPSPMD_GLUER2(cond_exec, __LINE__) = exec_mask(vbool(for_cond)); \
+	m_exec = m_exec & CPPSPMD_GLUER2(cond_exec, __LINE__); if (!any(m_exec)) break;
+// SPMD_END_FOR re-enables continued lanes, clears the continue mask, runs for_inc, and closes the loop and scope.
+#define SPMD_END_FOR(for_inc) m_exec = m_exec | m_continue_mask; m_continue_mask = exec_mask::all_off(); check_masks(); for_inc; } }
+		
+// Lambda-based SPMD for loop. forInitBody() runs once; then each iteration narrows m_exec by forCondBody(),
+// stops when no lane remains, runs forBody(), re-enables lanes parked in m_continue_mask and finally runs
+// forIncrBody(). On exit the entry exec mask (ANDed with m_kernel_exec) and the caller's continue mask are restored.
+template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
+CPPSPMD_FORCE_INLINE void spmd_kernel::spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody)
+{
+	// The exec mask is captured before the init body runs.
+	exec_mask entry_exec = m_exec;
+
+	forInitBody();
+
+	exec_mask entry_continue_mask = m_continue_mask;
+	m_continue_mask = exec_mask::all_off();
+
+#ifdef _DEBUG
+	const bool was_in_loop = m_in_loop;
+	m_in_loop = true;
+#endif
+
+	for (;;)
+	{
+		// Lanes failing the condition drop out for the remainder of the loop.
+		exec_mask still_looping = exec_mask(forCondBody());
+		m_exec = m_exec & still_looping;
+
+		if (!any(m_exec))
+		{
+			break;
+		}
+
+		forBody();
+
+		// Continued lanes rejoin before the increment step.
+		m_exec = m_exec | m_continue_mask;
+		m_continue_mask = exec_mask::all_off();
+		check_masks();
+
+		forIncrBody();
+	}
+
+	m_exec = entry_exec & m_kernel_exec;
+	m_continue_mask = entry_continue_mask;
+
+#ifdef _DEBUG
+	m_in_loop = was_in_loop;
+	check_masks();
+#endif
+}
diff --git a/thirdparty/basis_universal/encoder/cppspmd_math.h b/thirdparty/basis_universal/encoder/cppspmd_math.h
new file mode 100644
index 0000000000..e7b3202b8e
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/cppspmd_math.h
@@ -0,0 +1,725 @@
+// Do not include this header directly.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The general goal of these vectorized estimated math functions is scalability/performance.
+// There are explicitly no checks for NaNs/Infs on the input arguments. There are no assertions either.
+// These are fast estimate functions - if you need more than that, use stdlib. Please do a proper 
+// engineering analysis before relying on them.
+// I have chosen functions written by others, ported them to CppSPMD, then measured their abs/rel errors.
+// I compared each to the ones in DirectXMath and stdlib's for accuracy/performance.
+
+// Floating-point remainder estimate of a by b, given b_inv (presumably a precomputed 1/b).
+// The magnitude is frac(|a * b_inv|) * |b|; the result is negated wherever a is negative.
+CPPSPMD_FORCE_INLINE vfloat fmod_inv(const vfloat& a, const vfloat& b, const vfloat& b_inv)
+{
+	vfloat quotient_frac = frac(abs(a * b_inv));
+	vfloat magnitude = quotient_frac * abs(b);
+	return spmd_ternaryf(a < 0, -magnitude, magnitude);
+}
+
+// Variant of fmod_inv() without the abs()/sign handling - presumably intended for non-negative a and b.
+CPPSPMD_FORCE_INLINE vfloat fmod_inv_p(const vfloat& a, const vfloat& b, const vfloat& b_inv)
+{
+	vfloat quotient_frac = frac(a * b_inv);
+	return quotient_frac * b;
+}
+
+// Divides a by b, but wherever |b| is not greater than fDivThresh the divisor is replaced by
+// -fDivThresh (b negative) or +fDivThresh (otherwise), so the division never sees zero or a tiny value.
+CPPSPMD_FORCE_INLINE vfloat safe_div(vfloat a, vfloat b, float fDivThresh = 1e-7f)
+{
+	vfloat clamped_divisor = spmd_ternaryf(abs(b) > fDivThresh, b, spmd_ternaryf(b < 0.0f, -fDivThresh, fDivThresh));
+	return a / clamped_divisor;
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	f range: 0.0000000000001250 10000000000.0000000000000000, vals: 1073741824
+
+	log2_est():
+	max abs err: 0.0000023076808731
+	max rel err: 0.0000000756678881
+	avg abs err: 0.0000007535452724
+	avg rel err: 0.0000000235117843
+
+	XMVectorLog2():
+	max abs err: 0.0000023329709933
+	max rel err: 0.0000000826961046
+	avg abs err: 0.0000007564889684
+	avg rel err: 0.0000000236051899
+
+	std::log2f():
+	max abs err: 0.0000020265979401
+	max rel err: 0.0000000626647654
+	avg abs err: 0.0000007494445227
+	avg rel err: 0.0000000233800985
+*/
+
+// See https://tech.ebayinc.com/engineering/fast-approximate-logarithms-part-iii-the-formulas/
+// Estimates log2(v) per lane. The input is clamped to at least 2.2e-38, split into exponent and significand by
+// bit manipulation, and the log of the significand is approximated with a rational polynomial.
+// Returns the exponent plus that approximation. No NaN/Inf handling is performed.
+inline vfloat spmd_kernel::log2_est(vfloat v)
+{
+	vfloat signif, fexp;
+
+	// Just clamp to a very small value, instead of checking for invalid inputs.
+	vfloat x = max(v, 2.2e-38f);
+
+	/*
+	 * Assume IEEE representation, which is sgn(1):exp(8):frac(23)
+	 * representing (1+frac)*2^(exp-127).  Call 1+frac the significand
+	 */
+
+	 // get exponent
+	vint ux1_i = cast_vfloat_to_vint(x);
+
+	vint exp = VUINT_SHIFT_RIGHT(ux1_i & 0x7F800000, 23);
+
+	// actual exponent is exp-127, will subtract 127 later
+
+	vint ux2_i;
+	vfloat ux2_f;
+
+	// 0x00400000 is the top bit of the 23-bit fraction field.
+	vint greater = ux1_i & 0x00400000;  // true if signif > 1.5
+	// NOTE(review): this branch writes with store_all while the else branch uses store; since the else branch
+	// runs afterwards (when it runs at all), its masked stores overwrite only the lanes it owns.
+	SPMD_SIF(greater != 0)
+	{
+		// signif >= 1.5 so need to divide by 2.  Accomplish this by stuffing exp = 126 which corresponds to an exponent of -1 
+		store_all(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f000000);
+
+		store_all(ux2_f, cast_vint_to_vfloat(ux2_i));
+
+		// 126 instead of 127 compensates for division by 2
+		store_all(fexp, vfloat(exp - 126));
+	}
+	SPMD_SELSE(greater != 0)
+	{
+		// get signif by stuffing exp = 127 which corresponds to an exponent of 0
+		store(ux2_i, (ux1_i & 0x007FFFFF) | 0x3f800000);
+
+		store(ux2_f, cast_vint_to_vfloat(ux2_i));
+
+		store(fexp, vfloat(exp - 127));
+	}
+	SPMD_SENDIF
+
+	// The polynomial below is evaluated in (significand - 1).
+	store_all(signif, ux2_f);
+	store_all(signif, signif - 1.0f);
+
+	// Coefficients of the rational approximation (a*x^3 + b*x^2 + c*x) / (x^2 + d*x + e).
+	const float a = 0.1501692f, b = 3.4226132f, c = 5.0225057f, d = 4.1130283f, e = 3.4813372f;
+
+	vfloat xm1 = signif;
+	vfloat xm1sqr = xm1 * xm1;
+		
+	return fexp + ((a * (xm1sqr * xm1) + b * xm1sqr + c * xm1) / (xm1sqr + d * xm1 + e));
+	
+	// fma lowers accuracy for SSE4.1 - no idea why (compiler reordering?)
+	//return fexp + ((vfma(a, (xm1sqr * xm1), vfma(b, xm1sqr, c * xm1))) / (xm1sqr + vfma(d, xm1, e)));
+}
+
+// Natural-log estimate: log2_est(v) scaled by ln(2). Its precision is therefore bounded by that of log2_est().
+inline vfloat spmd_kernel::log_est(vfloat v)
+{
+	const float fLn2 = 0.693147181f;
+	return log2_est(v) * fLn2;
+}
+
+// Range-reduction helper for exp2_est().
+//   arg        (in/out): on return holds the reduced fractional part (after the optional 0.5 subtraction).
+//   two_int_a  (out):    float whose exponent field is built from the integer part of arg, i.e. 2**int(arg).
+//   adjustment (out):    1 in lanes where 0.5 was subtracted from the fraction, otherwise 0.
+CPPSPMD_FORCE_INLINE void spmd_kernel::reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment)
+{
+	// Assume we're using equation (2)
+	store_all(adjustment, 0);
+	
+	// integer part of the input argument
+	vint int_arg = (vint)arg;
+	
+	// if frac(arg) is in [0.5, 1.0]...
+	SPMD_SIF((arg - int_arg) > 0.5f)   
+	{
+		store(adjustment, 1);
+		
+		// then change it to [0.0, 0.5]
+		store(arg, arg - 0.5f);
+	}
+	SPMD_SENDIF
+
+	// arg == just the fractional part
+	store_all(arg, arg - (vfloat)int_arg);
+   
+	// Now compute 2** (int) arg. 
+	// Adding 127 applies the IEEE exponent bias; the result is capped at 254.
+	store_all(int_arg, min(int_arg + 127, 254));
+	
+	// Shifting the biased exponent into bits 23..30 yields the float 2**int_arg.
+	store_all(two_int_a, cast_vint_to_vfloat(VINT_SHIFT_LEFT(int_arg, 23)));
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	f range : -50.0000000000000000 49.9999940395355225, vals : 16777216
+	
+	exp2_est():
+	Total passed near - zero check : 16777216
+	Total sign diffs : 0
+	max abs err: 1668910609.7500000000000000
+	max rel err: 0.0000015642030031
+	avg abs err: 10793794.4007573910057545
+	avg rel err: 0.0000003890893282
+	 
+	XMVectorExp2():
+	Total passed near-zero check: 16777216
+	Total sign diffs: 0
+	max abs err: 1665552836.8750000000000000
+	max rel err: 0.0000114674862370
+	avg abs err: 10771868.2627860084176064
+	avg rel err: 0.0000011218880770
+
+	std::exp2f():
+	Total passed near-zero check: 16777216
+	Total sign diffs: 0
+	max abs err: 1591636585.6250000000000000
+	max rel err: 0.0000014849731018
+	avg abs err: 10775800.3204844966530800
+	avg rel err: 0.0000003851496422
+*/
+
+// http://www.ganssle.com/item/approximations-c-code-exponentiation-log.htm
+// Estimates 2**arg per lane. Lanes with |arg| > 126 return 0. Negative inputs are negated up front and the
+// reciprocal is taken at the end. The remaining argument is range-reduced by reduce_expb() and evaluated with
+// a rational polynomial.
+inline vfloat spmd_kernel::exp2_est(vfloat arg)
+{
+	SPMD_BEGIN_CALL
+
+	const vfloat P00 = +7.2152891521493f;
+	const vfloat P01 = +0.0576900723731f;
+	const vfloat Q00 = +20.8189237930062f;
+	const vfloat Q01 = +1.0f;
+	const vfloat sqrt2 = 1.4142135623730950488f; // sqrt(2) for scaling 
+
+	// Lanes that return early keep this default of 0.
+	vfloat result = 0.0f;
+
+	// Return 0 if arg is too large. 
+	// We're not introducing inf/nan's into calculations, or risk doing so by returning huge default values.
+	SPMD_IF(abs(arg) > 126.0f)
+	{
+		spmd_return();
+	}
+	SPMD_END_IF
+
+	// 2**(int(a))
+	vfloat two_int_a;                
+	
+	// set to 1 by reduce_expb
+	vint adjustment;
+	
+	// 0 if arg is +; 1 if negative
+	vint negative = 0;                 
+
+	// If the input is negative, invert it. At the end we'll take the reciprocal, since n**(-x) = 1/(n**x).
+	SPMD_SIF(arg < 0.0f)
+	{
+		store(arg, -arg);
+		store(negative, 1);
+	}
+	SPMD_SENDIF
+
+	store_all(arg, min(arg, 126.0f));
+
+	// reduce to [0.0, 0.5]
+	reduce_expb(arg, two_int_a, adjustment);
+
+	// The format of the polynomial is:
+	//  answer=(Q(x**2) + x*P(x**2))/(Q(x**2) - x*P(x**2))
+	//
+	//  The following computes the polynomial in several steps:
+
+	// Q(x**2)
+	vfloat Q = vfma(Q01, (arg * arg), Q00);
+	
+	// x*P(x**2)
+	vfloat x_P = arg * (vfma(P01, arg * arg, P00));
+	
+	vfloat answer = (Q + x_P) / (Q - x_P);
+
+	// Now correct for the scaling factor of 2**(int(a))
+	store_all(answer, answer * two_int_a);
+			
+	// If the result had a fractional part > 0.5, correct for that
+	store_all(answer, spmd_ternaryf(adjustment != 0, answer * sqrt2, answer));
+
+	// Correct for a negative input
+	SPMD_SIF(negative != 0)
+	{
+		store(answer, 1.0f / answer);
+	}
+	SPMD_SENDIF
+
+	// Masked store: only lanes that did not return early receive the computed answer.
+	store(result, answer);
+
+	return result;
+}
+
+// Estimates e**arg via exp2_est(): e^x = 2^(x / ln(2)), so the argument is scaled by 1/ln(2) first.
+inline vfloat spmd_kernel::exp_est(vfloat arg)
+{
+	const float fOneOverLn2 = 1.44269504f;
+	return exp2_est(arg * fOneOverLn2);
+}
+
+// Estimates arg1**arg2 as exp(ln(arg1) * arg2), built on log_est() and exp_est().
+inline vfloat spmd_kernel::pow_est(vfloat arg1, vfloat arg2)
+{
+	vfloat ln_base = log_est(arg1);
+	return exp_est(ln_base * arg2);
+}
+
+/*
+	clang 9.0.0 for win /fp:precise release
+	Total near-zero: 144, output above near-zero thresh: 30
+	Total near-zero avg: 0.0000067941016621 max: 0.0000134706497192
+	Total near-zero sign diffs: 5
+	Total passed near-zero check: 16777072
+	Total sign diffs: 5
+	max abs err: 0.0000031375306036
+	max rel err: 0.1140846017075028
+	avg abs err: 0.0000003026226621
+	avg rel err: 0.0000033564977623
+*/
+
+// Math from this web page: http://developer.download.nvidia.com/cg/sin.html
+// This is ~2x slower than sin_est() or cos_est(), and less accurate, but I'm keeping it here for comparison purposes to help validate/sanity check sin_est() and cos_est().
+// Computes a sine estimate when sin_flag is true and a cosine estimate when it is false (see the sin_est_a /
+// cos_est_a wrappers). The only difference between the two paths is the -0.25 offset applied to the scaled angle.
+inline vfloat spmd_kernel::sincos_est_a(vfloat a, bool sin_flag)
+{
+	// Constant vectors, named after the registers used on the referenced Cg page.
+	const float c0_x = 0.0f, c0_y = 0.5f, c0_z = 1.0f;
+	const float c1_x = 0.25f, c1_y = -9.0f, c1_z = 0.75f, c1_w = 0.159154943091f;
+	const float c2_x = 24.9808039603f, c2_y = -24.9808039603f, c2_z = -60.1458091736f, c2_w = 60.1458091736f;
+	const float c3_x = 85.4537887573f, c3_y = -85.4537887573f, c3_z = -64.9393539429f, c3_w = 64.9393539429f;
+	const float c4_x = 19.7392082214f, c4_y = -19.7392082214f, c4_z = -1.0f, c4_w = 1.0f;
+
+	vfloat r0_x, r0_y, r0_z, r1_x, r1_y, r1_z, r2_x, r2_y, r2_z;
+
+	// Scale the angle by c1_w (~1/(2*pi)); the sine path additionally applies the c1_x (0.25) offset via vfms.
+	store_all(r1_x, sin_flag ? vfms(c1_w, a, c1_x) : c1_w * a);
+
+	// Keep only the fractional part of the scaled angle.
+	store_all(r1_y, frac(r1_x));                   
+	
+	// Comparison results converted to float; they are used below to weight the three polynomial evaluations.
+	store_all(r2_x, (vfloat)(r1_y < c1_x));        
+
+	store_all(r2_y, (vfloat)(r1_y >= c1_y));    
+	store_all(r2_z, (vfloat)(r1_y >= c1_z));    
+
+	store_all(r2_y, vfma(r2_x, c4_z, vfma(r2_y, c4_w, r2_z * c4_z)));
+
+	// Squared distances of the fraction from 0, 0.5 and 1.
+	store_all(r0_x, c0_x - r1_y);                
+	store_all(r0_y, c0_y - r1_y);                
+	store_all(r0_z, c0_z - r1_y);                
+	
+	store_all(r0_x, r0_x * r0_x);
+	store_all(r0_y, r0_y * r0_y);
+	store_all(r0_z, r0_z * r0_z);
+
+	// Three polynomials evaluated in parallel (Horner form), one per squared distance.
+	store_all(r1_x, vfma(c2_x, r0_x, c2_z));           
+	store_all(r1_y, vfma(c2_y, r0_y, c2_w));           
+	store_all(r1_z, vfma(c2_x, r0_z, c2_z));           
+	
+	store_all(r1_x, vfma(r1_x, r0_x, c3_x));
+	store_all(r1_y, vfma(r1_y, r0_y, c3_y));
+	store_all(r1_z, vfma(r1_z, r0_z, c3_x));
+		
+	store_all(r1_x, vfma(r1_x, r0_x, c3_z));
+	store_all(r1_y, vfma(r1_y, r0_y, c3_w));
+	store_all(r1_z, vfma(r1_z, r0_z, c3_z));
+	
+	store_all(r1_x, vfma(r1_x, r0_x, c4_x));
+	store_all(r1_y, vfma(r1_y, r0_y, c4_y));
+	store_all(r1_z, vfma(r1_z, r0_z, c4_x));
+
+	store_all(r1_x, vfma(r1_x, r0_x, c4_z));
+	store_all(r1_y, vfma(r1_y, r0_y, c4_w));
+	store_all(r1_z, vfma(r1_z, r0_z, c4_z));
+
+	// Combine the three polynomial results using the comparison weights computed above.
+	store_all(r0_x, vfnma(r1_x, r2_x, vfnma(r1_y, r2_y, r1_z * -r2_z)));
+
+	return r0_x;
+}
+
+// positive values only
+// Reciprocal estimate: an initial guess is formed by subtracting the input's bit pattern from a magic integer,
+// then refined with one correction step of the form guess * vfnma(guess, q, 2).
+// Inputs below fMinThresh are replaced by the float whose bit pattern is the magic constant before the guess
+// is formed (the correction step still uses the original q).
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1(const vfloat& q)
+{
+	// Alternative magic values noted by the original author: 0x7EF312AC (2 NR iters), 0x7EEEEBB3 (3 NR iters).
+	const int mag = 0x7EF311C3;
+	const float fMinThresh = .0000125f;
+
+	vfloat guess_input = spmd_ternaryf(q >= fMinThresh, q, cast_vint_to_vfloat(vint(mag)));
+	vint guess_bits = vint(mag) - cast_vfloat_to_vint(guess_input);
+	vfloat guess = cast_vint_to_vfloat(guess_bits);
+
+	vfloat correction = vfnma(guess, q, 2.0f);
+	return guess * correction;
+}
+
+// Same estimate as recip_est1(), but accepts positive or negative inputs: the computation is done on |t| and the
+// result is multiplied by sign(t) at the end.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::recip_est1_pn(const vfloat& t)
+{
+	// Alternative magic values noted by the original author: 0x7EF312AC (2 NR iters), 0x7EEEEBB3 (3 NR iters).
+	const int mag = 0x7EF311C3;
+	const float fMinThresh = .0000125f;
+
+	vfloat input_sign = sign(t);
+	vfloat magnitude = abs(t);
+
+	vfloat guess_input = spmd_ternaryf(magnitude >= fMinThresh, magnitude, cast_vint_to_vfloat(vint(mag)));
+	vint guess_bits = vint(mag) - cast_vfloat_to_vint(guess_input);
+	vfloat guess = cast_vint_to_vfloat(guess_bits);
+
+	vfloat correction = vfnma(guess, magnitude, 2.0f);
+	return guess * correction * input_sign;
+}
+
+// https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf
+// https://github.com/hcs0/Hackers-Delight/blob/master/rsqrt.c.txt
+// Reciprocal square root estimate: magic-constant initial guess followed by a single refinement step.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est1(vfloat x0)
+{
+	vfloat half_x0 = 0.5f * x0;
+
+	// Initial guess: magic constant minus the input's bit pattern shifted right by one.
+	vint guess_bits = vint(0x5F375A82) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1));
+	vfloat guess = cast_vint_to_vfloat(guess_bits);
+
+	return guess * vfnma(half_x0 * guess, guess, 1.5008909f);
+}
+
+// Reciprocal square root estimate with two refinement steps (compare rsqrt_est1(), which uses one).
+// The initial guess comes from subtracting the input's bit pattern, shifted right by one, from a magic constant.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::rsqrt_est2(vfloat x0)
+{
+	vfloat xhalf = 0.5f * x0;
+	vfloat x = cast_vint_to_vfloat(vint(0x5F37599E) - (VINT_SHIFT_RIGHT(cast_vfloat_to_vint(x0), 1)));
+
+	// Float literals (1.5f) instead of doubles: same value, but no implicit double->float narrowing, and it
+	// matches the literal style used by rsqrt_est1() and the rest of this file.
+	vfloat x1 = x * vfnma(xhalf * x, x, 1.5f);
+	vfloat x2 = x1 * vfnma(xhalf * x1, x1, 1.5f);
+	return x2;
+}
+
+// Math from: http://developer.download.nvidia.com/cg/atan2.html
+// TODO: Needs more validation, parameter checking.
+// Estimates atan2(y, x) per lane: a polynomial in (min(|x|,|y|) / max(|x|,|y|)) followed by three
+// corrections based on which of |y|/|x| is larger and on the signs of x and y.
+// NOTE(review): there is no guard on the division below, so x == y == 0 divides 0 by 0.
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::atan2_est(vfloat y, vfloat x)
+{
+	vfloat t1 = abs(y);
+	vfloat t3 = abs(x);
+	
+	// t0 = larger magnitude, t1 = smaller magnitude.
+	vfloat t0 = max(t3, t1);
+	store_all(t1, min(t3, t1));
+
+	// Ratio of smaller to larger magnitude.
+	store_all(t3, t1 / t0);
+	
+	// Odd polynomial in the ratio, evaluated in Horner form on t4 = ratio^2.
+	vfloat t4 = t3 * t3;
+	store_all(t0, vfma(-0.013480470f, t4, 0.057477314f));
+	store_all(t0, vfms(t0, t4, 0.121239071f));
+	store_all(t0, vfma(t0, t4, 0.195635925f));
+	store_all(t0, vfms(t0, t4, 0.332994597f));
+	store_all(t0, vfma(t0, t4, 0.999995630f));
+	store_all(t3, t0 * t3);
+
+	// |y| > |x|: reflect about pi/2.
+	store_all(t3, spmd_ternaryf(abs(y) > abs(x), vfloat(1.570796327f) - t3, t3));
+
+	// x < 0: reflect about pi. y < 0: negate.
+	store_all(t3, spmd_ternaryf(x < 0.0f, vfloat(3.141592654f) - t3, t3));
+	store_all(t3, spmd_ternaryf(y < 0.0f, -t3, t3));
+
+	return t3;
+}
+
+/*
+    clang 9.0.0 for win /fp:precise release
+	Tested range: -25.1327412287183449 25.1327382326621169, vals : 16777216
+	Skipped angles near 90/270 within +- .001 radians.
+	Near-zero threshold: .0000125f
+	Near-zero output above check threshold: 1e-6f
+
+	Total near-zero: 144, output above near-zero thresh: 20
+	Total near-zero avg: 0.0000067510751968 max: 0.0000133514404297
+	Total near-zero sign diffs: 5
+	Total passed near-zero check: 16766400
+	Total sign diffs: 5
+	max abs err: 1.4982600811139264
+	max rel err: 0.1459155900188041
+	avg rel err: 0.0000054659502568
+
+	XMVectorTan() precise:
+	Total near-zero: 144, output above near-zero thresh: 18
+	Total near-zero avg: 0.0000067641216186 max: 0.0000133524126795
+	Total near-zero sign diffs: 0
+	Total passed near-zero check: 16766400
+	Total sign diffs: 0
+	max abs err: 1.9883573246424930
+	max rel err: 0.1459724171926864
+	avg rel err: 0.0000054965766843
+
+	std::tanf():
+	Total near-zero: 144, output above near-zero thresh: 0
+	Total near-zero avg: 0.0000067116930779 max: 0.0000127713074107
+	Total near-zero sign diffs: 11
+	Total passed near-zero check: 16766400
+	Total sign diffs: 11
+	max abs err: 0.8989131818294709
+	max rel err: 0.0573181403173166
+	avg rel err: 0.0000030791301203
+	
+	Originally from:
+	http://www.ganssle.com/approx.htm
+*/
+
+// Rational-polynomial core used by tan_est(): x * (c2*x^2 + c1) / (x^2 * (c4 + x^2) + c3).
+// The original double-precision version was good to 8.2 digits, with constants
+//   c1 = 211.849369664121, c2 = -12.5288887278448, c3 = 269.7350131214121, c4 = -71.4145309347748.
+// The float constants below were tuned for lower average relative error (without using FMA3).
+CPPSPMD_FORCE_INLINE vfloat spmd_kernel::tan82(vfloat x)
+{
+	const float c1 = 211.849350f, c2 = -12.5288887f, c3 = 269.734985f, c4 = -71.4145203f;
+
+	vfloat x2 = x * x;
+	vfloat numerator = x * (vfma(c2, x2, c1));
+	vfloat denominator = vfma(x2, (c4 + x2), c3);
+
+	return numerator / denominator;
+}
+
+// Don't call this for angles close to 90/270!.
+// Estimates tan(x) per lane. The sign of x is stripped and re-applied at the end; |x| is reduced to a fraction
+// of pi, split into four sub-ranges ("octant" 0..3), mapped through tan82(), and inverted/negated as required
+// by the sub-range. A small-angle approximation is used just below a multiple of pi.
+inline vfloat spmd_kernel::tan_est(vfloat x)
+{
+	const float fPi = 3.141592653589793f, fOneOverPi = 0.3183098861837907f;
+	// Per-octant bias table, stored with a +128 offset so every entry fits in an unsigned byte
+	// (the offset is removed again when "bias" is computed below).
+	CPPSPMD_DECL(const uint8_t, s_table0[16]) =	{ 128 + 0, 128 + 2, 128 + -2, 128 + 4,    128 + 0, 128 + 2, 128 + -2, 128 + 4,	  128 + 0, 128 + 2, 128 + -2, 128 + 4,   128 + 0, 128 + 2, 128 + -2, 128 + 4 };
+
+	vint table = init_lookup4(s_table0); // a load
+	// Remember the input's sign bit; it is XORed back into the result at the end.
+	vint sgn = cast_vfloat_to_vint(x) & 0x80000000;
+
+	store_all(x, abs(x));
+	vfloat orig_x = x;
+
+	// x becomes the fractional part of |x| / pi, i.e. the position within one period in [0, 1).
+	vfloat q = x * fOneOverPi;
+	store_all(x, q - floor(q));
+
+	vfloat x4 = x * 4.0f;
+	vint octant = (vint)(x4);
+
+	vfloat x0 = spmd_ternaryf((octant & 1) != 0, -x4, x4);
+
+	vint k = table_lookup4_8(octant, table) & 0xFF; // a shuffle
+
+	vfloat bias = (vfloat)k + -128.0f;
+	vfloat y = x0 + bias;
+
+	vfloat z = tan82(y);
+
+	vfloat r;
+	
+	vbool octant_one_or_two = (octant == 1) || (octant == 2);
+
+	// SPMD optimization - skip costly divide if we can
+	if (spmd_any(octant_one_or_two))
+	{
+		// Clamp z away from zero (keeping its sign) before taking the reciprocal.
+		const float fDivThresh = .4371e-7f;
+		vfloat one_over_z = 1.0f / spmd_ternaryf(abs(z) > fDivThresh, z, spmd_ternaryf(z < 0.0f, -fDivThresh, fDivThresh));
+				
+		vfloat b = spmd_ternaryf(octant_one_or_two, one_over_z, z);
+		store_all(r, spmd_ternaryf((octant & 2) != 0, -b, b));
+	}
+	else
+	{
+		// Only octants 0 and 3 are present here.
+		store_all(r, spmd_ternaryf(octant == 0, z, -z));
+	}
+		
+	// Small angle approximation, to decrease the max rel error near Pi.
+	SPMD_SIF(x >= (1.0f - .0003125f*4.0f))
+	{
+		store(r, vfnma(floor(q) + 1.0f, fPi, orig_x));
+	}
+	SPMD_SENDIF
+
+	return cast_vint_to_vfloat(cast_vfloat_to_vint(r) ^ sgn);
+}
+
+// Seeds the four state words of x from seed and then discards 20 outputs of get_randu() to mix the state.
+// The state is written with store() rather than store_all(), so presumably only currently active lanes are seeded.
+inline void spmd_kernel::seed_rand(rand_context& x, vint seed)
+{ 
+	store(x.a, 0xf1ea5eed); 
+	store(x.b, seed ^ 0xd8487b1f); 
+	store(x.c, seed ^ 0xdbadef9a); 
+	store(x.d, seed); 
+	// Warm-up rounds; the generated values are intentionally ignored.
+	for (int i = 0; i < 20; ++i) 
+		(void)get_randu(x); 
+}
+
+// https://burtleburtle.net/bob/rand/smallprng.html
+// Returns 32-bit unsigned random numbers.
+// Advances the four-word state in x by one step and returns the new x.d. The statement order matters: each
+// line reads state words that the following lines overwrite.
+inline vint spmd_kernel::get_randu(rand_context& x)
+{ 
+	vint e = x.a - VINT_ROT(x.b, 27); 
+	store(x.a, x.b ^ VINT_ROT(x.c, 17)); 
+	store(x.b, x.c + x.d); 
+	store(x.c, x.d + e); 
+	store(x.d, e + x.a);	
+	return x.d; 
+}
+
+// Returns random numbers between [low, high), or low if low >= high
+// The raw 32-bit value is mapped into the range by taking the high half of its product with (high - low).
+inline vint spmd_kernel::get_randi(rand_context& x, vint low, vint high)
+{
+	vint raw_bits = get_randu(x);
+	vint span = high - low;
+	vint offset = mulhiu(raw_bits, span);
+
+	return spmd_ternaryi(low < high, low + offset, low);
+}
+
+// Returns random numbers between [low, high), or low if low >= high
+// The low 23 bits of the raw value are scaled by 2^-23 to a fraction in [0, 1), which then interpolates
+// between low and high.
+inline vfloat spmd_kernel::get_randf(rand_context& x, vfloat low, vfloat high)
+{
+	vint fraction_bits = get_randu(x) & 0x7fffff;
+	vfloat unit_fraction = (vfloat)(fraction_bits) * (1.0f / 8388608.0f);
+
+	vfloat in_range = vfma(high - low, unit_fraction, low);
+	return spmd_ternaryf(low < high, in_range, low);
+}
+
+// Builds the two 16-entry lookup tables consumed by reverse_bits().
+// tab1 maps a 4-bit value to its bit-reversed form; tab2 holds the same values shifted into the high nibble.
+CPPSPMD_FORCE_INLINE void spmd_kernel::init_reverse_bits(vint& tab1, vint& tab2)
+{
+	const uint8_t tab1_bytes[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
+	const uint8_t tab2_bytes[16] = { 0, 8 << 4, 4 << 4, 12 << 4, 2 << 4, 10 << 4, 6 << 4, 14 << 4, 1 << 4, 9 << 4, 5 << 4, 13 << 4, 3 << 4, 11 << 4, 7 << 4, 15 << 4 };
+	store_all(tab1, init_lookup4(tab1_bytes));
+	store_all(tab2, init_lookup4(tab2_bytes));
+}
+
+// Reverses the bit order of each 32-bit lane of k, using the tables produced by init_reverse_bits().
+// Bits are reversed within each byte via two nibble lookups, then byteswap() reverses the byte order.
+// NOTE(review): the 0x7F7F7F7F masks presumably keep bit 7 of every index byte clear for the byte-wise table
+// lookup - confirm against table_lookup4_8.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::reverse_bits(vint k, vint tab1, vint tab2)
+{
+	// Lookup on the unshifted bytes, with results placed in the high nibble (tab2).
+	vint r0 = table_lookup4_8(k & 0x7F7F7F7F, tab2);
+	// Lookup on the bytes shifted right by 4, with results placed in the low nibble (tab1).
+	vint r1 = table_lookup4_8(VUINT_SHIFT_RIGHT(k, 4) & 0x7F7F7F7F, tab1);
+	vint r3 = r0 | r1;
+	return byteswap(r3);
+}
+
+// Counts leading zero bits per 32-bit lane: three binary-search steps (16, 8, 4 bits) narrow the position down
+// to the top nibble, and a 16-entry table supplies the count for that last nibble.
+// NOTE(review): for x == 0 the table index is 0 and s_tab[0] is 0, so this looks like it yields 28 rather than
+// 32 - confirm callers never pass 0.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros(vint x)
+{
+	// Leading-zero count of a 4-bit value, indexed by that value.
+	CPPSPMD_DECL(const uint8_t, s_tab[16]) = { 0, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+	vint tab = init_lookup4(s_tab);
+
+	//x <= 0x0000ffff
+	vbool c0 = (x & 0xFFFF0000) == 0;
+	vint n0 = spmd_ternaryi(c0, 16, 0);
+	vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);
+
+	//x <= 0x00ffffff
+	vbool c1 = (x0 & 0xFF000000) == 0;
+	vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
+	vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);
+
+	//x <= 0x0fffffff
+	vbool c2 = (x1 & 0xF0000000) == 0;
+	vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
+	vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);
+
+	// The top nibble of x2 selects the final 0..3 correction from the table.
+	return table_lookup4_8(VUINT_SHIFT_RIGHT(x2, 28), tab) + n2;
+}
+
+// Table-free variant of count_leading_zeros(): five binary-search steps (16, 8, 4, 2, 1 bits).
+// Each step tests whether the top bits are all zero; if so it adds the step size to the count and shifts the
+// value left by that amount.
+// NOTE(review): for x == 0 every step fires, giving 16+8+4+2+1 = 31 rather than 32 - confirm callers never pass 0.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_leading_zeros_alt(vint x)
+{
+	//x <= 0x0000ffff
+	vbool c0 = (x & 0xFFFF0000) == 0;
+	vint n0 = spmd_ternaryi(c0, 16, 0);
+	vint x0 = spmd_ternaryi(c0, VINT_SHIFT_LEFT(x, 16), x);
+
+	//x <= 0x00ffffff
+	vbool c1 = (x0 & 0xFF000000) == 0;
+	vint n1 = spmd_ternaryi(c1, n0 + 8, n0);
+	vint x1 = spmd_ternaryi(c1, VINT_SHIFT_LEFT(x0, 8), x0);
+
+	//x <= 0x0fffffff
+	vbool c2 = (x1 & 0xF0000000) == 0;
+	vint n2 = spmd_ternaryi(c2, n1 + 4, n1);
+	vint x2 = spmd_ternaryi(c2, VINT_SHIFT_LEFT(x1, 4), x1);
+
+	// x <= 0x3fffffff
+	vbool c3 = (x2 & 0xC0000000) == 0;
+	vint n3 = spmd_ternaryi(c3, n2 + 2, n2);
+	vint x3 = spmd_ternaryi(c3, VINT_SHIFT_LEFT(x2, 2), x2);
+
+	// x <= 0x7fffffff
+	vbool c4 = (x3 & 0x80000000) == 0;
+	return spmd_ternaryi(c4, n3 + 1, n3);
+}
+
+// Returns, per lane, the number of trailing zero bits in x, using the exponent of the float conversion of the
+// lowest set bit.
+// NOTE(review): x == 0 has no set bit; the conversion yields 0.0f and the result is -127 (unchanged from the
+// previous behavior). Callers presumably never pass 0 - confirm.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_trailing_zeros(vint x)
+{
+	// Isolate the least significant set bit of x and convert it to a float.
+	vfloat f = (vfloat)(x & -x);
+
+	// Extract the 8-bit biased exponent and remove the bias.
+	// The & 0xFF drops the float's sign bit, which would otherwise land in bit 8 after the shift: when the
+	// lowest set bit is bit 31 the isolated value is negative as a signed integer, and without the mask the
+	// result would be 287 instead of 31.
+	return (VUINT_SHIFT_RIGHT(cast_vfloat_to_vint(f), 23) & 0xFF) - 0x7F;
+}
+
+// Population count per 32-bit lane, using the usual parallel bit-summing sequence:
+// 2-bit sums, then 4-bit sums, then per-byte sums, and finally a multiply that accumulates all four byte sums
+// into the top byte.
+CPPSPMD_FORCE_INLINE vint spmd_kernel::count_set_bits(vint x)
+{
+	vint pair_sums = x - (VUINT_SHIFT_RIGHT(x, 1) & 0x55555555);
+	vint nibble_sums = (pair_sums & 0x33333333) + (VUINT_SHIFT_RIGHT(pair_sums, 2) & 0x33333333);
+
+	// '+' binds tighter than '&', so the mask applies to the sum; the parentheses just make that explicit.
+	vint byte_sums = (nibble_sums + VUINT_SHIFT_RIGHT(nibble_sums, 4)) & 0xF0F0F0F;
+
+	return VUINT_SHIFT_RIGHT((byte_sums * 0x1010101), 24);
+}
+
+// Unsigned 16-bit "a <= b": the saturating unsigned difference a - b is zero exactly when a <= b.
+CPPSPMD_FORCE_INLINE vint cmple_epu16(const vint &a, const vint &b)
+{
+	const vint saturated_diff = subs_epu16(a, b);
+	return cmpeq_epi16(saturated_diff, vint(0));
+}
+
+// Unsigned 16-bit "a >= b", i.e. "b <= a": the saturating unsigned difference b - a is zero.
+CPPSPMD_FORCE_INLINE vint cmpge_epu16(const vint &a, const vint &b)
+{
+	return cmpeq_epi16(subs_epu16(b, a), vint(0));
+}
+
+// Unsigned 16-bit "a > b": combines "b <= a" with the equality mask via andnot (presumably "b <= a and not a == b").
+CPPSPMD_FORCE_INLINE vint cmpgt_epu16(const vint &a, const vint &b)
+{
+	const vint is_equal = cmpeq_epi16(a, b);
+	const vint is_greater_or_equal = cmple_epu16(b, a);
+	return andnot(is_equal, is_greater_or_equal);
+}
+
+// Unsigned 16-bit "a < b", i.e. "b > a": combines "a <= b" with the equality mask via andnot.
+CPPSPMD_FORCE_INLINE vint cmplt_epu16(const vint &a, const vint &b)
+{
+	const vint is_equal = cmpeq_epi16(b, a);
+	const vint is_less_or_equal = cmple_epu16(a, b);
+	return andnot(is_equal, is_less_or_equal);
+}
+
+// Signed 16-bit "a >= b": the union of the equality mask and the greater-than mask.
+CPPSPMD_FORCE_INLINE vint cmpge_epi16(const vint &a, const vint &b)
+{
+	const vint is_equal = cmpeq_epi16(a, b);
+	const vint is_greater = cmpgt_epi16(a, b);
+	return is_equal | is_greater;
+}
+
+// Signed 16-bit "a <= b", i.e. "b >= a": the union of the equality mask and the "b > a" mask.
+CPPSPMD_FORCE_INLINE vint cmple_epi16(const vint &a, const vint &b)
+{
+	return cmpeq_epi16(b, a) | cmpgt_epi16(b, a);
+}
+
+// Debug helper: prints every lane of v as a signed decimal integer, space separated, then a newline.
+void spmd_kernel::print_vint(vint v)
+{
+	for (uint32_t lane = 0; lane < PROGRAM_COUNT; ++lane)
+	{
+		printf("%i ", extract(v, lane));
+	}
+
+	printf("\n");
+}
+
+// Debug helper: prints every lane of v as 1 (true) or 0 (false), space separated, then a newline.
+void spmd_kernel::print_vbool(vbool v)
+{
+	for (uint32_t lane = 0; lane < PROGRAM_COUNT; ++lane)
+	{
+		printf("%i ", extract(v, lane) ? 1 : 0);
+	}
+
+	printf("\n");
+}
+	
+// Debug helper: prints every lane of v as an upper-case hexadecimal value prefixed with 0x, then a newline.
+void spmd_kernel::print_vint_hex(vint v) 
+{ 
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+	{
+		// %X requires an unsigned int argument; cast explicitly so the format and argument types match.
+		printf("0x%X ", (unsigned int)extract(v, i)); 
+	}
+	printf("\n"); 
+}
+
+// Debug helper: prints the optional prefix followed by the indices of the lanes whose flag was set, then a newline.
+// The flags array is zeroed and then written with storeu_linear(); presumably that store is masked by m_exec so
+// only active lanes receive the 1 - confirm against storeu_linear.
+void spmd_kernel::print_active_lanes(const char *pPrefix) 
+{ 
+	CPPSPMD_DECL(int, flags[PROGRAM_COUNT]);
+	memset(flags, 0, sizeof(flags));
+	storeu_linear(flags, vint(1));
+
+	// pPrefix may be null, in which case nothing is printed before the lane list.
+	if (pPrefix)
+		printf("%s", pPrefix);
+
+	for (uint32_t i = 0; i < PROGRAM_COUNT; i++) 
+	{
+		if (flags[i])
+			printf("%u ", i);
+	}
+	printf("\n");
+}
+	
+// Debug helper: prints every lane of v with the %f format, space separated, then a newline.
+void spmd_kernel::print_vfloat(vfloat v)
+{
+	for (uint32_t lane = 0; lane < PROGRAM_COUNT; ++lane)
+	{
+		printf("%f ", extract(v, lane));
+	}
+
+	printf("\n");
+}
diff --git a/thirdparty/basis_universal/encoder/cppspmd_math_declares.h b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
new file mode 100644
index 0000000000..cdb6447b62
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/cppspmd_math_declares.h
@@ -0,0 +1,89 @@
+// Do not include this header directly.
+// This header declares (and, for a few one-line wrappers, defines) helper members of struct spmd_kernel.
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// See cppspmd_math.h for detailed error statistics.
+
+// Helpers whose definitions are not in this header.
+// NOTE(review): presumably internal building blocks of the estimators below - confirm in cppspmd_math.h.
+CPPSPMD_FORCE_INLINE void reduce_expb(vfloat& arg, vfloat& two_int_a, vint& adjustment);
+CPPSPMD_FORCE_INLINE vfloat tan56(vfloat x);
+CPPSPMD_FORCE_INLINE vfloat tan82(vfloat x);
+
+// Logarithm, exponential and power estimators. The "_est" suffix suggests approximations;
+// the accuracy is not visible from these declarations.
+inline vfloat log2_est(vfloat v);
+
+inline vfloat log_est(vfloat v);
+
+inline vfloat exp2_est(vfloat arg);
+
+inline vfloat exp_est(vfloat arg);
+
+inline vfloat pow_est(vfloat arg1, vfloat arg2);
+
+// Reciprocal estimators.
+CPPSPMD_FORCE_INLINE vfloat recip_est1(const vfloat& q);
+CPPSPMD_FORCE_INLINE vfloat recip_est1_pn(const vfloat& q);
+
+inline vfloat mod_angles(vfloat a);
+
+// sincos_est_a() evaluates sine when sin_flag is true and cosine when it is false,
+// as the two wrappers below show.
+inline vfloat sincos_est_a(vfloat a, bool sin_flag);
+CPPSPMD_FORCE_INLINE vfloat sin_est_a(vfloat a) { return sincos_est_a(a, true); }
+CPPSPMD_FORCE_INLINE vfloat cos_est_a(vfloat a) { return sincos_est_a(a, false); }
+
+inline vfloat sin_est(vfloat a);
+
+inline vfloat cos_est(vfloat a);
+
+// Don't call with values <= 0.
+CPPSPMD_FORCE_INLINE vfloat rsqrt_est1(vfloat x0);
+
+// Don't call with values <= 0.
+CPPSPMD_FORCE_INLINE vfloat rsqrt_est2(vfloat x0);
+
+CPPSPMD_FORCE_INLINE vfloat atan2_est(vfloat y, vfloat x);
+
+// Single-argument arctangent, implemented as atan2_est(x, 1.0f).
+CPPSPMD_FORCE_INLINE vfloat atan_est(vfloat x) { return atan2_est(x, vfloat(1.0f)); }
+
+// Don't call this for angles close to 90/270!
+inline vfloat tan_est(vfloat x);
+
+// Per-lane random number generator state (four vint words).
+// https://burtleburtle.net/bob/rand/smallprng.html
+struct rand_context { vint a, b, c, d; };
+
+inline void seed_rand(rand_context& x, vint seed);
+
+// Returns 32-bit unsigned random numbers.
+inline vint get_randu(rand_context& x);
+
+// Returns random numbers in the range [low, high), or low if low >= high.
+inline vint get_randi(rand_context& x, vint low, vint high);
+
+// Returns random numbers in the range [low, high), or low if low >= high.
+inline vfloat get_randf(rand_context& x, vfloat low, vfloat high);
+
+// Bit manipulation helpers.
+CPPSPMD_FORCE_INLINE void init_reverse_bits(vint& tab1, vint& tab2);
+CPPSPMD_FORCE_INLINE vint reverse_bits(vint k, vint tab1, vint tab2);
+
+CPPSPMD_FORCE_INLINE vint count_leading_zeros(vint x);
+CPPSPMD_FORCE_INLINE vint count_leading_zeros_alt(vint x);
+
+CPPSPMD_FORCE_INLINE vint count_trailing_zeros(vint x);
+
+CPPSPMD_FORCE_INLINE vint count_set_bits(vint x);
+
+// Debug printing helpers; each writes to stdout with printf().
+void print_vint(vint v);
+void print_vbool(vbool v);
+void print_vint_hex(vint v);
+void print_active_lanes(const char *pPrefix);
+void print_vfloat(vfloat v);
+
diff --git a/thirdparty/basis_universal/encoder/cppspmd_sse.h b/thirdparty/basis_universal/encoder/cppspmd_sse.h
new file mode 100644
index 0000000000..b39cb82a5f
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/cppspmd_sse.h
@@ -0,0 +1,2118 @@
+// cppspmd_sse.h
+// Note for Basis Universal: All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. If BASISU_SUPPORT_SSE is 0, it will never be included and does not impact compilation.
+// SSE 2 or 4.1
+// Originally written by Nicolas Guillemot, Jefferson Amstutz in the "CppSPMD" project.
+// 4/20: Richard Geldreich: Macro control flow, more SIMD instruction sets, optimizations, supports using multiple SIMD instruction sets in same executable. Still a work in progress!
+//
+// Originally Copyright 2016 Nicolas Guillemot
+// Changed from the MIT license to Apache 2.0 with permission from the author.
+//
+// Modifications/enhancements Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <assert.h>
+#include <math.h>
+#include <utility>
+#include <algorithm>
+
+#if CPPSPMD_SSE2
+#include <xmmintrin.h>		// SSE
+#include <emmintrin.h>		// SSE2
+#else
+#include <xmmintrin.h>		// SSE
+#include <emmintrin.h>		// SSE2
+#include <pmmintrin.h>		// SSE3
+#include <tmmintrin.h>		// SSSE3
+#include <smmintrin.h>		// SSE4.1
+//#include <nmmintrin.h>		// SSE4.2
+#endif
+
+// Clear any earlier backend selection, then select the SSE backend (all other backends off).
+// NOTE(review): the #undefs presumably allow this header to follow another cppspmd backend header
+// in the same translation unit - confirm.
+#undef CPPSPMD_SSE
+#undef CPPSPMD_AVX1
+#undef CPPSPMD_AVX2
+#undef CPPSPMD_AVX
+#undef CPPSPMD_FLOAT4
+#undef CPPSPMD_INT16
+
+#define CPPSPMD_SSE 1
+#define CPPSPMD_AVX 0
+#define CPPSPMD_AVX1 0
+#define CPPSPMD_AVX2 0
+#define CPPSPMD_FLOAT4 0
+#define CPPSPMD_INT16 0
+
+// Alignment helpers. CPPSPMD_DECL(type, name) declares an aligned variable (16 bytes under MSVC,
+// 32 bytes elsewhere); CPPSPMD_ALIGN(v) is a bare alignment specifier with a caller-chosen value.
+// Both are only defined if the including code has not already provided them.
+#ifdef _MSC_VER
+	#ifndef CPPSPMD_DECL
+	#define CPPSPMD_DECL(type, name) __declspec(align(16)) type name
+	#endif
+
+	#ifndef CPPSPMD_ALIGN
+	#define CPPSPMD_ALIGN(v) __declspec(align(v))
+	#endif
+
+	// Under MSVC the "undefined" intrinsics are mapped to their zero-initializing counterparts.
+	#define _mm_undefined_si128 _mm_setzero_si128
+	#define _mm_undefined_ps _mm_setzero_ps
+#else
+	#ifndef CPPSPMD_DECL
+	#define CPPSPMD_DECL(type, name) type name __attribute__((aligned(32)))
+	#endif
+
+	#ifndef CPPSPMD_ALIGN
+	#define CPPSPMD_ALIGN(v) __attribute__((aligned(v)))
+	#endif
+#endif
+
+// CPPSPMD_FORCE_INLINE expands to __forceinline only for non-_DEBUG MSVC builds;
+// every other configuration gets plain "inline". An existing definition is respected.
+#ifndef CPPSPMD_FORCE_INLINE
+#ifdef _DEBUG
+#define CPPSPMD_FORCE_INLINE inline
+#else
+	#ifdef _MSC_VER
+		#define CPPSPMD_FORCE_INLINE __forceinline
+	#else
+		#define CPPSPMD_FORCE_INLINE inline
+	#endif
+#endif
+#endif
+
+// Backend naming: CPPSPMD is the namespace name and CPPSPMD_ARCH the identifier suffix for the
+// selected instruction set (SSE2 when CPPSPMD_SSE2 is non-zero, otherwise SSE4.1).
+#undef CPPSPMD
+#undef CPPSPMD_ARCH
+
+#if CPPSPMD_SSE2
+	#define CPPSPMD_SSE41 0
+	#define CPPSPMD cppspmd_sse2
+	#define CPPSPMD_ARCH _sse2
+#else
+	#define CPPSPMD_SSE41 1
+	#define CPPSPMD cppspmd_sse41
+	#define CPPSPMD_ARCH _sse41
+#endif
+
+// Two-level token pasting so that macro arguments are expanded before being glued together.
+#ifndef CPPSPMD_GLUER
+	#define CPPSPMD_GLUER(a, b) a##b
+#endif
+
+#ifndef CPPSPMD_GLUER2
+	#define CPPSPMD_GLUER2(a, b) CPPSPMD_GLUER(a, b)
+#endif
+
+// CPPSPMD_NAME(a) appends the architecture suffix to a, e.g. a_sse2 or a_sse41.
+#ifndef CPPSPMD_NAME
+#define CPPSPMD_NAME(a) CPPSPMD_GLUER2(a, CPPSPMD_ARCH)
+#endif
+
+// VCOND(cond) is true when cond holds on every lane that is active in m_exec; VASSERT asserts it.
+// Both expand to code that references m_exec, so they are only usable where m_exec is in scope.
+#undef VASSERT
+#define VCOND(cond) ((exec_mask(vbool(cond)) & m_exec).get_movemask() == m_exec.get_movemask())
+#define VASSERT(cond) assert( VCOND(cond) )
+
+#define CPPSPMD_ALIGNMENT (16)
+
+// Stores the low 32 bits of the vector a to *p through an int pointer.
+#define storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))
+
+namespace CPPSPMD
+{
+
+// Number of SPMD program instances (lanes) per vector: 1 << 2 = 4.
+const int PROGRAM_COUNT_SHIFT = 2;
+const int PROGRAM_COUNT = 1 << PROGRAM_COUNT_SHIFT;
+
+// Allocates 64-byte aligned storage with _mm_malloc() and default-initializes an N in it.
+// Returns nullptr when the allocation fails; constructing into a null pointer with placement new
+// would be undefined behavior. Release the object with aligned_delete().
+template <typename N> inline N* aligned_new()
+{
+	void* p = _mm_malloc(sizeof(N), 64);
+	if (!p)
+		return nullptr;
+	new (p) N;
+	return static_cast<N*>(p);
+}
+// Destroys *p and releases its storage with _mm_free(); a null pointer is ignored.
+template <typename N> void aligned_delete(N* p)
+{
+	if (!p)
+		return;
+
+	p->~N();
+	_mm_free(p);
+}
+
+// Aligned constant tables, each one 128-bit vector wide (four 32-bit elements).
+CPPSPMD_DECL(const uint32_t, g_allones_128[4]) = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX };
+CPPSPMD_DECL(const uint32_t, g_x_128[4]) = { UINT32_MAX, 0, 0, 0 };
+CPPSPMD_DECL(const float, g_onef_128[4]) = { 1.0f, 1.0f, 1.0f, 1.0f };
+CPPSPMD_DECL(const uint32_t, g_oneu_128[4]) = { 1, 1, 1, 1 };
+
+// Row i has all bits set in element i only; used by exec_mask::enable_lane().
+CPPSPMD_DECL(const uint32_t, g_lane_masks_128[4][4]) = 
+{ 
+	{ UINT32_MAX, 0, 0, 0 },
+	{ 0, UINT32_MAX, 0, 0 },
+	{ 0, 0, UINT32_MAX, 0 },
+	{ 0, 0, 0, UINT32_MAX },
+};
+
+#if CPPSPMD_SSE41
+// 32-bit integer blend built on _mm_blendv_ps(): the operands are reinterpreted as floats,
+// blended, and reinterpreted back.
+CPPSPMD_FORCE_INLINE __m128i _mm_blendv_epi32(__m128i a, __m128i b, __m128i c)
+{
+	const __m128 fa = _mm_castsi128_ps(a);
+	const __m128 fb = _mm_castsi128_ps(b);
+	const __m128 fc = _mm_castsi128_ps(c);
+	return _mm_castps_si128(_mm_blendv_ps(fa, fb, fc));
+}
+#endif
+
+// Per-byte blend: selects the byte from b where the top bit of the corresponding mask byte is set,
+// otherwise the byte from a.
+CPPSPMD_FORCE_INLINE __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask)
+{
+#if CPPSPMD_SSE2
+	// _mm_blendv_epi8() only looks at bit 7 of each mask byte, so replicate that bit across the
+	// whole byte before the and/andnot/or emulation. A full 0x00/0xFF mask is left unchanged.
+	mask = _mm_cmplt_epi8(mask, _mm_setzero_si128());
+	return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
+#else
+	return _mm_blendv_epi8(a, b, mask);
+#endif
+}
+
+// Blend for callers that pass a full per-lane mask: b where mask is set, a elsewhere.
+CPPSPMD_FORCE_INLINE __m128 blendv_mask_ps(__m128 a, __m128 b, __m128 mask)
+{
+#if CPPSPMD_SSE2
+	// The mask is known to be a real mask, so a plain bitwise select is enough.
+	const __m128 from_b = _mm_and_ps(mask, b);
+	const __m128 from_a = _mm_andnot_ps(mask, a);
+	return _mm_or_ps(from_b, from_a);
+#else
+	return _mm_blendv_ps(a, b, mask);
+#endif
+}
+
+// Blend controlled by bit 31 of each mask lane: b where the bit is set, a elsewhere.
+CPPSPMD_FORCE_INLINE __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
+{
+#if CPPSPMD_SSE2
+	// Only the sign bits are meaningful, so smear bit 31 over each lane to emulate _mm_blendv_ps().
+	const __m128i sign_fill = _mm_srai_epi32(_mm_castps_si128(mask), 31);
+	const __m128 lane_mask = _mm_castsi128_ps(sign_fill);
+	return _mm_or_ps(_mm_and_ps(lane_mask, b), _mm_andnot_ps(lane_mask, a));
+#else
+	return _mm_blendv_ps(a, b, mask);
+#endif
+}
+
+// Integer form of blendv_mask_ps(): the operands are reinterpreted, not converted.
+CPPSPMD_FORCE_INLINE __m128i blendv_mask_epi32(__m128i a, __m128i b, __m128i mask)
+{
+	const __m128 blended = blendv_mask_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask));
+	return _mm_castps_si128(blended);
+}
+
+// Integer form of blendv_ps(): the operands are reinterpreted, not converted.
+CPPSPMD_FORCE_INLINE __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask)
+{
+	const __m128 blended = blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask));
+	return _mm_castps_si128(blended);
+}
+
+// Lane extraction helpers. x/y/z/w are lanes 0..3. None of them reinterpret memory through an
+// incompatible pointer type; the bit-level conversions go through cast/shuffle intrinsics instead.
+#if CPPSPMD_SSE2
+CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_cvtsi128_si32(vec); }
+CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0x55)); }
+CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xAA)); }
+CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xFF)); }
+
+// Returns float bits as int, to emulate _mm_extract_ps()
+CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return extract_x(_mm_castps_si128(vec)); }
+CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { return extract_y(_mm_castps_si128(vec)); }
+CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return extract_z(_mm_castps_si128(vec)); }
+CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return extract_w(_mm_castps_si128(vec)); }
+
+// Returns floats
+CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { return _mm_cvtss_f32(vec); }
+CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); }
+CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); }
+CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); }
+#else
+CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_extract_epi32(vec, 0); }
+CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_extract_epi32(vec, 1); }
+CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_extract_epi32(vec, 2); }
+CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_extract_epi32(vec, 3); }
+
+// Returns float bits as int
+CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return _mm_extract_ps(vec, 0); }
+CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { return _mm_extract_ps(vec, 1); }
+CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return _mm_extract_ps(vec, 2); }
+CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return _mm_extract_ps(vec, 3); }
+
+// Returns floats
+CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { return _mm_cvtss_f32(vec); }
+CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); }
+CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); }
+CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); }
+#endif
+
+// Lane insertion helpers: return vec with lane x/y/z/w (0..3) replaced by v.
+#if CPPSPMD_SSE2
+// SSE2 only has a 16-bit insert, so each 32-bit lane is written as a low half and a high half.
+CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v)
+{
+	const __m128i with_lo = _mm_insert_epi16(vec, v, 0);
+	return _mm_insert_epi16(with_lo, (uint32_t)v >> 16U, 1);
+}
+CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v)
+{
+	const __m128i with_lo = _mm_insert_epi16(vec, v, 2);
+	return _mm_insert_epi16(with_lo, (uint32_t)v >> 16U, 3);
+}
+CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v)
+{
+	const __m128i with_lo = _mm_insert_epi16(vec, v, 4);
+	return _mm_insert_epi16(with_lo, (uint32_t)v >> 16U, 5);
+}
+CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v)
+{
+	const __m128i with_lo = _mm_insert_epi16(vec, v, 6);
+	return _mm_insert_epi16(with_lo, (uint32_t)v >> 16U, 7);
+}
+#else
+CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 0); }
+CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 1); }
+CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 2); }
+CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 3); }
+#endif
+
+#if CPPSPMD_SSE2
+// Scalar emulation of _mm_shuffle_epi8(): result byte i is a[b[i] & 15], or zero when the top bit
+// of b[i] is set. Slow, but SSE2 has no byte shuffle instruction.
+inline __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
+{
+	CPPSPMD_ALIGN(16) uint8_t src_bytes[16];
+	CPPSPMD_ALIGN(16) uint8_t selectors[16];
+	CPPSPMD_ALIGN(16) uint8_t picked[16];
+
+	_mm_store_si128((__m128i*)src_bytes, a);
+
+	// Only the low four bits of each selector address a source byte.
+	_mm_store_si128((__m128i*)selectors, _mm_and_si128(b, _mm_set1_epi8(0x0F)));
+
+	for (int i = 0; i < 16; i++)
+		picked[i] = src_bytes[selectors[i]];
+
+	// Bytes whose selector has the top bit set are forced to zero.
+	const __m128i zeroed_lanes = _mm_cmplt_epi8(b, _mm_setzero_si128());
+	return _mm_andnot_si128(zeroed_lanes, _mm_load_si128((__m128i*)picked));
+}
+#else
+// SSSE3 and later provide the byte shuffle directly.
+CPPSPMD_FORCE_INLINE __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
+{
+	const __m128i shuffled = _mm_shuffle_epi8(a, b);
+	return shuffled;
+}
+#endif
+
+// Per-lane 32-bit minimum/maximum, signed (epi32) and unsigned (epu32).
+#if CPPSPMD_SSE2
+// SSE2 has no 32-bit min/max, so compare and select.
+CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
+{
+	const __m128i a_is_less = _mm_cmplt_epi32(a, b);
+	return blendv_mask_epi32(b, a, a_is_less);
+}
+CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
+{
+	const __m128i a_is_greater = _mm_cmpgt_epi32(a, b);
+	return blendv_mask_epi32(b, a, a_is_greater);
+}
+// The unsigned variants add 0x80000000 to both operands so that the signed compare orders them
+// as unsigned values.
+CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
+{
+	const __m128i bias = _mm_set1_epi32(0x80000000);
+	const __m128i a_is_less = _mm_cmplt_epi32(_mm_add_epi32(a, bias), _mm_add_epi32(b, bias));
+	return blendv_mask_epi32(b, a, a_is_less);
+}
+CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
+{
+	const __m128i bias = _mm_set1_epi32(0x80000000);
+	const __m128i a_is_greater = _mm_cmpgt_epi32(_mm_add_epi32(a, bias), _mm_add_epi32(b, bias));
+	return blendv_mask_epi32(b, a, a_is_greater);
+}
+#else
+// SSE4.1 provides all four operations natively.
+CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b) { return _mm_min_epi32(a, b); }
+CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b) { return _mm_max_epi32(a, b); }
+CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b) { return _mm_min_epu32(a, b); }
+CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b) { return _mm_max_epu32(a, b); }
+#endif
+
+// Per-lane 32-bit absolute value.
+#if CPPSPMD_SSE2
+CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
+{
+	// sign is all ones for negative lanes and zero otherwise; (a ^ sign) - sign negates exactly
+	// the negative lanes.
+	const __m128i sign = _mm_srai_epi32(a, 31);
+	const __m128i flipped = _mm_xor_si128(a, sign);
+	return _mm_sub_epi32(flipped, sign);
+}
+#else
+CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
+{
+	const __m128i magnitude = _mm_abs_epi32(a);
+	return magnitude;
+}
+#endif
+
+// Per-lane 32-bit multiply that keeps the low 32 bits of each product.
+#if CPPSPMD_SSE2
+CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
+{
+	// _mm_mul_epu32() multiplies lanes 0 and 2 into 64-bit products; shifting both operands
+	// down by one lane gives the products of lanes 1 and 3.
+	const __m128i even = _mm_mul_epu32(a, b);
+	const __m128i odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
+
+	// Collect the low halves of the products and interleave them back into lane order.
+	const __m128i even_lo = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));
+	const __m128i odd_lo = _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 2, 0));
+	return _mm_unpacklo_epi32(even_lo, odd_lo);
+}
+#else
+CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
+{
+	const __m128i product = _mm_mullo_epi32(a, b);
+	return product;
+}
+#endif
+
+// Per-lane unsigned 32-bit multiply that keeps the high 32 bits of each 64-bit product.
+CPPSPMD_FORCE_INLINE __m128i mulhi_epu32(__m128i a, __m128i b)
+{
+	// 64-bit products of lanes 0/2, and of lanes 1/3 after shifting both operands down one lane.
+	const __m128i even = _mm_mul_epu32(a, b);
+	const __m128i odd = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
+
+	// Collect the high halves of the products and interleave them back into lane order.
+	const __m128i even_hi = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 3, 1));
+	const __m128i odd_hi = _mm_shuffle_epi32(odd, _MM_SHUFFLE(0, 0, 3, 1));
+	return _mm_unpacklo_epi32(even_hi, odd_hi);
+}
+
+// Reads four bytes at p and zero-extends each one into its own 32-bit lane.
+#if CPPSPMD_SSE2
+inline __m128i load_rgba32(const void* p)
+{
+	const __m128i zero = _mm_setzero_si128();
+	const __m128i packed = _mm_cvtsi32_si128(*(const int*)p);
+	// Widen 8 -> 16 -> 32 bits by interleaving with zero bytes/words.
+	const __m128i words = _mm_unpacklo_epi8(packed, zero);
+	return _mm_unpacklo_epi16(words, zero);
+}
+#else
+inline __m128i load_rgba32(const void* p)
+{
+	const __m128 raw = _mm_load_ss((const float*)p);
+	return _mm_cvtepu8_epi32(_mm_castps_si128(raw));
+}
+#endif
+
+// Transposes the 4x4 matrix of 32-bit elements whose rows are r0..r3: x, y, z and w receive
+// columns 0, 1, 2 and 3. All inputs are consumed before any output is written.
+inline void transpose4x4(__m128i& x, __m128i& y, __m128i& z, __m128i& w, const __m128i& r0, const __m128i& r1, const __m128i& r2, const __m128i& r3)
+{
+	// Interleave 32-bit elements of the row pairs (0,1) and (2,3).
+	const __m128i lo01 = _mm_unpacklo_epi32(r0, r1);
+	const __m128i lo23 = _mm_unpacklo_epi32(r2, r3);
+	const __m128i hi01 = _mm_unpackhi_epi32(r0, r1);
+	const __m128i hi23 = _mm_unpackhi_epi32(r2, r3);
+
+	// Join the 64-bit halves to form the columns.
+	x = _mm_unpacklo_epi64(lo01, lo23);
+	y = _mm_unpackhi_epi64(lo01, lo23);
+	z = _mm_unpacklo_epi64(hi01, hi23);
+	w = _mm_unpackhi_epi64(hi01, hi23);
+}
+
+// Value of _mm_movemask_ps() when all four lanes are active (one bit per lane).
+const uint32_t ALL_ON_MOVEMASK = 0xF;
+
+struct spmd_kernel
+{
+	// Forward declarations of the nested value types, which refer to each other.
+	struct vint;
+	struct lint;
+	struct vbool;
+	struct vfloat;
+
+	// Aliases for the scalar, varying and linear integer types.
+	typedef int int_t;
+	typedef vint vint_t;
+	typedef lint lint_t;
+		
+	// Execution mask: one 32-bit element per lane, held in an SSE register.
+	struct exec_mask
+	{
+		__m128i m_mask;
+
+		exec_mask() = default;
+
+		CPPSPMD_FORCE_INLINE explicit exec_mask(const vbool& b);
+		CPPSPMD_FORCE_INLINE explicit exec_mask(const __m128i& mask) : m_mask(mask) { }
+
+		// Replaces the whole mask with the single-lane pattern for "lane" (row "lane" of g_lane_masks_128).
+		CPPSPMD_FORCE_INLINE void enable_lane(uint32_t lane)
+		{
+			const uint32_t* pLaneMask = &g_lane_masks_128[lane][0];
+			m_mask = _mm_load_si128((const __m128i *)pLaneMask);
+		}
+
+		// Mask with every bit set.
+		static CPPSPMD_FORCE_INLINE exec_mask all_on()
+		{
+			const __m128i ones = _mm_load_si128((const __m128i*)g_allones_128);
+			return exec_mask{ ones };
+		}
+
+		// Mask with every bit clear.
+		static CPPSPMD_FORCE_INLINE exec_mask all_off()
+		{
+			const __m128i zeros = _mm_setzero_si128();
+			return exec_mask{ zeros };
+		}
+
+		// Packs the top bit of each lane into the low four bits of the result.
+		CPPSPMD_FORCE_INLINE uint32_t get_movemask() const
+		{
+			const __m128 as_ps = _mm_castsi128_ps(m_mask);
+			return _mm_movemask_ps(as_ps);
+		}
+	};
+
+	friend CPPSPMD_FORCE_INLINE bool all(const exec_mask& e);
+	friend CPPSPMD_FORCE_INLINE bool any(const exec_mask& e);
+
+	// Queries on the current execution mask (m_exec). They only read the mask, so all of them are
+	// const and can be called on a const kernel.
+	CPPSPMD_FORCE_INLINE bool spmd_all() const { return all(m_exec); }
+	CPPSPMD_FORCE_INLINE bool spmd_any() const { return any(m_exec); }
+	CPPSPMD_FORCE_INLINE bool spmd_none() const { return !any(m_exec); }
+
+	// true if cond is true for all active lanes - false if no active lanes
+	CPPSPMD_FORCE_INLINE bool spmd_all(const vbool& e) const { uint32_t m = m_exec.get_movemask(); return (m != 0) && ((exec_mask(e) & m_exec).get_movemask() == m); }
+	// true if cond is true for any active lanes
+	CPPSPMD_FORCE_INLINE bool spmd_any(const vbool& e) const { return (exec_mask(e) & m_exec).get_movemask() != 0; }
+	// true if cond is false for every active lane (also true when no lane is active)
+	CPPSPMD_FORCE_INLINE bool spmd_none(const vbool& e) const { return !spmd_any(e); }
+
+	// Bitwise operators on execution masks (defined outside this struct).
+	friend CPPSPMD_FORCE_INLINE exec_mask operator^ (const exec_mask& a, const exec_mask& b);
+	friend CPPSPMD_FORCE_INLINE exec_mask operator& (const exec_mask& a, const exec_mask& b);
+	friend CPPSPMD_FORCE_INLINE exec_mask operator| (const exec_mask& a, const exec_mask& b);
+		
+	// m_exec is the mask consulted by the masked load/store members below.
+	// NOTE(review): m_kernel_exec and m_continue_mask are presumably the kernel-wide mask and the
+	// loop "continue" mask - confirm against init() and the control-flow macros.
+	exec_mask m_exec;
+	exec_mask m_kernel_exec;
+	exec_mask m_continue_mask;
+#ifdef _DEBUG
+	bool m_in_loop;
+#endif
+		
+	// Movemask of the current execution mask: one bit per lane.
+	CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return m_exec.get_movemask(); }
+		
+	void init(const exec_mask& kernel_exec);
+	
+	// Varying bool: one 32-bit element per lane. The bool constructor fills every lane with
+	// all ones (true) or all zeros (false).
+	struct vbool
+	{
+		__m128i m_value;
+
+		vbool() = default;
+
+		// Broadcasts a uniform bool to every lane.
+		CPPSPMD_FORCE_INLINE vbool(bool value) : m_value(value ? _mm_set1_epi32(-1) : _mm_setzero_si128()) { }
+
+		// Wraps a raw SSE register without modifying it.
+		CPPSPMD_FORCE_INLINE explicit vbool(const __m128i& value) : m_value(value) { }
+
+		CPPSPMD_FORCE_INLINE explicit operator vfloat() const;
+		CPPSPMD_FORCE_INLINE explicit operator vint() const;
+
+	private:
+		// Declared private so that plain assignment does not compile; use store()/store_all().
+		vbool& operator=(const vbool&);
+	};
+
+	friend vbool operator!(const vbool& v);
+		
+	// Masked assignment: lanes active in m_exec take src, the remaining lanes keep dst.
+	CPPSPMD_FORCE_INLINE vbool& store(vbool& dst, const vbool& src)
+	{
+		const __m128i merged = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
+		dst.m_value = merged;
+		return dst;
+	}
+		
+	// Unmasked assignment: every lane of dst takes src, regardless of m_exec.
+	CPPSPMD_FORCE_INLINE vbool& store_all(vbool& dst, const vbool& src)
+	{
+		const __m128i all_lanes = src.m_value;
+		dst.m_value = all_lanes;
+		return dst;
+	}
+	
+	// Varying float: one 32-bit float per lane.
+	struct vfloat
+	{
+		__m128 m_value;
+
+		vfloat() = default;
+
+		// Wraps a raw SSE register without modifying it.
+		CPPSPMD_FORCE_INLINE explicit vfloat(const __m128& v) : m_value(v) { }
+
+		// Broadcasts a uniform float to every lane.
+		CPPSPMD_FORCE_INLINE vfloat(float value) : m_value(_mm_set1_ps(value)) { }
+
+		// Converts a uniform int to float and broadcasts it to every lane.
+		CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps(static_cast<float>(value))) { }
+
+	private:
+		// Declared private so that plain assignment does not compile; use store()/store_all().
+		vfloat& operator=(const vfloat&);
+	};
+
+	// Masked assignment: lanes active in m_exec take src, the remaining lanes keep dst.
+	CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src)
+	{
+		const __m128 exec_ps = _mm_castsi128_ps(m_exec.m_mask);
+		dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, exec_ps);
+		return dst;
+	}
+
+	// Same as above for a temporary destination.
+	CPPSPMD_FORCE_INLINE vfloat& store(vfloat&& dst, const vfloat& src)
+	{
+		const __m128 exec_ps = _mm_castsi128_ps(m_exec.m_mask);
+		dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, exec_ps);
+		return dst;
+	}
+	
+	// Unmasked assignment: every lane of dst takes src, regardless of m_exec.
+	CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat& dst, const vfloat& src)
+	{
+		const __m128 all_lanes = src.m_value;
+		dst.m_value = all_lanes;
+		return dst;
+	}
+
+	// Same as above for a temporary destination.
+	CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat&& dst, const vfloat& src)
+	{
+		const __m128 all_lanes = src.m_value;
+		dst.m_value = all_lanes;
+		return dst;
+	}
+
+	// Linear ref to floats: the load/store members below access four consecutive floats starting
+	// at m_pValue (lane i <-> m_pValue[i]).
+	struct float_lref
+	{
+		float* m_pValue;
+
+	private:
+		// Declared private so that plain assignment does not compile; use store()/store_all().
+		float_lref& operator=(const float_lref&);
+	};
+
+	// Masked store of four consecutive floats. With a partial mask the destination is read,
+	// blended with src and written back in full, so inactive lanes are rewritten with the
+	// values they already held.
+	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src)
+	{
+		const __m128 exec_ps = _mm_castsi128_ps(m_exec.m_mask);
+		__m128 to_write = src.m_value;
+		if (_mm_movemask_ps(exec_ps) != ALL_ON_MOVEMASK)
+			to_write = blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, exec_ps);
+		_mm_storeu_ps(dst.m_pValue, to_write);
+		return dst;
+	}
+
+	// Same as above for a temporary reference object.
+	CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref&& dst, const vfloat& src)
+	{
+		const __m128 exec_ps = _mm_castsi128_ps(m_exec.m_mask);
+		__m128 to_write = src.m_value;
+		if (_mm_movemask_ps(exec_ps) != ALL_ON_MOVEMASK)
+			to_write = blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, exec_ps);
+		_mm_storeu_ps(dst.m_pValue, to_write);
+		return dst;
+	}
+	
+	// Unmasked, unaligned store of all four lanes to dst.m_pValue[0..3].
+	CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref& dst, const vfloat& src)
+	{
+		float* const pOut = dst.m_pValue;
+		_mm_storeu_ps(pOut, src.m_value);
+		return dst;
+	}
+
+	// Same as above for a temporary reference object.
+	CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref&& dst, const vfloat& src)
+	{
+		float* const pOut = dst.m_pValue;
+		_mm_storeu_ps(pOut, src.m_value);
+		return dst;
+	}
+
+	// Unaligned load of four consecutive floats; lanes inactive in m_exec are cleared to zero bits.
+	// All four floats are read from memory regardless of the mask.
+	CPPSPMD_FORCE_INLINE vfloat load(const float_lref& src)
+	{
+		const __m128 raw = _mm_loadu_ps(src.m_pValue);
+		const __m128 exec_ps = _mm_castsi128_ps(m_exec.m_mask);
+		return vfloat{ _mm_and_ps(raw, exec_ps) };
+	}
+		
+	// Varying ref to floats: lane i refers to m_pValue[m_vindex lane i] (see load(const float_vref&)).
+	struct float_vref
+	{
+		__m128i m_vindex;
+		float* m_pValue;
+		
+	private:
+		// Declared private so that plain assignment does not compile; use store()/store_all().
+		float_vref& operator=(const float_vref&);
+	};
+
+	// Varying ref to varying float: a per-lane index paired with a pointer to vfloat values.
+	struct vfloat_vref
+	{
+		__m128i m_vindex;
+		vfloat* m_pValue;
+		
+	private:
+		// Declared private so that plain assignment does not compile.
+		vfloat_vref& operator=(const vfloat_vref&);
+	};
+
+	// Varying ref to varying int: a per-lane index paired with a pointer to vint values.
+	struct vint_vref
+	{
+		__m128i m_vindex;
+		vint* m_pValue;
+		
+	private:
+		// Declared private so that plain assignment does not compile.
+		vint_vref& operator=(const vint_vref&);
+	};
+
+	// Stores through a varying float reference; declared here, defined elsewhere.
+	// NOTE(review): by analogy with the other overloads, store() presumably honours m_exec and
+	// store_all() ignores it - confirm at the definitions.
+	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src);
+	CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref&& dst, const vfloat& src);
+		
+	CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref& dst, const vfloat& src);
+	CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref&& dst, const vfloat& src);
+
+	// Masked gather: lane i reads src.m_pValue[index i] when it is active in m_exec. Inactive lanes
+	// are never dereferenced and come back as zero.
+	CPPSPMD_FORCE_INLINE vfloat load(const float_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int vindex[4];
+		_mm_store_si128((__m128i *)vindex, src.m_vindex);
+
+		// Zero-filled so that the vector load below never reads indeterminate values for the
+		// lanes the loop skips.
+		CPPSPMD_ALIGN(16) float loaded[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
+
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (mask & (1 << i))
+				loaded[i] = src.m_pValue[vindex[i]];
+		}
+		return vfloat{ _mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)loaded)) };
+	}
+
+	// Unmasked gather: every lane i reads src.m_pValue[index i], regardless of m_exec.
+	CPPSPMD_FORCE_INLINE vfloat load_all(const float_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int indices[4];
+		CPPSPMD_ALIGN(16) float gathered[4];
+
+		_mm_store_si128((__m128i *)indices, src.m_vindex);
+
+		for (int lane = 0; lane < 4; ++lane)
+		{
+			gathered[lane] = src.m_pValue[indices[lane]];
+		}
+
+		return vfloat{ _mm_load_ps(gathered) };
+	}
+
+	// Linear ref to ints: the load/store members below access four consecutive ints starting
+	// at m_pValue (lane i <-> m_pValue[i]).
+	struct int_lref
+	{
+		int* m_pValue;
+
+	private:
+		// Declared private so that plain assignment does not compile; use store().
+		int_lref& operator=(const int_lref&);
+	};
+		
+	// Masked store of four consecutive ints. A full mask uses one unaligned vector store;
+	// otherwise only the elements of active lanes are written.
+	CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		if (active == ALL_ON_MOVEMASK)
+		{
+			_mm_storeu_si128((__m128i *)dst.m_pValue, src.m_value);
+			return dst;
+		}
+
+		CPPSPMD_ALIGN(16) int lanes[4];
+		_mm_store_si128((__m128i *)lanes, src.m_value);
+
+		for (int i = 0; i < 4; i++)
+		{
+			if ((active >> i) & 1)
+				dst.m_pValue[i] = lanes[i];
+		}
+		return dst;
+	}
+
+	// Unaligned load of four consecutive ints; lanes inactive in m_exec are cleared to zero.
+	// All four ints are read from memory regardless of the mask.
+	CPPSPMD_FORCE_INLINE vint load(const int_lref& src)
+	{
+		const __m128i raw = _mm_loadu_si128((const __m128i*)src.m_pValue);
+		return vint{ _mm_and_si128(raw, m_exec.m_mask) };
+	}
+
+	// Linear ref to int16's: the load/store members below access four consecutive int16_t values
+	// starting at m_pValue (lane i <-> m_pValue[i]).
+	struct int16_lref
+	{
+		int16_t* m_pValue;
+
+	private:
+		// Declared private so that plain assignment does not compile; use store()/store_all().
+		int16_lref& operator=(const int16_lref&);
+	};
+
+	// Masked store: each active lane is narrowed to int16_t and written to dst.m_pValue[lane];
+	// elements of inactive lanes are left untouched.
+	CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+		CPPSPMD_ALIGN(16) int lanes[4];
+		_mm_store_si128((__m128i *)lanes, src.m_value);
+
+		for (int i = 0; i < 4; i++)
+		{
+			if (!(active & (1 << i)))
+				continue;
+			dst.m_pValue[i] = static_cast<int16_t>(lanes[i]);
+		}
+		return dst;
+	}
+
+	// Unmasked store: every lane is narrowed to int16_t and written to dst.m_pValue[lane].
+	CPPSPMD_FORCE_INLINE const int16_lref& store_all(const int16_lref& dst, const vint& src)
+	{
+		CPPSPMD_ALIGN(16) int lanes[4];
+		_mm_store_si128((__m128i *)lanes, src.m_value);
+
+		for (int lane = 0; lane < 4; ++lane)
+		{
+			dst.m_pValue[lane] = static_cast<int16_t>(lanes[lane]);
+		}
+		return dst;
+	}
+		
+	// Loads four consecutive int16_t values, sign-extends them to 32 bits and clears the lanes
+	// that are inactive in m_exec. All four values are read regardless of the mask.
+	CPPSPMD_FORCE_INLINE vint load(const int16_lref& src)
+	{
+		CPPSPMD_ALIGN(16) int widened[4];
+
+		for (int lane = 0; lane < 4; ++lane)
+			widened[lane] = src.m_pValue[lane];
+
+		const __m128i all_lanes = _mm_load_si128((const __m128i *)widened);
+		return vint{ _mm_and_si128(all_lanes, m_exec.m_mask) };
+	}
+
+	// Loads four consecutive int16_t values and sign-extends them to 32 bits, ignoring m_exec.
+	CPPSPMD_FORCE_INLINE vint load_all(const int16_lref& src)
+	{
+		CPPSPMD_ALIGN(16) int widened[4];
+
+		for (int lane = 0; lane < 4; ++lane)
+			widened[lane] = src.m_pValue[lane];
+
+		return vint{ _mm_load_si128((const __m128i *)widened) };
+	}
+		
+	// Linear ref to constant ints: read-only counterpart of int_lref (four consecutive ints
+	// starting at m_pValue).
+	struct cint_lref
+	{
+		const int* m_pValue;
+
+	private:
+		// Declared private so that plain assignment does not compile.
+		cint_lref& operator=(const cint_lref&);
+	};
+
+	// Unaligned load of four consecutive constant ints; lanes inactive in m_exec are cleared to zero.
+	CPPSPMD_FORCE_INLINE vint load(const cint_lref& src)
+	{
+		const __m128i raw = _mm_loadu_si128((const __m128i *)src.m_pValue);
+		return vint{ _mm_and_si128(raw, m_exec.m_mask) };
+	}
+
+	// Unaligned load of four consecutive constant ints, ignoring m_exec.
+	CPPSPMD_FORCE_INLINE vint load_all(const cint_lref& src)
+	{
+		const __m128i raw = _mm_loadu_si128((const __m128i *)src.m_pValue);
+		return vint{ raw };
+	}
+	
+	// Varying ref to ints: lane i refers to m_pValue[m_vindex lane i] (see load/store for int_vref).
+	struct int_vref
+	{
+		__m128i m_vindex;
+		int* m_pValue;
+
+	private:
+		// Declared private so that plain assignment does not compile; use store()/store_all().
+		int_vref& operator=(const int_vref&);
+	};
+
+	// Varying ref to constant ints: read-only counterpart of int_vref.
+	struct cint_vref
+	{
+		__m128i m_vindex;
+		const int* m_pValue;
+
+	private:
+		// Declared private so that plain assignment does not compile.
+		cint_vref& operator=(const cint_vref&);
+	};
+
+	// Varying int: one 32-bit integer per lane.
+	struct vint
+	{
+		__m128i m_value;
+
+		vint() = default;
+
+		// Wraps a raw SSE register without modifying it.
+		CPPSPMD_FORCE_INLINE explicit vint(const __m128i& value) : m_value(value) { }
+
+		// Takes over the lane values of a linear int.
+		CPPSPMD_FORCE_INLINE explicit vint(const lint &other) : m_value(other.m_value) { }
+
+		// Assignment from a linear int is the only assignment that is publicly available.
+		CPPSPMD_FORCE_INLINE vint& operator=(const lint& other)
+		{
+			m_value = other.m_value;
+			return *this;
+		}
+
+		// Broadcasts a uniform int to every lane.
+		CPPSPMD_FORCE_INLINE vint(int value) : m_value(_mm_set1_epi32(value)) { }
+
+		// Converts a uniform float to int (C++ float-to-int conversion) and broadcasts it.
+		CPPSPMD_FORCE_INLINE explicit vint(float value) : m_value(_mm_set1_epi32(static_cast<int>(value))) { }
+
+		// Per-lane truncating float-to-int conversion (_mm_cvttps_epi32).
+		CPPSPMD_FORCE_INLINE explicit vint(const vfloat& other) : m_value(_mm_cvttps_epi32(other.m_value)) { }
+
+		// Per-lane "!= 0" test: all ones where the lane is non-zero, all zeros otherwise.
+		CPPSPMD_FORCE_INLINE explicit operator vbool() const
+		{
+			const __m128i all_ones = _mm_load_si128((const __m128i*)g_allones_128);
+			const __m128i is_zero = _mm_cmpeq_epi32(m_value, _mm_setzero_si128());
+			return vbool{ _mm_xor_si128(all_ones, is_zero) };
+		}
+
+		// Per-lane int-to-float conversion (_mm_cvtepi32_ps).
+		CPPSPMD_FORCE_INLINE explicit operator vfloat() const
+		{
+			const __m128 converted = _mm_cvtepi32_ps(m_value);
+			return vfloat{ converted };
+		}
+
+		// Indexing a pointer with a vint, written index[ptr], builds a varying reference in which
+		// lane i addresses ptr[index lane i].
+		CPPSPMD_FORCE_INLINE int_vref operator[](int* ptr) const { return int_vref{ m_value, ptr }; }
+
+		CPPSPMD_FORCE_INLINE cint_vref operator[](const int* ptr) const { return cint_vref{ m_value, ptr }; }
+
+		CPPSPMD_FORCE_INLINE float_vref operator[](float* ptr) const { return float_vref{ m_value, ptr }; }
+
+		CPPSPMD_FORCE_INLINE vfloat_vref operator[](vfloat* ptr) const { return vfloat_vref{ m_value, ptr }; }
+
+		CPPSPMD_FORCE_INLINE vint_vref operator[](vint* ptr) const { return vint_vref{ m_value, ptr }; }
+
+	private:
+		// Declared private so that plain vint assignment does not compile; use store()/store_all().
+		vint& operator=(const vint&);
+	};
+
+	// Load/store linear int
+	// Masked unaligned store to pDst[0..3]: a full mask uses one vector store, otherwise only the
+	// elements of active lanes are written.
+	CPPSPMD_FORCE_INLINE void storeu_linear(int *pDst, const vint& src)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		if (active == ALL_ON_MOVEMASK)
+		{
+			_mm_storeu_si128((__m128i *)pDst, src.m_value);
+			return;
+		}
+
+		if (active & 1) pDst[0] = extract_x(src.m_value);
+		if (active & 2) pDst[1] = extract_y(src.m_value);
+		if (active & 4) pDst[2] = extract_z(src.m_value);
+		if (active & 8) pDst[3] = extract_w(src.m_value);
+	}
+
+	// Unmasked store of all four lanes to pDst[0..3]; pDst may be unaligned.
+	CPPSPMD_FORCE_INLINE void storeu_linear_all(int *pDst, const vint& src)
+	{
+		__m128i* const pOut = (__m128i*)pDst;
+		_mm_storeu_si128(pOut, src.m_value);
+	}
+
+	// Unmasked store of all four lanes using the aligned store instruction (_mm_store_si128).
+	CPPSPMD_FORCE_INLINE void store_linear_all(int *pDst, const vint& src)
+	{
+		__m128i* const pOut = (__m128i*)pDst;
+		_mm_store_si128(pOut, src.m_value);
+	}
+		
+	// Unaligned load of pSrc[0..3]; lanes inactive in m_exec are cleared to zero.
+	// All four ints are read from memory regardless of the mask.
+	CPPSPMD_FORCE_INLINE vint loadu_linear(const int *pSrc)
+	{
+		const __m128i raw = _mm_loadu_si128((const __m128i*)pSrc);
+		return vint{ _mm_and_si128(raw, m_exec.m_mask) };
+	}
+
+	// Unmasked load of pSrc[0..3]; pSrc may be unaligned.
+	CPPSPMD_FORCE_INLINE vint loadu_linear_all(const int *pSrc)
+	{
+		const __m128i* const pIn = (const __m128i*)pSrc;
+		return vint{ _mm_loadu_si128(pIn) };
+	}
+
+	// Unmasked load of pSrc[0..3] using the aligned load instruction (_mm_load_si128).
+	CPPSPMD_FORCE_INLINE vint load_linear_all(const int *pSrc)
+	{
+		const __m128i* const pIn = (const __m128i*)pSrc;
+		return vint{ _mm_load_si128(pIn) };
+	}
+
+	// Load/store linear float
+	// Masked unaligned store to pDst[0..3]: a full mask uses one vector store, otherwise only the
+	// elements of active lanes are written.
+	CPPSPMD_FORCE_INLINE void storeu_linear(float *pDst, const vfloat& src)
+	{
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		if (mask == ALL_ON_MOVEMASK)
+			_mm_storeu_ps(pDst, src.m_value);
+		else
+		{
+			// Write the floats directly instead of storing their bit patterns through an int*
+			// alias of the float buffer, which violates the strict aliasing rule.
+			if (mask & 1) pDst[0] = extractf_ps_x(src.m_value);
+			if (mask & 2) pDst[1] = extractf_ps_y(src.m_value);
+			if (mask & 4) pDst[2] = extractf_ps_z(src.m_value);
+			if (mask & 8) pDst[3] = extractf_ps_w(src.m_value);
+		}
+	}
+
+	// Unmasked store of all four lanes to pDst[0..3]; pDst may be unaligned.
+	CPPSPMD_FORCE_INLINE void storeu_linear_all(float *pDst, const vfloat& src)
+	{
+		const __m128 lanes = src.m_value;
+		_mm_storeu_ps(pDst, lanes);
+	}
+
+	// Unmasked store of all four lanes using the aligned store instruction (_mm_store_ps).
+	CPPSPMD_FORCE_INLINE void store_linear_all(float *pDst, const vfloat& src)
+	{
+		const __m128 lanes = src.m_value;
+		_mm_store_ps(pDst, lanes);
+	}
+		
+	// Unaligned load of pSrc[0..3]; lanes inactive in m_exec are cleared to zero bits.
+	// All four floats are read from memory regardless of the mask.
+	CPPSPMD_FORCE_INLINE vfloat loadu_linear(const float *pSrc)
+	{
+		const __m128 raw = _mm_loadu_ps(pSrc);
+		const __m128 exec_ps = _mm_castsi128_ps(m_exec.m_mask);
+		return vfloat{ _mm_and_ps(raw, exec_ps) };
+	}
+
+	// Unmasked load of pSrc[0..3]; pSrc may be unaligned.
+	CPPSPMD_FORCE_INLINE vfloat loadu_linear_all(const float *pSrc)
+	{
+		const __m128 lanes = _mm_loadu_ps(pSrc);
+		return vfloat{ lanes };
+	}
+
+	// Unmasked load of pSrc[0..3] using the aligned load instruction (_mm_load_ps).
+	CPPSPMD_FORCE_INLINE vfloat load_linear_all(const float *pSrc)
+	{
+		const __m128 lanes = _mm_load_ps(pSrc);
+		return vfloat{ lanes };
+	}
+	
+	// Masked assignment: lanes active in m_exec take src, the remaining lanes keep dst.
+	CPPSPMD_FORCE_INLINE vint& store(vint& dst, const vint& src)
+	{
+		const __m128i merged = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
+		dst.m_value = merged;
+		return dst;
+	}
+
+	// Masked scatter: each lane i that is active in m_exec writes its value to
+	// dst.m_pValue[index i]; inactive lanes write nothing.
+	CPPSPMD_FORCE_INLINE const int_vref& store(const int_vref& dst, const vint& src)
+	{
+		CPPSPMD_ALIGN(16) int indices[4];
+		CPPSPMD_ALIGN(16) int lanes[4];
+
+		_mm_store_si128((__m128i*)indices, dst.m_vindex);
+		_mm_store_si128((__m128i*)lanes, src.m_value);
+
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (!(active & (1 << i)))
+				continue;
+			dst.m_pValue[indices[i]] = lanes[i];
+		}
+		return dst;
+	}
+	
+	// Unmasked assignment: copies all four lanes of src into dst without consulting the
+	// execution mask, and returns dst.
+	CPPSPMD_FORCE_INLINE vint& store_all(vint& dst, const vint& src)
+	{
+		dst.m_value = src.m_value;
+		return dst;
+	}
+				
+	// Unmasked scatter: writes lane i of src to dst.m_pValue[dst.m_vindex[i]] for all four lanes.
+	CPPSPMD_FORCE_INLINE const int_vref& store_all(const int_vref& dst, const vint& src)
+	{
+		// Spill the indices and the values to aligned scratch arrays for scalar access
+		CPPSPMD_ALIGN(16) int lane_index[4];
+		CPPSPMD_ALIGN(16) int lane_value[4];
+		_mm_store_si128((__m128i*)lane_index, dst.m_vindex);
+		_mm_store_si128((__m128i*)lane_value, src.m_value);
+
+		for (int lane = 0; lane < 4; ++lane)
+		{
+			dst.m_pValue[lane_index[lane]] = lane_value[lane];
+		}
+
+		return dst;
+	}
+
+	// Masked gather: for every active lane i, reads src.m_pValue[src.m_vindex[i]].
+	// The result is ANDed with the execution mask, so inactive lanes come back as zero.
+	CPPSPMD_FORCE_INLINE vint load(const int_vref& src)
+	{
+		// Zero-initialised so that inactive lanes never feed uninitialised stack contents
+		// into the vector load below.
+		CPPSPMD_ALIGN(16) int values[4] = { 0, 0, 0, 0 };
+
+		CPPSPMD_ALIGN(16) int indices[4];
+		_mm_store_si128((__m128i *)indices, src.m_vindex);
+
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (mask & (1 << i))
+				values[i] = src.m_pValue[indices[i]];
+		}
+
+		// Integer load + integer AND: bit-identical to the previous float round trip
+		return vint{ _mm_and_si128(m_exec.m_mask, _mm_load_si128((const __m128i*)values)) };
+	}
+		
+	// Unmasked gather: reads src.m_pValue[src.m_vindex[i]] for all four lanes.
+	CPPSPMD_FORCE_INLINE vint load_all(const int_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int lane_index[4];
+		_mm_store_si128((__m128i *)lane_index, src.m_vindex);
+
+		CPPSPMD_ALIGN(16) int gathered[4];
+		for (int lane = 0; lane < 4; ++lane)
+		{
+			gathered[lane] = src.m_pValue[lane_index[lane]];
+		}
+
+		return vint{ _mm_load_si128((const __m128i*)gathered) };
+	}
+		
+	// Masked gather from a const array: for every active lane i, reads src.m_pValue[src.m_vindex[i]].
+	// The result is ANDed with the execution mask, so inactive lanes come back as zero.
+	CPPSPMD_FORCE_INLINE vint load(const cint_vref& src)
+	{
+		// Zero-initialised so that inactive lanes never feed uninitialised stack contents
+		// into the vector load below.
+		CPPSPMD_ALIGN(16) int values[4] = { 0, 0, 0, 0 };
+
+		CPPSPMD_ALIGN(16) int indices[4];
+		_mm_store_si128((__m128i *)indices, src.m_vindex);
+
+		int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		for (int i = 0; i < 4; i++)
+		{
+			if (mask & (1 << i))
+				values[i] = src.m_pValue[indices[i]];
+		}
+
+		// Integer load + integer AND: bit-identical to the previous float round trip
+		return vint{ _mm_and_si128(m_exec.m_mask, _mm_load_si128((const __m128i*)values)) };
+	}
+		
+	// Unmasked gather from a const array: reads src.m_pValue[src.m_vindex[i]] for all four lanes.
+	CPPSPMD_FORCE_INLINE vint load_all(const cint_vref& src)
+	{
+		CPPSPMD_ALIGN(16) int lane_index[4];
+		_mm_store_si128((__m128i *)lane_index, src.m_vindex);
+
+		CPPSPMD_ALIGN(16) int gathered[4];
+		for (int lane = 0; lane < 4; ++lane)
+		{
+			gathered[lane] = src.m_pValue[lane_index[lane]];
+		}
+
+		return vint{ _mm_load_si128((const __m128i*)gathered) };
+	}
+
+	// Unmasked gather in which each lane of src.m_vindex is treated as a BYTE offset from
+	// src.m_pValue; a full int is read at each offset (the reads may therefore be unaligned).
+	CPPSPMD_FORCE_INLINE vint load_bytes_all(const cint_vref& src)
+	{
+		const uint8_t* pBase = (const uint8_t*)src.m_pValue;
+
+		// Start from an undefined register; every lane is overwritten below
+		__m128i gathered = insert_x(_mm_undefined_si128(), ((int*)(pBase + extract_x(src.m_vindex)))[0]);
+		gathered = insert_y(gathered, ((int*)(pBase + extract_y(src.m_vindex)))[0]);
+		gathered = insert_z(gathered, ((int*)(pBase + extract_z(src.m_vindex)))[0]);
+		gathered = insert_w(gathered, ((int*)(pBase + extract_w(src.m_vindex)))[0]);
+
+		return vint{ gathered };
+	}
+
+	// Unmasked gather of 16-bit values: each lane of src.m_vindex is scaled by 2 to form a byte
+	// offset from src.m_pValue, and an int16_t is read there and inserted into the result lane.
+	CPPSPMD_FORCE_INLINE vint load_words_all(const cint_vref& src)
+	{
+		const uint8_t* pBase = (const uint8_t*)src.m_pValue;
+
+		// Start from an undefined register; every lane is overwritten below
+		__m128i gathered = insert_x(_mm_undefined_si128(), ((int16_t*)(pBase + 2 * extract_x(src.m_vindex)))[0]);
+		gathered = insert_y(gathered, ((int16_t*)(pBase + 2 * extract_y(src.m_vindex)))[0]);
+		gathered = insert_z(gathered, ((int16_t*)(pBase + 2 * extract_z(src.m_vindex)))[0]);
+		gathered = insert_w(gathered, ((int16_t*)(pBase + 2 * extract_w(src.m_vindex)))[0]);
+
+		return vint{ gathered };
+	}
+
+	// Masked strided store: active lane i of v is written to pDst[i * stride] (stride in elements).
+	CPPSPMD_FORCE_INLINE void store_strided(int *pDst, uint32_t stride, const vint &v)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+		if (active & 1)
+			pDst[0] = extract_x(v.m_value);
+		if (active & 2)
+			pDst[stride] = extract_y(v.m_value);
+		if (active & 4)
+			pDst[stride * 2] = extract_z(v.m_value);
+		if (active & 8)
+			pDst[stride * 3] = extract_w(v.m_value);
+	}
+
+	// Masked strided store of floats: active lane i of v is written to element i * stride of pDstF.
+	// Lanes are copied as raw 32-bit patterns through an int view of the destination.
+	CPPSPMD_FORCE_INLINE void store_strided(float *pDstF, uint32_t stride, const vfloat &v)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		int *pBits = (int *)pDstF;
+
+		if (active & 1)
+			pBits[0] = extract_ps_x(v.m_value);
+		if (active & 2)
+			pBits[stride] = extract_ps_y(v.m_value);
+		if (active & 4)
+			pBits[stride * 2] = extract_ps_z(v.m_value);
+		if (active & 8)
+			pBits[stride * 3] = extract_ps_w(v.m_value);
+	}
+
+	// Unmasked strided store: lane i of v is written to pDst[i * stride] for all four lanes.
+	CPPSPMD_FORCE_INLINE void store_all_strided(int *pDst, uint32_t stride, const vint &v)
+	{
+		const __m128i lanes = v.m_value;
+
+		pDst[0] = extract_x(lanes);
+		pDst[stride] = extract_y(lanes);
+		pDst[stride * 2] = extract_z(lanes);
+		pDst[stride * 3] = extract_w(lanes);
+	}
+
+	// Unmasked strided store of floats: lane i of v is written to element i * stride of pDstF.
+	// Lanes are copied as raw 32-bit patterns through an int view of the destination.
+	CPPSPMD_FORCE_INLINE void store_all_strided(float *pDstF, uint32_t stride, const vfloat &v)
+	{
+		int *pBits = (int *)pDstF;
+
+		pBits[0] = extract_ps_x(v.m_value);
+		pBits[stride] = extract_ps_y(v.m_value);
+		pBits[stride * 2] = extract_ps_z(v.m_value);
+		pBits[stride * 3] = extract_ps_w(v.m_value);
+	}
+
+	// Masked strided load: active lane i reads pSrc[i * stride]; inactive lanes are zero
+	// and their source elements are never touched.
+	CPPSPMD_FORCE_INLINE vint load_strided(const int *pSrc, uint32_t stride)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+#if CPPSPMD_SSE2
+		// Gather through an aligned scratch array, then load it as one vector
+		CPPSPMD_ALIGN(16) int gathered[4] = { 0, 0, 0, 0 };
+		if (active & 1) gathered[0] = pSrc[0];
+		if (active & 2) gathered[1] = pSrc[stride];
+		if (active & 4) gathered[2] = pSrc[stride * 2];
+		if (active & 8) gathered[3] = pSrc[stride * 3];
+		return vint{ _mm_load_si128((__m128i*)gathered) };
+#else
+		// Build the vector in-register with scalar loads and lane inserts
+		const float* pLanes = (const float*)pSrc;
+		__m128 result = _mm_setzero_ps();
+		if (active & 1) result = _mm_load_ss(pLanes);
+		if (active & 2) result = _mm_insert_ps(result, _mm_load_ss(pLanes + stride), 0x10);
+		if (active & 4) result = _mm_insert_ps(result, _mm_load_ss(pLanes + 2 * stride), 0x20);
+		if (active & 8) result = _mm_insert_ps(result, _mm_load_ss(pLanes + 3 * stride), 0x30);
+		return vint{ _mm_castps_si128(result) };
+#endif
+	}
+
+	// Masked strided load of floats: active lane i reads pSrc[i * stride]; inactive lanes are
+	// zero and their source elements are never touched.
+	CPPSPMD_FORCE_INLINE vfloat load_strided(const float *pSrc, uint32_t stride)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+#if CPPSPMD_SSE2
+		// Gather through an aligned scratch array, then load it as one vector
+		CPPSPMD_ALIGN(16) float gathered[4] = { 0, 0, 0, 0 };
+		if (active & 1) gathered[0] = pSrc[0];
+		if (active & 2) gathered[1] = pSrc[stride];
+		if (active & 4) gathered[2] = pSrc[stride * 2];
+		if (active & 8) gathered[3] = pSrc[stride * 3];
+		return vfloat{ _mm_load_ps(gathered) };
+#else
+		// Build the vector in-register with scalar loads and lane inserts
+		__m128 result = _mm_setzero_ps();
+		if (active & 1) result = _mm_load_ss(pSrc);
+		if (active & 2) result = _mm_insert_ps(result, _mm_load_ss(pSrc + stride), 0x10);
+		if (active & 4) result = _mm_insert_ps(result, _mm_load_ss(pSrc + 2 * stride), 0x20);
+		if (active & 8) result = _mm_insert_ps(result, _mm_load_ss(pSrc + 3 * stride), 0x30);
+		return vfloat{ result };
+#endif
+	}
+
+	// Unmasked strided load: lane i reads pSrc[i * stride] for all four lanes.
+	CPPSPMD_FORCE_INLINE vint load_all_strided(const int *pSrc, uint32_t stride)
+	{
+#if CPPSPMD_SSE2
+		// Gather through an aligned scratch array, then load it as one vector
+		CPPSPMD_ALIGN(16) int gathered[4];
+		gathered[0] = pSrc[0];
+		gathered[1] = pSrc[stride];
+		gathered[2] = pSrc[stride * 2];
+		gathered[3] = pSrc[stride * 3];
+		return vint{ _mm_load_si128((__m128i*)gathered) };
+#else
+		// Build the vector in-register with scalar loads and lane inserts
+		const float* pLanes = (const float*)pSrc;
+		__m128 result = _mm_load_ss(pLanes);
+		result = _mm_insert_ps(result, _mm_load_ss(pLanes + stride), 0x10);
+		result = _mm_insert_ps(result, _mm_load_ss(pLanes + 2 * stride), 0x20);
+		result = _mm_insert_ps(result, _mm_load_ss(pLanes + 3 * stride), 0x30);
+		return vint{ _mm_castps_si128(result) };
+#endif
+	}
+
+	// Unmasked strided load of floats: lane i reads pSrc[i * stride] for all four lanes.
+	CPPSPMD_FORCE_INLINE vfloat load_all_strided(const float *pSrc, uint32_t stride)
+	{
+#if CPPSPMD_SSE2
+		// Gather through an aligned scratch array, then load it as one vector
+		CPPSPMD_ALIGN(16) float gathered[4];
+		gathered[0] = pSrc[0];
+		gathered[1] = pSrc[stride];
+		gathered[2] = pSrc[stride * 2];
+		gathered[3] = pSrc[stride * 3];
+		return vfloat{ _mm_load_ps(gathered) };
+#else
+		// Build the vector in-register with scalar loads and lane inserts
+		__m128 result = _mm_load_ss(pSrc);
+		result = _mm_insert_ps(result, _mm_load_ss(pSrc + stride), 0x10);
+		result = _mm_insert_ps(result, _mm_load_ss(pSrc + 2 * stride), 0x20);
+		result = _mm_insert_ps(result, _mm_load_ss(pSrc + 3 * stride), 0x30);
+		return vfloat{ result };
+#endif
+	}
+
+	// Masked per-lane scatter into an array addressed by dst.m_vindex: for each active lane i,
+	// the 32-bit slot i of element dst.m_pValue[dst.m_vindex[i]] receives lane i of src (raw bits).
+	// NOTE(review): assumes each m_pValue element is a 4-lane vector - confirm against vfloat_vref.
+	// TODO: There's surely a better way
+	CPPSPMD_FORCE_INLINE const vfloat_vref& store(const vfloat_vref& dst, const vfloat& src)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+		const __m128i src_bits = _mm_castps_si128(src.m_value);
+
+		if (active & 1)
+			((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(src_bits);
+		if (active & 2)
+			((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(src_bits);
+		if (active & 4)
+			((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(src_bits);
+		if (active & 8)
+			((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(src_bits);
+
+		return dst;
+	}
+
+	// Masked per-lane gather from an array addressed by src.m_vindex: for each active lane i,
+	// reads the 32-bit slot i of element src.m_pValue[src.m_vindex[i]]. Inactive lanes are zero.
+	// NOTE(review): assumes each m_pValue element is a 4-lane vector - confirm against vfloat_vref.
+	// TODO: There's surely a better way
+	CPPSPMD_FORCE_INLINE vfloat load(const vfloat_vref& src)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+		__m128i gathered = _mm_setzero_si128();
+
+		if (active & 1)
+			gathered = insert_x(gathered, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
+		if (active & 2)
+			gathered = insert_y(gathered, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
+		if (active & 4)
+			gathered = insert_z(gathered, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
+		if (active & 8)
+			gathered = insert_w(gathered, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
+
+		return vfloat{ _mm_castsi128_ps(gathered) };
+	}
+
+	// Masked per-lane scatter into an array addressed by dst.m_vindex: for each active lane i,
+	// the 32-bit slot i of element dst.m_pValue[dst.m_vindex[i]] receives lane i of src.
+	// NOTE(review): assumes each m_pValue element is a 4-lane vector - confirm against vint_vref.
+	// TODO: There's surely a better way
+	CPPSPMD_FORCE_INLINE const vint_vref& store(const vint_vref& dst, const vint& src)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+		if (active & 1)
+			((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(src.m_value);
+		if (active & 2)
+			((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(src.m_value);
+		if (active & 4)
+			((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(src.m_value);
+		if (active & 8)
+			((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(src.m_value);
+
+		return dst;
+	}
+
+	// Masked per-lane gather from an array addressed by src.m_vindex: for each active lane i,
+	// reads the 32-bit slot i of element src.m_pValue[src.m_vindex[i]]. Inactive lanes are zero.
+	// NOTE(review): assumes each m_pValue element is a 4-lane vector - confirm against vint_vref.
+	// TODO: There's surely a better way
+	CPPSPMD_FORCE_INLINE vint load(const vint_vref& src)
+	{
+		const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+
+		__m128i gathered = _mm_setzero_si128();
+
+		if (active & 1)
+			gathered = insert_x(gathered, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
+		if (active & 2)
+			gathered = insert_y(gathered, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
+		if (active & 4)
+			gathered = insert_z(gathered, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
+		if (active & 8)
+			gathered = insert_w(gathered, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
+
+		return vint{ gathered };
+	}
+
+	// Unmasked per-lane gather from an array addressed by src.m_vindex: for every lane i,
+	// reads the 32-bit slot i of element src.m_pValue[src.m_vindex[i]].
+	// NOTE(review): assumes each m_pValue element is a 4-lane vector - confirm against vint_vref.
+	CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src)
+	{
+		// TODO: There's surely a better way
+		// Start from a defined value: the first insert_x() reads k, so leaving it
+		// uninitialised would read an indeterminate register (matches the masked load()).
+		__m128i k = _mm_setzero_si128();
+
+		k = insert_x(k, ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
+		k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
+		k = insert_z(k, ((int*)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
+		k = insert_w(k, ((int*)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
+
+		return vint{ k };
+	}
+			
+	// Linear integer: wraps a 4-lane __m128i of ints.
+	// get_first_value() exposes lane 0, and the operator[] overloads use it as an element
+	// offset into a caller-supplied pointer to build the corresponding *_lref type.
+	struct lint
+	{
+		__m128i m_value;
+
+		CPPSPMD_FORCE_INLINE explicit lint(__m128i value)
+			: m_value(value)
+		{ }
+
+		// Converts each int lane to float (value conversion, not a bit cast).
+		CPPSPMD_FORCE_INLINE explicit operator vfloat() const
+		{
+			return vfloat{ _mm_cvtepi32_ps(m_value) };
+		}
+
+		// Re-wraps the same lanes as a vint.
+		CPPSPMD_FORCE_INLINE explicit operator vint() const
+		{
+			return vint{ m_value };
+		}
+
+		// Returns lane 0 as a scalar int.
+		CPPSPMD_FORCE_INLINE int get_first_value() const 
+		{
+			return _mm_cvtsi128_si32(m_value);
+		}
+
+		// Each overload below returns a reference object pointing at ptr + lane 0's value.
+		CPPSPMD_FORCE_INLINE float_lref operator[](float* ptr) const
+		{
+			return float_lref{ ptr + get_first_value() };
+		}
+
+		CPPSPMD_FORCE_INLINE int_lref operator[](int* ptr) const
+		{
+			return int_lref{ ptr + get_first_value() };
+		}
+
+		CPPSPMD_FORCE_INLINE int16_lref operator[](int16_t* ptr) const
+		{
+			return int16_lref{ ptr + get_first_value() };
+		}
+
+		CPPSPMD_FORCE_INLINE cint_lref operator[](const int* ptr) const
+		{
+			return cint_lref{ ptr + get_first_value() };
+		}
+
+	private:
+		// Copy assignment is declared private (no definition is visible here), so lint
+		// objects cannot be assigned from outside the struct.
+		lint& operator=(const lint&);
+	};
+
+	// Unmasked assignment for lint: copies src.m_value into dst.m_value directly
+	// (the struct's own operator= is private) and returns dst.
+	CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src)
+	{
+		dst.m_value = src.m_value;
+		return dst;
+	}
+	
+	// Per-lane index constant: _mm_set_epi32 takes its arguments from the highest lane to the
+	// lowest, so lanes 0..3 hold the values 0, 1, 2, 3.
+	const lint program_index = lint{ _mm_set_epi32( 3, 2, 1, 0 ) };
+	
+	// SPMD condition helpers
+
+	template<typename IfBody>
+	CPPSPMD_FORCE_INLINE void spmd_if(const vbool& cond, const IfBody& ifBody);
+
+	CPPSPMD_FORCE_INLINE void spmd_if_break(const vbool& cond);
+
+	// No breaks, continues, etc. allowed
+	template<typename IfBody>
+	CPPSPMD_FORCE_INLINE void spmd_sif(const vbool& cond, const IfBody& ifBody);
+
+	// No breaks, continues, etc. allowed
+	template<typename IfBody, typename ElseBody>
+	CPPSPMD_FORCE_INLINE void spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody);
+
+	template<typename IfBody, typename ElseBody>
+	CPPSPMD_FORCE_INLINE void spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody);
+
+	template<typename WhileCondBody, typename WhileBody>
+	CPPSPMD_FORCE_INLINE void spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody);
+
+	template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
+	CPPSPMD_FORCE_INLINE void spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody);
+
+	template<typename ForeachBody>
+	CPPSPMD_FORCE_INLINE void spmd_foreach(int begin, int end, const ForeachBody& foreachBody);
+		
+	// check_masks(): when _DEBUG is defined it is only declared here (its definition is not in
+	// this part of the file); otherwise it is an empty inline no-op.
+#ifdef _DEBUG
+	CPPSPMD_FORCE_INLINE void check_masks();
+#else
+	CPPSPMD_FORCE_INLINE void check_masks() { }
+#endif
+
+	CPPSPMD_FORCE_INLINE void spmd_break();
+	CPPSPMD_FORCE_INLINE void spmd_continue();
+	
+	CPPSPMD_FORCE_INLINE void spmd_return();
+	
+	template<typename UnmaskedBody>
+	CPPSPMD_FORCE_INLINE void spmd_unmasked(const UnmaskedBody& unmaskedBody);
+
+	template<typename SPMDKernel, typename... Args>
+	//CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args);
+	CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args);
+
+	// Swaps a and b using the store() overloads rather than plain assignment.
+	// NOTE(review): store(vint&, ...) blends through the execution mask, so only active lanes
+	// are exchanged; the vfloat/vbool store() overloads are presumably the same - confirm.
+	CPPSPMD_FORCE_INLINE void swap(vint &a, vint &b)
+	{
+		vint saved = a;
+		store(a, b);
+		store(b, saved);
+	}
+
+	CPPSPMD_FORCE_INLINE void swap(vfloat &a, vfloat &b)
+	{
+		vfloat saved = a;
+		store(a, b);
+		store(b, saved);
+	}
+
+	CPPSPMD_FORCE_INLINE void swap(vbool &a, vbool &b)
+	{
+		vbool saved = a;
+		store(a, b);
+		store(b, saved);
+	}
+
+	// Horizontal sum of the lanes of v. v is first blended against zero through the execution
+	// mask, so inactive lanes are intended to contribute nothing to the sum.
+	CPPSPMD_FORCE_INLINE float reduce_add(vfloat v)
+	{
+		__m128 lanes = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask));
+
+//#if CPPSPMD_SSE2
+#if 1
+		// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
+		// Swap neighbouring lanes and add, giving (l0+l1) in the low pair and (l2+l3) in the high pair
+		__m128 swapped    = _mm_shuffle_ps(lanes, lanes, _MM_SHUFFLE(2, 3, 0, 1));
+		__m128 pair_sums  = _mm_add_ps(lanes, swapped);
+		// Bring the high pair's sum down to lane 0 and add it to the low pair's sum
+		__m128 high_pair  = _mm_movehl_ps(swapped, pair_sums);
+		__m128 total      = _mm_add_ss(pair_sums, high_pair);
+		return _mm_cvtss_f32(total);
+#else
+		// This is pretty slow.
+		__m128 first_pass = _mm_hadd_ps(lanes, lanes);
+		__m128 second_pass = _mm_hadd_ps(first_pass, first_pass);
+		return extractf_ps_x(second_pass);
+#endif
+	}
+
+	// Horizontal sum of the lanes of v. v is first blended against zero through the execution
+	// mask, so inactive lanes are intended to contribute nothing to the sum.
+	CPPSPMD_FORCE_INLINE int reduce_add(vint v)
+	{
+		__m128i lanes = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask);
+
+		// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction/35270026#35270026
+		// Swap neighbouring lanes and add, giving (l0+l1) in the low pair and (l2+l3) in the high pair
+		__m128i swapped = _mm_shuffle_epi32(lanes, _MM_SHUFFLE(2, 3, 0, 1));
+		__m128i pair_sums = _mm_add_epi32(lanes, swapped);
+
+		// Bring the high pair's sum down to lane 0 and add it to the low pair's sum
+		__m128i high_pair = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(swapped), _mm_castsi128_ps(pair_sums)));
+		__m128i total = _mm_add_epi32(pair_sums, high_pair);
+
+		return extract_x(total);
+	}
+
+	#include "cppspmd_math_declares.h"
+
+}; // struct spmd_kernel
+
+using exec_mask = spmd_kernel::exec_mask;
+using vint = spmd_kernel::vint;
+using int_lref = spmd_kernel::int_lref;
+using cint_vref = spmd_kernel::cint_vref;
+using cint_lref = spmd_kernel::cint_lref;
+using int_vref = spmd_kernel::int_vref;
+using lint = spmd_kernel::lint;
+using vbool = spmd_kernel::vbool;
+using vfloat = spmd_kernel::vfloat;
+using float_lref = spmd_kernel::float_lref;
+using float_vref = spmd_kernel::float_vref;
+using vfloat_vref = spmd_kernel::vfloat_vref;
+using vint_vref = spmd_kernel::vint_vref;
+
+// Converts a lane mask to floats by ANDing it with the g_onef_128 constant.
+// NOTE(review): g_onef_128 presumably holds four 1.0f values, giving 1.0f for true lanes and
+// 0.0f for false lanes - confirm against its definition.
+CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vfloat() const
+{
+	const __m128 ones = *(const __m128 *)g_onef_128;
+	const __m128 mask_bits = _mm_castsi128_ps(m_value);
+	return vfloat { _mm_and_ps( mask_bits, ones ) };
+}
+	
+// Returns UINT32_MAX's for true, 0 for false. (Should it return 1's?)
+// The mask bits are handed over unchanged; no normalisation is performed.
+CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vint() const 
+{ 
+	return vint { m_value };
+}
+
+// Logical NOT of a lane mask: XORs every bit with the g_allones_128 constant.
+CPPSPMD_FORCE_INLINE vbool operator!(const vbool& v)
+{
+	const __m128 all_ones = _mm_load_ps((const float*)g_allones_128);
+	const __m128 flipped = _mm_xor_ps(all_ones, _mm_castsi128_ps(v.m_value));
+	return vbool{ _mm_castps_si128(flipped) };
+}
+
+// Builds an execution mask directly from a vbool's lane bits.
+CPPSPMD_FORCE_INLINE exec_mask::exec_mask(const vbool& b) { m_mask = b.m_value; }
+
+// Bitwise combination of two execution masks.
+CPPSPMD_FORCE_INLINE exec_mask operator^(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_xor_si128(a.m_mask, b.m_mask) }; }
+CPPSPMD_FORCE_INLINE exec_mask operator&(const exec_mask& a, const exec_mask& b) {	return exec_mask{ _mm_and_si128(a.m_mask, b.m_mask) }; }
+CPPSPMD_FORCE_INLINE exec_mask operator|(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_or_si128(a.m_mask, b.m_mask) }; }
+
+// all(): every lane's sign bit is set; any(): at least one lane's sign bit is set.
+CPPSPMD_FORCE_INLINE bool all(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) == ALL_ON_MOVEMASK; }
+CPPSPMD_FORCE_INLINE bool any(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) != 0; }
+
+// Bad pattern - doesn't factor in the current exec mask. Prefer spmd_any() instead.
+// These test only the sign bit of each lane of the vbool itself.
+CPPSPMD_FORCE_INLINE bool all(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) == ALL_ON_MOVEMASK; }
+CPPSPMD_FORCE_INLINE bool any(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) != 0; }
+
+// andnot(a, b) computes (~a) & b, following the operand order of _mm_andnot_si128.
+CPPSPMD_FORCE_INLINE exec_mask andnot(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_andnot_si128(a.m_mask, b.m_mask) }; }
+// Lane-wise logical OR / AND. Both operands are always evaluated (no short-circuiting),
+// as with any overloaded || and &&.
+CPPSPMD_FORCE_INLINE vbool operator||(const vbool& a, const vbool& b) { return vbool{ _mm_or_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator&&(const vbool& a, const vbool& b) { return vbool{ _mm_and_si128(a.m_value, b.m_value) }; }
+
+// Lane-wise float addition and subtraction. The mixed overloads convert the scalar or vint
+// operand to a vfloat first and then forward to the vfloat/vfloat operator.
+CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, const vfloat& b) { return vfloat{ _mm_add_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vfloat& b) {	return vfloat{ _mm_sub_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat operator+(float a, const vfloat& b) { return vfloat(a) + b; }
+CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, float b) { return a + vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vint& b) { return a - vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vint& a, const vfloat& b) { return vfloat(a) - b; }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, int b) { return a - vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator-(int a, const vfloat& b) { return vfloat(a) - b; }
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, float b) { return a - vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator-(float a, const vfloat& b) { return vfloat(a) - b; }
+
+// Lane-wise float multiplication. Scalar operands are converted to a vfloat first.
+CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, const vfloat& b) { return vfloat{ _mm_mul_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, float b) { return a * vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator*(float a, const vfloat& b) { return vfloat(a) * b; }
+CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, int b) { return a * vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator*(int a, const vfloat& b) { return vfloat(a) * b; }
+
+// Lane-wise float division. Scalar operands are converted to a vfloat first.
+CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, const vfloat& b) {	return vfloat{ _mm_div_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, int b) { return a / vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator/(int a, const vfloat& b) { return vfloat(a) / b; }
+CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, float b) { return a / vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator/(float a, const vfloat& b) { return vfloat(a) / b; }
+// Unary minus computed as (0 - v); the zero is produced by XORing v with itself.
+CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& v) { return vfloat{ _mm_sub_ps(_mm_xor_ps(v.m_value, v.m_value), v.m_value) }; }
+
+// Lane-wise float comparisons producing a vbool mask. The (vfloat, float) overloads broadcast
+// the scalar and forward to the vfloat/vfloat version.
+CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, float b) { return a == vfloat(b); }
+
+// Implemented as the negation of the equality mask.
+CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, const vfloat& b) { return !vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, float b) { return a != vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmplt_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, float b) { return a < vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpgt_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, float b) { return a > vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmple_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, float b) { return a <= vfloat(b); }
+
+CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpge_ps(a.m_value, b.m_value)) }; }
+CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, float b) { return a >= vfloat(b); }
+
+// Lane-wise select driven by cond, implemented with the blendv_mask_* helpers called as
+// (b, a, cond). NOTE(review): presumably yields a where cond is set and b elsewhere - confirm
+// against the blendv_mask_ps / blendv_mask_epi32 definitions.
+CPPSPMD_FORCE_INLINE vfloat spmd_ternaryf(const vbool& cond, const vfloat& a, const vfloat& b) { return vfloat{ blendv_mask_ps(b.m_value, a.m_value, _mm_castsi128_ps(cond.m_value)) }; }
+CPPSPMD_FORCE_INLINE vint spmd_ternaryi(const vbool& cond, const vint& a, const vint& b) { return vint{ blendv_mask_epi32(b.m_value, a.m_value, cond.m_value) }; }
+
+// Lane-wise square root, absolute value, maximum and minimum.
+CPPSPMD_FORCE_INLINE vfloat sqrt(const vfloat& v) { return vfloat{ _mm_sqrt_ps(v.m_value) }; }
+// abs clears the sign bit: -0.0f is a mask with only the sign bit set, and andnot removes it.
+CPPSPMD_FORCE_INLINE vfloat abs(const vfloat& v) { return vfloat{ _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat max(const vfloat& a, const vfloat& b) { return vfloat{ _mm_max_ps(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat min(const vfloat& a, const vfloat& b) {	return vfloat{ _mm_min_ps(a.m_value, b.m_value) }; }
+
+#if CPPSPMD_SSE2
+// SSE2 truncation toward zero. Lanes whose magnitude bit pattern is below that of 2^23
+// (8388608.0f) are truncated through an int round trip; all other lanes are passed through
+// from the input unchanged.
+CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat& a)
+{
+	// Compare |a| with 2^23 using the integer ordering of the float bit patterns
+	__m128i magnitude_bits = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU) );
+	__m128i may_have_fraction = _mm_cmplt_epi32(magnitude_bits, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
+
+	// float -> int with truncation, then back to float
+	__m128i truncated_int = _mm_cvttps_epi32(a.m_value);
+	__m128 truncated = _mm_cvtepi32_ps(truncated_int);
+
+	return vfloat{ blendv_mask_ps(a.m_value, truncated, _mm_castsi128_ps(may_have_fraction)) };
+}
+
+// SSE2 floor. Lanes whose magnitude bit pattern is below that of 2^23 (8388608.0f) are
+// converted through an int round trip and corrected downward; all other lanes are passed
+// through from the input unchanged.
+CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& a)
+{
+	// Compare |a| with 2^23 using the integer ordering of the float bit patterns
+	__m128i magnitude_bits = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
+	__m128i may_have_fraction = _mm_cmplt_epi32(magnitude_bits, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
+
+	// float -> int using the current rounding mode, then back to float
+	__m128i rounded_int = _mm_cvtps_epi32(a.m_value);
+	__m128 rounded = _mm_cvtepi32_ps(rounded_int);
+
+	// Where the conversion went above the input, the compare mask is all ones (-1 as an int),
+	// which converts to -1.0f; adding it steps that lane down by one.
+	__m128 correction = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmpgt_ps(rounded, a.m_value)));
+	rounded = _mm_add_ps(rounded, correction);
+
+	return vfloat{ blendv_mask_ps(a.m_value, rounded, _mm_castsi128_ps(may_have_fraction)) };
+}
+
+// SSE2 ceil. Lanes whose magnitude bit pattern is below that of 2^23 (8388608.0f) are
+// converted through an int round trip and corrected upward; all other lanes are passed
+// through from the input unchanged.
+CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a)
+{
+	// Compare |a| with 2^23 using the integer ordering of the float bit patterns
+	__m128i magnitude_bits = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
+	__m128i may_have_fraction = _mm_cmplt_epi32(magnitude_bits, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
+
+	// float -> int using the current rounding mode, then back to float
+	__m128i rounded_int = _mm_cvtps_epi32(a.m_value);
+	__m128 rounded = _mm_cvtepi32_ps(rounded_int);
+
+	// Where the conversion went below the input, the compare mask is all ones (-1 as an int),
+	// which converts to -1.0f; subtracting it steps that lane up by one.
+	__m128 correction = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmplt_ps(rounded, a.m_value)));
+	rounded = _mm_sub_ps(rounded, correction);
+
+	return vfloat{ blendv_mask_ps(a.m_value, rounded, _mm_castsi128_ps(may_have_fraction)) };
+}
+
+// We need to disable unsafe math optimizations for the key operations used for rounding to nearest.
+// I wish there was a better way.
+// add_sub(a, b) evaluates (a + b) - b as two separate SSE operations. Each branch below
+// supplies the same signature with a compiler-specific way of keeping that sequence intact:
+// a GCC optimize attribute, clang's optnone, or MSVC's float_control(precise, on).
+// The final branch is a plain definition with no such protection.
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optimize("-fno-unsafe-math-optimizations")))
+#elif defined(__clang__)
+inline __m128 add_sub(__m128 a, __m128 b) __attribute__((optnone))
+#elif defined (_MSC_VER)
+#pragma float_control(push)
+#pragma float_control(precise, on)
+inline __m128 add_sub(__m128 a, __m128 b)
+#else
+inline __m128 add_sub(__m128 a, __m128 b)
+#endif
+{
+	return _mm_sub_ps(_mm_add_ps(a, b), b);
+}
+
+// Restore the float_control state pushed above for MSVC.
+#if defined (_MSC_VER)
+#pragma float_control(pop)
+#endif
+
+// SSE2 round-to-nearest. Adds and then subtracts 2^23 carrying the sign of each input lane,
+// which pushes the fractional bits out of the mantissa. Lanes whose magnitude bit pattern is
+// not below that of 2^23 are passed through from the input unchanged.
+CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat& a)
+{
+	__m128i two_pow_23_bits = _mm_castps_si128(_mm_set1_ps(8388608.0f));
+
+	// 2^23 with the sign bit of each input lane copied in
+	__m128i input_sign = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x80000000U));
+	__m128 signed_two_pow_23 = _mm_castsi128_ps(_mm_or_si128(two_pow_23_bits, input_sign));
+
+	// Can't use individual _mm_add_ps/_mm_sub_ps - this will be optimized out with /fp:fast by clang and probably other compilers.
+	// add_sub() performs (a + x) - x with those optimizations disabled.
+	__m128 rounded = add_sub(a.m_value, signed_two_pow_23);
+
+	// Compare |a| with 2^23 using the integer ordering of the float bit patterns
+	__m128i magnitude_bits = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
+	__m128i may_have_fraction = _mm_cmplt_epi32(magnitude_bits, two_pow_23_bits);
+
+	return vfloat{ blendv_mask_ps(a.m_value, rounded, _mm_castsi128_ps(may_have_fraction)) };
+}
+
+#else
+// Non-SSE2 build path: each rounding function maps directly onto a single rounding intrinsic.
+// _MM_FROUND_NO_EXC suppresses floating-point exceptions for the two _mm_round_ps calls.
+CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& v) { return vfloat{ _mm_floor_ps(v.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a) { return vfloat{ _mm_ceil_ps(a.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) }; }
+CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) }; }
+#endif
+
+// Fractional part relative to floor(): a - floor(a).
+CPPSPMD_FORCE_INLINE vfloat frac(const vfloat& a)
+{
+	return a - floor(a);
+}
+
+// Lane-wise remainder: frac(|a / b|) * |b|, negated in lanes where a is negative so the
+// result carries the sign of a.
+CPPSPMD_FORCE_INLINE vfloat fmod(vfloat a, vfloat b)
+{
+	vfloat magnitude = frac(abs(a / b)) * abs(b);
+	return spmd_ternaryf(a < 0, -magnitude, magnitude);
+}
+// Lane-wise sign: -1.0f where a < 0.0f, otherwise 1.0f (so zero, -0.0f and NaN lanes give 1.0f).
+// Both arms of the select used to be 1.0f, which made the function return 1.0f for every input.
+CPPSPMD_FORCE_INLINE vfloat sign(const vfloat& a) { return spmd_ternaryf(a < 0.0f, -1.0f, 1.0f); }
+
+// Lane-wise integer max/min, forwarded to the max_epi32 / min_epi32 helpers.
+CPPSPMD_FORCE_INLINE vint max(const vint& a, const vint& b) { return vint{ max_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint min(const vint& a, const vint& b) {	return vint{ min_epi32(a.m_value, b.m_value) }; }
+
+// Variants that forward to the max_epu32 / min_epu32 helpers instead.
+CPPSPMD_FORCE_INLINE vint maxu(const vint& a, const vint& b) { return vint{ max_epu32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint minu(const vint& a, const vint& b) { return vint{ min_epu32(a.m_value, b.m_value) }; }
+
+// Lane-wise absolute value, forwarded to the abs_epi32 helper.
+CPPSPMD_FORCE_INLINE vint abs(const vint& v) { return vint{ abs_epi32(v.m_value) }; }
+
+// Byte shuffle whose control vector lists source bytes 3,2,1,0 for lane 0, 7,6,5,4 for lane 1,
+// and so on, i.e. the byte order inside each 32-bit lane is reversed.
+// NOTE(review): assumes shuffle_epi8 has _mm_shuffle_epi8 semantics - confirm against the helper.
+CPPSPMD_FORCE_INLINE vint byteswap(const vint& v) {	return vint{ shuffle_epi8(v.m_value, _mm_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,  4,  5,  6,  7,  0,  1,  2,  3)) }; }
+
+// Bit-pattern reinterpretation between vfloat and vint; no numeric conversion takes place.
+CPPSPMD_FORCE_INLINE vint cast_vfloat_to_vint(const vfloat& v) { return vint{ _mm_castps_si128(v.m_value) }; }
+CPPSPMD_FORCE_INLINE vfloat cast_vint_to_vfloat(const vint& v) { return vfloat{ _mm_castsi128_ps(v.m_value) }; }
+
+// Lane-wise clamp of v to [a, b]: min(b, max(v, a)).
+CPPSPMD_FORCE_INLINE vfloat clamp(const vfloat& v, const vfloat& a, const vfloat& b)
+{
+	__m128 raised = _mm_max_ps(v.m_value, a.m_value);
+	return vfloat{ _mm_min_ps(b.m_value, raised) };
+}
+
+// Lane-wise integer clamp of v to [a, b]: min(b, max(v, a)) via the *_epi32 helpers.
+CPPSPMD_FORCE_INLINE vint clamp(const vint& v, const vint& a, const vint& b)
+{
+	__m128i raised = max_epi32(v.m_value, a.m_value);
+	return vint{ min_epi32(b.m_value, raised) };
+}
+
+// Lane-wise a * b + c, computed as a separate multiply followed by an add.
+CPPSPMD_FORCE_INLINE vfloat vfma(const vfloat& a, const vfloat& b, const vfloat& c)
+{
+	__m128 product = _mm_mul_ps(a.m_value, b.m_value);
+	return vfloat{ _mm_add_ps(product, c.m_value) };
+}
+
+// Lane-wise a * b - c, computed as a separate multiply followed by a subtract.
+CPPSPMD_FORCE_INLINE vfloat vfms(const vfloat& a, const vfloat& b, const vfloat& c)
+{
+	__m128 product = _mm_mul_ps(a.m_value, b.m_value);
+	return vfloat{ _mm_sub_ps(product, c.m_value) };
+}
+
+// Lane-wise c - a * b, computed as a separate multiply followed by a subtract.
+CPPSPMD_FORCE_INLINE vfloat vfnma(const vfloat& a, const vfloat& b, const vfloat& c)
+{
+	__m128 product = _mm_mul_ps(a.m_value, b.m_value);
+	return vfloat{ _mm_sub_ps(c.m_value, product) };
+}
+
+// Lane-wise (0 - a * b) - c. The zero is produced by XORing a with itself.
+CPPSPMD_FORCE_INLINE vfloat vfnms(const vfloat& a, const vfloat& b, const vfloat& c)
+{
+	__m128 zero = _mm_xor_ps(a.m_value, a.m_value);
+	__m128 product = _mm_mul_ps(a.m_value, b.m_value);
+	__m128 negated_product = _mm_sub_ps(zero, product);
+	return vfloat{ _mm_sub_ps(negated_product, c.m_value) };
+}
+
+// Linear interpolation: (y - x) * s + x, so s = 0 gives x.
+CPPSPMD_FORCE_INLINE vfloat lerp(const vfloat &x, const vfloat &y, const vfloat &s) { return vfma(y - x, s, x); }
+
+// lint arithmetic. Adding an int broadcasts it and keeps the result a lint; mixing with a
+// float converts both operands to vfloat and returns a vfloat.
+CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; }
+CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; }
+CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); }
+CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); }
+
+// Lane-wise bitwise operators on vint. The (vint, int) overloads broadcast the scalar first.
+// andnot(a, b) computes (~a) & b, following the operand order of _mm_andnot_si128.
+CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); }
+CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); }
+CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); }
+// Lane-wise signed 32-bit comparisons. !=, <= and >= are built by negating ==, > and <.
+CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator<(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator<=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator>=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
+CPPSPMD_FORCE_INLINE vbool operator>(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
+// Lane-wise 32-bit add, subtract and multiply; multiplication goes through the mullo_epi32 helper.
+CPPSPMD_FORCE_INLINE vint operator+(const vint& a, const vint& b) { return vint{ _mm_add_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator-(const vint& a, const vint& b) { return vint{ _mm_sub_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator+(const vint& a, int b) { return a + vint(b); }
+CPPSPMD_FORCE_INLINE vint operator-(const vint& a, int b) { return a - vint(b); }
+CPPSPMD_FORCE_INLINE vint operator+(int a, const vint& b) { return vint(a) + b; }
+CPPSPMD_FORCE_INLINE vint operator-(int a, const vint& b) { return vint(a) - b; }
+CPPSPMD_FORCE_INLINE vint operator*(const vint& a, const vint& b) { return vint{ mullo_epi32(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint operator*(const vint& a, int b) { return a * vint(b); }
+CPPSPMD_FORCE_INLINE vint operator*(int a, const vint& b) { return vint(a) * b; }
+
+// Forwards to the mulhi_epu32 helper (defined elsewhere in the file).
+CPPSPMD_FORCE_INLINE vint mulhiu(const vint& a, const vint& b) { return vint{ mulhi_epu32(a.m_value, b.m_value) }; }
+
+// Lane-wise negation, computed as 0 - v.
+CPPSPMD_FORCE_INLINE vint operator-(const vint& v) { return vint{ _mm_sub_epi32(_mm_setzero_si128(), v.m_value) }; }
+
+// Bitwise NOT via the two's-complement identity ~a == -a - 1.
+CPPSPMD_FORCE_INLINE vint operator~(const vint& a) { return vint{ -a - 1 }; }
+
+// A few of these break the lane-based abstraction model. They are supported in SSE2, so it makes sense to support them and let the user figure it out.
+// Thin wrappers over the unsigned 8-bit SSE2 intrinsics of the same name; each operates on the
+// sixteen bytes of the register rather than on the four 32-bit lanes.
+CPPSPMD_FORCE_INLINE vint adds_epu8(const vint& a, const vint& b) {	return vint{ _mm_adds_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint subs_epu8(const vint& a, const vint& b) { return vint{ _mm_subs_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint avg_epu8(const vint & a, const vint & b) { return vint{ _mm_avg_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint max_epu8(const vint& a, const vint& b) { return vint{ _mm_max_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint min_epu8(const vint& a, const vint& b) { return vint{ _mm_min_epu8(a.m_value, b.m_value) }; }
+CPPSPMD_FORCE_INLINE vint sad_epu8(const vint& a, const vint& b) { return vint{ _mm_sad_epu8(a.m_value, b.m_value) }; }
+
+// Thin wrappers over the SSE2 8-bit integer intrinsics of the same name; operands and results are packed in a vint.
+CPPSPMD_FORCE_INLINE vint add_epi8(const vint& a, const vint& b) { return vint(_mm_add_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint adds_epi8(const vint& a, const vint& b) { return vint(_mm_adds_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint sub_epi8(const vint& a, const vint& b) { return vint(_mm_sub_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint subs_epi8(const vint& a, const vint& b) { return vint(_mm_subs_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint cmpeq_epi8(const vint& a, const vint& b) { return vint(_mm_cmpeq_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint cmpgt_epi8(const vint& a, const vint& b) { return vint(_mm_cmpgt_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint cmplt_epi8(const vint& a, const vint& b) { return vint(_mm_cmplt_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint unpacklo_epi8(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi8(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint unpackhi_epi8(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi8(a.m_value, b.m_value)); }
+// Movemasks return a scalar bitfield: one bit per byte (epi8) or one bit per 32-bit lane (epi32, via the float movemask).
+CPPSPMD_FORCE_INLINE int movemask_epi8(const vint& a) { const int bits = _mm_movemask_epi8(a.m_value); return bits; }
+CPPSPMD_FORCE_INLINE int movemask_epi32(const vint& a) { const __m128 as_ps = _mm_castsi128_ps(a.m_value); return _mm_movemask_ps(as_ps); }
+
+// Unsigned 8-bit comparisons, built from min/max/equality because SSE2 only has signed byte compares.
+// a <= b  <=>  min(a, b) == a
+CPPSPMD_FORCE_INLINE vint cmple_epu8(const vint& a, const vint& b) { const __m128i lesser = _mm_min_epu8(a.m_value, b.m_value); return vint(_mm_cmpeq_epi8(lesser, a.m_value)); }
+// a >= b  <=>  b <= a
+CPPSPMD_FORCE_INLINE vint cmpge_epu8(const vint& a, const vint& b) { return cmple_epu8(b, a); }
+// a > b  <=>  (max(a, b) == a) and not (a == b)
+CPPSPMD_FORCE_INLINE vint cmpgt_epu8(const vint& a, const vint& b)
+{
+	const __m128i is_equal = _mm_cmpeq_epi8(a.m_value, b.m_value);
+	const __m128i a_is_max = _mm_cmpeq_epi8(_mm_max_epu8(a.m_value, b.m_value), a.m_value);
+	return vint(_mm_andnot_si128(is_equal, a_is_max));
+}
+// a < b  <=>  b > a
+CPPSPMD_FORCE_INLINE vint cmplt_epu8(const vint& a, const vint& b) { return cmpgt_epu8(b, a); }
+// |a - b| per byte: one of the two saturating differences is zero, so OR-ing them yields the absolute difference.
+CPPSPMD_FORCE_INLINE vint absdiff_epu8(const vint& a, const vint& b)
+{
+	const __m128i a_minus_b = _mm_subs_epu8(a.m_value, b.m_value);
+	const __m128i b_minus_a = _mm_subs_epu8(b.m_value, a.m_value);
+	return vint(_mm_or_si128(a_minus_b, b_minus_a));
+}
+
+// Byte-granular blend. The mask is first normalised with a signed "less than zero" byte compare, so each byte of
+// the selector passed on is all-ones where the mask byte's top bit is set and zero otherwise. The actual selection
+// is done by the raw-register blendv_epi8 helper defined elsewhere.
+CPPSPMD_FORCE_INLINE vint blendv_epi8(const vint& a, const vint& b, const vint &mask)
+{
+	const __m128i selector = _mm_cmplt_epi8(mask.m_value, _mm_setzero_si128());
+	return vint(blendv_epi8(a.m_value, b.m_value, selector));
+}
+// 32-bit blend: forwards the three raw registers unchanged to the raw-register blendv_epi32 helper defined elsewhere.
+CPPSPMD_FORCE_INLINE vint blendv_epi32(const vint& a, const vint& b, const vint &mask)
+{
+	return vint(blendv_epi32(a.m_value, b.m_value, mask.m_value));
+}
+
+// Thin wrappers over the SSE2 16-bit integer intrinsics of the same name; operands and results are packed in a vint.
+CPPSPMD_FORCE_INLINE vint add_epi16(const vint& a, const vint& b) { return vint(_mm_add_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint adds_epi16(const vint& a, const vint& b) { return vint(_mm_adds_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint adds_epu16(const vint& a, const vint& b) { return vint(_mm_adds_epu16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint avg_epu16(const vint& a, const vint& b) { return vint(_mm_avg_epu16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint sub_epi16(const vint& a, const vint& b) { return vint(_mm_sub_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint subs_epi16(const vint& a, const vint& b) { return vint(_mm_subs_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint subs_epu16(const vint& a, const vint& b) { return vint(_mm_subs_epu16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint mullo_epi16(const vint& a, const vint& b) { return vint(_mm_mullo_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint mulhi_epi16(const vint& a, const vint& b) { return vint(_mm_mulhi_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint mulhi_epu16(const vint& a, const vint& b) { return vint(_mm_mulhi_epu16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint min_epi16(const vint& a, const vint& b) { return vint(_mm_min_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint max_epi16(const vint& a, const vint& b) { return vint(_mm_max_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint madd_epi16(const vint& a, const vint& b) { return vint(_mm_madd_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint cmpeq_epi16(const vint& a, const vint& b) { return vint(_mm_cmpeq_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint cmpgt_epi16(const vint& a, const vint& b) { return vint(_mm_cmpgt_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint cmplt_epi16(const vint& a, const vint& b) { return vint(_mm_cmplt_epi16(a.m_value, b.m_value)); }
+// Narrowing packs: 16-bit inputs from a and b are saturated to signed (packs) or unsigned (packus) bytes.
+CPPSPMD_FORCE_INLINE vint packs_epi16(const vint& a, const vint& b) { return vint(_mm_packs_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint packus_epi16(const vint& a, const vint& b) { return vint(_mm_packus_epi16(a.m_value, b.m_value)); }
+
+// Uniform 16-bit shifts: every 16-bit element of a is shifted by the same count, which the underlying
+// _mm_sll/_mm_sra/_mm_srl intrinsics read from the low 64 bits of b's register.
+CPPSPMD_FORCE_INLINE vint uniform_shift_left_epi16(const vint& a, const vint& b) { return vint(_mm_sll_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint uniform_arith_shift_right_epi16(const vint& a, const vint& b) { return vint(_mm_sra_epi16(a.m_value, b.m_value)); }
+CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b) { return vint(_mm_srl_epi16(a.m_value, b.m_value)); }
+
+// Shift every 16-bit element of a by the count b: left, arithmetic right, and logical right respectively.
+// These expand to the immediate-count intrinsics; both arguments are parenthesised in the expansion.
+#define VINT_SHIFT_LEFT_EPI16(a, b) vint(_mm_slli_epi16((a).m_value, (b)))
+#define VINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srai_epi16((a).m_value, (b)))
+#define VUINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srli_epi16((a).m_value, (b)))
+
+// Return vectors with unspecified contents (wrapping _mm_undefined_si128 / _mm_undefined_ps).
+// Only use the result as a destination that will be fully overwritten.
+CPPSPMD_FORCE_INLINE vint undefined_vint()
+{
+	return vint(_mm_undefined_si128());
+}
+CPPSPMD_FORCE_INLINE vfloat undefined_vfloat()
+{
+	const __m128 junk = _mm_undefined_ps();
+	return vfloat{ junk };
+}
+
+// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane.
+// Build control with VINT_LANE_SHUFFLE_MASK / VINT_LANE_SHUFFLE_MASK_R below.
+#define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control))
+
+// control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int16's in either the high or low 64-bit lane.
+#define VINT_LANE_SHUFFLELO_EPI16(a, control) vint(_mm_shufflelo_epi16((a).m_value, control))
+#define VINT_LANE_SHUFFLEHI_EPI16(a, control) vint(_mm_shufflehi_epi16((a).m_value, control))
+
+// Pack four 2-bit indices into one control byte: a occupies bits 0-1, b bits 2-3, c bits 4-5, d bits 6-7.
+// The _R variant produces the same layout but takes its parameters in reverse order (d first, a last).
+// Arguments are not range-checked; values above 3 spill into the neighbouring field.
+#define VINT_LANE_SHUFFLE_MASK(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
+#define VINT_LANE_SHUFFLE_MASK_R(d, c, b, a) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
+
+// Shift the whole 128-bit register left/right by l bytes (l is passed to the immediate-count intrinsics).
+#define VINT_LANE_SHIFT_LEFT_BYTES(a, l) vint(_mm_slli_si128((a).m_value, l))
+#define VINT_LANE_SHIFT_RIGHT_BYTES(a, l) vint(_mm_srli_si128((a).m_value, l))
+
+// Interleave helpers. "lo" variants interleave elements taken from the low 64 bits of a and b,
+// "hi" variants from the high 64 bits; the suffix gives the element width.
+
+// 8-bit elements
+CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi8(const vint& a, const vint& b) { const __m128i r = _mm_unpacklo_epi8(a.m_value, b.m_value); return vint(r); }
+CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi8(const vint& a, const vint& b) { const __m128i r = _mm_unpackhi_epi8(a.m_value, b.m_value); return vint(r); }
+
+// 16-bit elements
+CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi16(const vint& a, const vint& b) { const __m128i r = _mm_unpacklo_epi16(a.m_value, b.m_value); return vint(r); }
+CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi16(const vint& a, const vint& b) { const __m128i r = _mm_unpackhi_epi16(a.m_value, b.m_value); return vint(r); }
+
+// 32-bit elements
+CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi32(const vint& a, const vint& b) { const __m128i r = _mm_unpacklo_epi32(a.m_value, b.m_value); return vint(r); }
+CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi32(const vint& a, const vint& b) { const __m128i r = _mm_unpackhi_epi32(a.m_value, b.m_value); return vint(r); }
+
+// 64-bit elements
+CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi64(const vint& a, const vint& b) { const __m128i r = _mm_unpacklo_epi64(a.m_value, b.m_value); return vint(r); }
+CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi64(const vint& a, const vint& b) { const __m128i r = _mm_unpackhi_epi64(a.m_value, b.m_value); return vint(r); }
+
+// Broadcast a scalar of the given width into every element of that width in the 128-bit register.
+CPPSPMD_FORCE_INLINE vint vint_set1_epi8(int8_t a) { const __m128i splat = _mm_set1_epi8(a); return vint(splat); }
+CPPSPMD_FORCE_INLINE vint vint_set1_epi16(int16_t a) { const __m128i splat = _mm_set1_epi16(a); return vint(splat); }
+CPPSPMD_FORCE_INLINE vint vint_set1_epi32(int32_t a) { const __m128i splat = _mm_set1_epi32(a); return vint(splat); }
+CPPSPMD_FORCE_INLINE vint vint_set1_epi64(int64_t a) { const __m128i splat = _mm_set1_epi64x(a); return vint(splat); }
+
+// Wraps _mm_mul_epu32: multiplies the unsigned 32-bit values in lanes 0 and 2 of a and b, giving two 64-bit products.
+CPPSPMD_FORCE_INLINE vint mul_epu32(const vint &a, const vint& b) { const __m128i wide = _mm_mul_epu32(a.m_value, b.m_value); return vint(wide); }
+
+// Per-lane signed 32-bit division a / b.
+// SSE has no integer divide, so each operand pair is widened to double, divided, and converted back
+// with truncation (_mm_cvttpd_epi32). Two lanes are processed per step because a register holds two doubles.
+CPPSPMD_FORCE_INLINE vint div_epi32(const vint &a, const vint& b)
+{
+	// Lanes 0 and 1: _mm_cvtepi32_pd converts the two low 32-bit integers.
+	const __m128d num_lo = _mm_cvtepi32_pd(a.m_value);
+	const __m128d den_lo = _mm_cvtepi32_pd(b.m_value);
+
+	// Lanes 2 and 3: bring the high 64 bits down to the low half first.
+	const __m128d num_hi = _mm_cvtepi32_pd(_mm_unpackhi_epi64(a.m_value, a.m_value));
+	const __m128d den_hi = _mm_cvtepi32_pd(_mm_unpackhi_epi64(b.m_value, b.m_value));
+
+	// Divide, then truncate each quotient back to a 32-bit integer (results land in the low 64 bits).
+	const __m128i quot_lo = _mm_cvttpd_epi32(_mm_div_pd(num_lo, den_lo));
+	const __m128i quot_hi = _mm_cvttpd_epi32(_mm_div_pd(num_hi, den_hi));
+
+	// Stitch the two low halves together: lanes 0,1 from quot_lo and lanes 2,3 from quot_hi.
+	return vint(_mm_unpacklo_epi64(quot_lo, quot_hi));
+}
+
+// Per-lane signed remainder. The remainder is computed on the magnitudes of a and b
+// (|a| - (|a| / |b|) * |b|) and then negated in the lanes where a is negative, so the
+// result takes the sign of the dividend.
+CPPSPMD_FORCE_INLINE vint mod_epi32(const vint &a, const vint& b)
+{
+	const vint mag_a = abs(a);
+	const vint mag_b = abs(b);
+
+	const vint quotient = div_epi32(mag_a, mag_b);
+	const vint remainder = mag_a - quotient * mag_b;
+	const vint negated = -remainder;
+
+	return spmd_ternaryi(a < 0, negated, remainder);
+}
+
+// Division and remainder operators for vint. All four forward to div_epi32 / mod_epi32;
+// the overloads taking an int wrap the scalar with vint(int) first.
+CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, const vint& b)
+{
+	const vint quotient = div_epi32(a, b);
+	return quotient;
+}
+
+CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, int b)
+{
+	const vint divisor(b);
+	return div_epi32(a, divisor);
+}
+
+CPPSPMD_FORCE_INLINE vint operator% (const vint& a, const vint& b)
+{
+	const vint remainder = mod_epi32(a, b);
+	return remainder;
+}
+
+CPPSPMD_FORCE_INLINE vint operator% (const vint& a, int b)
+{
+	const vint divisor(b);
+	return mod_epi32(a, divisor);
+}
+
+// Varying shift left: each 32-bit lane of a is shifted by the count in the matching lane of b.
+// SSE2/SSE4.1 have no per-lane variable shift, so the active path multiplies a by 2^b instead.
+// The two "#if 0" / "#elif 0" paths are disabled scalar reference implementations.
+CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, const vint& b)
+{
+#if 0
+	// Disabled reference path: shift each lane in scalar code and reload from an aligned array.
+	CPPSPMD_ALIGN(32) int result[4];
+	result[0] = extract_x(a.m_value) << extract_x(b.m_value);
+	result[1] = extract_y(a.m_value) << extract_y(b.m_value);
+	result[2] = extract_z(a.m_value) << extract_z(b.m_value);
+	result[3] = extract_w(a.m_value) << extract_w(b.m_value);
+
+	return vint{ _mm_load_si128((__m128i*)result) };
+#elif 0
+	// Disabled reference path: shift each lane in scalar code and rebuild the vector with lane inserts.
+	int x = extract_x(a.m_value) << extract_x(b.m_value);
+	int y = extract_y(a.m_value) << extract_y(b.m_value);
+	int z = extract_z(a.m_value) << extract_z(b.m_value);
+	int w = extract_w(a.m_value) << extract_w(b.m_value);
+
+	__m128i v = insert_x(_mm_undefined_si128(), x);
+	v = insert_y(v, y);
+	v = insert_z(v, z);
+	return vint{ insert_w(v, w) };
+#else
+	// What this does: shift left each b lane by 23 bits (to move the shift amount into the FP exponent position), then epi32 add to the integer rep of 1.0f, then cast that to float, then convert that to int to get fast 2^x.
+	// The final multiply a * 2^b keeps the low 32 bits of each product, which matches a << b.
+	// NOTE(review): only meaningful for counts that keep the exponent field valid (small non-negative b); out-of-range counts are not handled here.
+	return a * vint(cast_vint_to_vfloat(vint(_mm_slli_epi32(b.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f))));
+#endif
+}
+
+// Uniform shift left: every lane of a is shifted left by the same scalar count b.
+// _mm_sll_epi32 reads its count from the low 64 bits of the count register, so the broadcast
+// of b is masked with g_x_128 (defined elsewhere; presumably keeps only the lowest 32-bit lane - TODO confirm).
+CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, int b)
+{
+	const __m128i lane_mask = _mm_load_si128((const __m128i *)g_x_128);
+	const __m128i count = _mm_and_si128(_mm_set1_epi32(b), lane_mask);
+	return vint(_mm_sll_epi32(a.m_value, count));
+}
+
+// Uniform arithmetic (sign-propagating) shift right: every lane of a is shifted by the same scalar count b.
+// _mm_sra_epi32 reads its count from the low 64 bits of the count register, so the broadcast
+// of b is masked with g_x_128 (defined elsewhere; presumably keeps only the lowest 32-bit lane - TODO confirm).
+CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, int b)
+{
+	const __m128i lane_mask = _mm_load_si128((const __m128i *)g_x_128);
+	const __m128i count = _mm_and_si128(_mm_set1_epi32(b), lane_mask);
+	return vint(_mm_sra_epi32(a.m_value, count));
+}
+
+// Uniform logical (zero-filling) shift right: every lane of a is shifted by the same scalar count b.
+// _mm_srl_epi32 reads its count from the low 64 bits of the count register, so the broadcast
+// of b is masked with g_x_128 (defined elsewhere; presumably keeps only the lowest 32-bit lane - TODO confirm).
+CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, int b)
+{
+	const __m128i lane_mask = _mm_load_si128((const __m128i *)g_x_128);
+	const __m128i count = _mm_and_si128(_mm_set1_epi32(b), lane_mask);
+	return vint(_mm_srl_epi32(a.m_value, count));
+}
+
+// Varying logical shift right: each lane of a (treated as unsigned) is shifted by the count in the matching lane of b.
+// The active path computes the high 32 bits of a * 2^(32 - b), which equals a >> b for b in [1, 31];
+// lanes where b > 0 is false return a unchanged. The "#if 0" / "#elif 0" paths are disabled scalar references.
+CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, const vint& b)
+{
+#if 0
+	// Disabled reference path: shift each lane in scalar code and reload from an aligned array.
+	CPPSPMD_ALIGN(32) int result[4];
+	result[0] = ((uint32_t)extract_x(a.m_value)) >> extract_x(b.m_value);
+	result[1] = ((uint32_t)extract_y(a.m_value)) >> extract_y(b.m_value);
+	result[2] = ((uint32_t)extract_z(a.m_value)) >> extract_z(b.m_value);
+	result[3] = ((uint32_t)extract_w(a.m_value)) >> extract_w(b.m_value);
+
+	return vint{ _mm_load_si128((__m128i*)result) };
+#elif 0
+	// Disabled reference path: shift each lane in scalar code and rebuild the vector with lane inserts.
+	uint32_t x = ((uint32_t)extract_x(a.m_value)) >> ((uint32_t)extract_x(b.m_value));
+	uint32_t y = ((uint32_t)extract_y(a.m_value)) >> ((uint32_t)extract_y(b.m_value));
+	uint32_t z = ((uint32_t)extract_z(a.m_value)) >> ((uint32_t)extract_z(b.m_value));
+	uint32_t w = ((uint32_t)extract_w(a.m_value)) >> ((uint32_t)extract_w(b.m_value));
+
+	__m128i v = insert_x(_mm_undefined_si128(), x);
+	v = insert_y(v, y);
+	v = insert_z(v, z);
+	return vint{ insert_w(v, w) };
+#else
+	// Equivalent, slower formulation kept for reference:
+	//vint inv_shift = 32 - b;
+	//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));
+	
+	// Build the float 2^(32 - shift) directly from its bit pattern: 0x4f800000 is the float rep of 1.0f (0x3f800000)
+	// plus (32 << 23), i.e. 2^32; subtracting (shift << 23) lowers the exponent by shift.
+	vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));
+
+	// Now convert scale factor to integer.
+	// NOTE(review): for shift == 1 the scale is 2^31, outside the signed int range; this relies on the
+	// vint(vfloat) conversion yielding the bit pattern 0x80000000 in that case - confirm against its definition.
+	vint r = vint(f);
+
+	// mulhi_epu32 (per the original note, built from two _mm_mul_epu32): taking the high half of the
+	// unsigned product a * 2^(32 - shift) emulates a varying logical shift right.
+	vint q(mulhi_epu32(a.m_value, r.m_value));
+
+	// Handle shift amounts of 0: 2^32 cannot be represented as a 32-bit scale, so those lanes return a itself.
+	return spmd_ternaryi(b > 0, q, a);
+#endif
+}
+
+// Varying logical shift right without the zero-count fix-up: same multiply-high scheme as the varying
+// vuint_shift_right, but there is no final select, so lanes with a shift count of 0 are not handled here.
+// As the name indicates, callers are expected to pass only non-zero counts.
+CPPSPMD_FORCE_INLINE vint vuint_shift_right_not_zero(const vint& a, const vint& b)
+{
+	// Equivalent, slower formulation kept for reference:
+	//vint inv_shift = 32 - b;
+	//vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));
+	
+	// Build the float 2^(32 - shift) directly from its bit pattern: 0x4f800000 is the float rep of 1.0f (0x3f800000)
+	// plus (32 << 23), i.e. 2^32; subtracting (shift << 23) lowers the exponent by shift.
+	vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));
+
+	// Now convert scale factor to integer.
+	vint r = vint(f);
+
+	// mulhi_epu32 (per the original note, built from two _mm_mul_epu32): taking the high half of the
+	// unsigned product a * 2^(32 - shift) emulates a varying logical shift right.
+	return vint(mulhi_epu32(a.m_value, r.m_value));
+}
+
+// Varying arithmetic shift right: each lane of a is shifted by the count in the matching lane of b,
+// replicating the sign bit. The active path reuses the varying logical shift; the "#if 0" / "#elif 0"
+// paths are disabled scalar reference implementations.
+CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, const vint& b)
+{
+#if 0
+	// Disabled reference path: shift each lane in scalar code and reload from an aligned array.
+	CPPSPMD_ALIGN(32) int result[4];
+	result[0] = extract_x(a.m_value) >> extract_x(b.m_value);
+	result[1] = extract_y(a.m_value) >> extract_y(b.m_value);
+	result[2] = extract_z(a.m_value) >> extract_z(b.m_value);
+	result[3] = extract_w(a.m_value) >> extract_w(b.m_value);
+
+	return vint{ _mm_load_si128((__m128i*)result) };
+#elif 0
+	// Disabled reference path: shift each lane in scalar code and rebuild the vector with lane inserts.
+	int x = extract_x(a.m_value) >> extract_x(b.m_value);
+	int y = extract_y(a.m_value) >> extract_y(b.m_value);
+	int z = extract_z(a.m_value) >> extract_z(b.m_value);
+	int w = extract_w(a.m_value) >> extract_w(b.m_value);
+
+	__m128i v = insert_x(_mm_undefined_si128(), x);
+	v = insert_y(v, y);
+	v = insert_z(v, z);
+	return vint{ insert_w(v, w) };
+#else
+	// sign_mask is all-ones in lanes where a is negative and zero elsewhere.
+	vint sign_mask(_mm_cmplt_epi32(a.m_value, _mm_setzero_si128()));
+	// XOR with the mask complements negative lanes before the logical shift and complements them back
+	// afterwards, which turns the zeros shifted in at the top into ones for those lanes.
+	vint a_shifted = vuint_shift_right(a ^ sign_mask, b) ^ sign_mask;
+	return a_shifted;
+#endif
+}
+
+// Drop any earlier definitions of these names before redefining them for this backend.
+#undef VINT_SHIFT_LEFT
+#undef VINT_SHIFT_RIGHT
+#undef VUINT_SHIFT_RIGHT
+
+// Shift left/right by a uniform immediate constant
+// VINT_SHIFT_RIGHT is arithmetic (_mm_srai_epi32); VUINT_SHIFT_RIGHT is logical (_mm_srli_epi32).
+#define VINT_SHIFT_LEFT(a, b) vint(_mm_slli_epi32( (a).m_value, (b) ) )
+#define VINT_SHIFT_RIGHT(a, b) vint( _mm_srai_epi32( (a).m_value, (b) ) ) 
+#define VUINT_SHIFT_RIGHT(a, b) vint( _mm_srli_epi32( (a).m_value, (b) ) )
+// Rotate each 32-bit lane of x left by k bits. Both x and k are expanded twice, so pass side-effect-free arguments.
+#define VINT_ROT(x, k) (VINT_SHIFT_LEFT((x), (k)) | VUINT_SHIFT_RIGHT((x), 32 - (k)))
+
+// Signed 32-bit lane-wise comparisons on lint. Only "equal" and "greater than" exist as SSE compares,
+// so < swaps the operands of cmpgt, and <= / >= negate the opposite strict comparison.
+CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, const lint& b) { const __m128i eq = _mm_cmpeq_epi32(a.m_value, b.m_value); return vbool{ eq }; }
+// Mixed lint/int equality: both sides are converted to vint and compared with the vint operator==.
+CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, int b) { const vint lhs(a); const vint rhs(b); return lhs == rhs; }
+CPPSPMD_FORCE_INLINE vbool operator==(int a, const lint& b) { const vint lhs(a); const vint rhs(b); return lhs == rhs; }
+CPPSPMD_FORCE_INLINE vbool operator<(const lint& a, const lint& b) { const __m128i b_gt_a = _mm_cmpgt_epi32(b.m_value, a.m_value); return vbool{ b_gt_a }; }
+CPPSPMD_FORCE_INLINE vbool operator>(const lint& a, const lint& b) { const __m128i a_gt_b = _mm_cmpgt_epi32(a.m_value, b.m_value); return vbool{ a_gt_b }; }
+CPPSPMD_FORCE_INLINE vbool operator<=(const lint& a, const lint& b) { const __m128i a_gt_b = _mm_cmpgt_epi32(a.m_value, b.m_value); return !vbool{ a_gt_b }; }
+CPPSPMD_FORCE_INLINE vbool operator>=(const lint& a, const lint& b) { const __m128i b_gt_a = _mm_cmpgt_epi32(b.m_value, a.m_value); return !vbool{ b_gt_a }; }
+
+// Runtime-indexed lane extraction. Each overload spills the register to a 16-byte aligned scratch array
+// and returns element `instance`, which must be in [0, 3]. The assert now also rejects negative indices,
+// which previously passed the `instance < 4` check and read outside the scratch array.
+CPPSPMD_FORCE_INLINE float extract(const vfloat& v, int instance)
+{
+	assert(instance >= 0 && instance < 4);
+	CPPSPMD_ALIGN(16) float values[4];
+	_mm_store_ps(values, v.m_value);
+	return values[instance];
+}
+CPPSPMD_FORCE_INLINE int extract(const vint& v, int instance)
+{
+	assert(instance >= 0 && instance < 4);
+	CPPSPMD_ALIGN(16) int values[4];
+	_mm_store_si128((__m128i*)values, v.m_value);
+	return values[instance];
+}
+CPPSPMD_FORCE_INLINE int extract(const lint& v, int instance)
+{
+	assert(instance >= 0 && instance < 4);
+	CPPSPMD_ALIGN(16) int values[4];
+	_mm_store_si128((__m128i*)values, v.m_value);
+	return values[instance];
+}
+// For vbool the lane is reported as true when any bit of the 32-bit lane is set.
+CPPSPMD_FORCE_INLINE bool extract(const vbool& v, int instance)
+{
+	assert(instance >= 0 && instance < 4);
+	CPPSPMD_ALIGN(16) int values[4];
+	_mm_store_si128((__m128i*)values, v.m_value);
+	return values[instance] != 0;
+}
+
+// Drop any earlier definitions before providing this backend's versions.
+#undef VINT_EXTRACT
+#undef VBOOL_EXTRACT
+#undef VFLOAT_EXTRACT
+
+#if CPPSPMD_SSE2
+// Pass in an immediate constant and the compiler will optimize these expressions.
+// Each macro is a chain of conditionals that dispatches to the per-lane extract helpers (x, y, z, w);
+// any instance other than 0, 1 or 2 selects lane w. `instance` is expanded several times.
+#define VINT_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
+#define VBOOL_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
+#define VFLOAT_EXTRACT(v, instance) ( ((instance) == 0) ? extractf_ps_x((v).m_value) : (((instance) == 1) ? extractf_ps_y((v).m_value) : (((instance) == 2) ? extractf_ps_z((v).m_value) : extractf_ps_w((v).m_value))) )
+#else
+// Reinterprets the 32 bits of v as an IEEE float without converting the value.
+// Implemented with register moves/casts instead of dereferencing a float* that points at an int,
+// which would violate the strict aliasing rule.
+CPPSPMD_FORCE_INLINE float cast_int_bits_as_float(int v) { return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(v))); }
+
+// Extraction through the immediate-index intrinsics; _mm_extract_ps returns the lane's raw bits as an int,
+// hence the bit-cast for the float variant.
+#define VINT_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
+#define VBOOL_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
+#define VFLOAT_EXTRACT(v, instance) cast_int_bits_as_float(_mm_extract_ps((v).m_value, instance))
+#endif
+
+// Overwrites lane `instance` (must be in [0, 3]) of v with f and returns v.
+// The register is spilled to an aligned scratch array, patched, and reloaded.
+// The assert also rejects negative indices, which would otherwise write outside the scratch array.
+CPPSPMD_FORCE_INLINE vfloat &insert(vfloat& v, int instance, float f)
+{
+	assert(instance >= 0 && instance < 4);
+	CPPSPMD_ALIGN(16) float values[4];
+	_mm_store_ps(values, v.m_value);
+	values[instance] = f;
+	v.m_value = _mm_load_ps(values);
+	return v;
+}
+
+// Overwrites lane `instance` (must be in [0, 3]) of v with i and returns v.
+// The register is spilled to an aligned scratch array, patched, and reloaded.
+// The assert also rejects negative indices, which would otherwise write outside the scratch array.
+CPPSPMD_FORCE_INLINE vint &insert(vint& v, int instance, int i)
+{
+	assert(instance >= 0 && instance < 4);
+	CPPSPMD_ALIGN(16) int values[4];
+	_mm_store_si128((__m128i *)values, v.m_value);
+	values[instance] = i;
+	v.m_value = _mm_load_si128((__m128i *)values);
+	return v;
+}
+
+// Loads a 16-byte lookup table into a vint. The load is unaligned, so pTab needs no particular alignment.
+CPPSPMD_FORCE_INLINE vint init_lookup4(const uint8_t pTab[16])
+{
+	return vint(_mm_loadu_si128((const __m128i*)pTab));
+}
+
+// Byte-wise lookup into a 16-entry table: the bytes of a are used as shuffle indices into table
+// through the shuffle_epi8 helper (defined elsewhere).
+CPPSPMD_FORCE_INLINE vint table_lookup4_8(const vint& a, const vint& table)
+{
+	const __m128i looked_up = shuffle_epi8(table.m_value, a.m_value);
+	return vint(looked_up);
+}
+
+// Loads a 32-byte lookup table as two 16-byte halves: bytes 0-15 into table_0 and bytes 16-31 into table_1.
+// Both loads are unaligned.
+CPPSPMD_FORCE_INLINE void init_lookup5(const uint8_t pTab[32], vint& table_0, vint& table_1)
+{
+	const __m128i* src = (const __m128i*)pTab;
+	const __m128i low_half = _mm_loadu_si128(src);
+	const __m128i high_half = _mm_loadu_si128((const __m128i*)(pTab + 16));
+	table_0.m_value = low_half;
+	table_1.m_value = high_half;
+}
+
+// Lookup into a 32-entry byte table split across table_0 (entries 0-15) and table_1 (entries 16-31).
+// Both halves are shuffled with the same indices, then one result is chosen per 32-bit lane.
+CPPSPMD_FORCE_INLINE vint table_lookup5_8(const vint& a, const vint& table_0, const vint& table_1)
+{
+	const __m128i from_low = shuffle_epi8(table_0.m_value, a.m_value);
+	const __m128i from_high = shuffle_epi8(table_1.m_value, a.m_value);
+
+	// Shifting left by 27 moves bit 4 of each 32-bit lane (the "upper half" index bit) into bit 31,
+	// where the blendv_ps helper (defined elsewhere; presumably sign-bit driven) reads its selector.
+	const __m128i selector = _mm_slli_epi32(a.m_value, 31 - 4);
+
+	const __m128 chosen = blendv_ps(_mm_castsi128_ps(from_low), _mm_castsi128_ps(from_high), _mm_castsi128_ps(selector));
+
+	return vint(_mm_castps_si128(chosen));
+}
+
+// Loads a 64-byte lookup table as four consecutive 16-byte quarters into table_0 .. table_3.
+// All loads are unaligned.
+CPPSPMD_FORCE_INLINE void init_lookup6(const uint8_t pTab[64], vint& table_0, vint& table_1, vint& table_2, vint& table_3)
+{
+	const __m128i quarter_0 = _mm_loadu_si128((const __m128i*)pTab);
+	const __m128i quarter_1 = _mm_loadu_si128((const __m128i*)(pTab + 16));
+	const __m128i quarter_2 = _mm_loadu_si128((const __m128i*)(pTab + 32));
+	const __m128i quarter_3 = _mm_loadu_si128((const __m128i*)(pTab + 48));
+
+	table_0.m_value = quarter_0;
+	table_1.m_value = quarter_1;
+	table_2.m_value = quarter_2;
+	table_3.m_value = quarter_3;
+}
+
+// Lookup into a 64-entry byte table split across four 16-entry quarters.
+// Bit 4 of each 32-bit lane index picks within a pair of quarters and bit 5 picks between the pairs;
+// each selection moves the relevant bit into bit 31 for the blendv_ps helper (defined elsewhere).
+CPPSPMD_FORCE_INLINE vint table_lookup6_8(const vint& a, const vint& table_0, const vint& table_1, const vint& table_2, const vint& table_3)
+{
+	const __m128i idx = a.m_value;
+
+	// Selector for bit 4 (shift by 27) shared by both pair-level blends.
+	const __m128 sel_bit4 = _mm_castsi128_ps(_mm_slli_epi32(idx, 31 - 4));
+
+	// Quarters 0/1.
+	const __m128i q0 = shuffle_epi8(table_0.m_value, idx);
+	const __m128i q1 = shuffle_epi8(table_1.m_value, idx);
+	const __m128 lower_pair = blendv_ps(_mm_castsi128_ps(q0), _mm_castsi128_ps(q1), sel_bit4);
+
+	// Quarters 2/3.
+	const __m128i q2 = shuffle_epi8(table_2.m_value, idx);
+	const __m128i q3 = shuffle_epi8(table_3.m_value, idx);
+	const __m128 upper_pair = blendv_ps(_mm_castsi128_ps(q2), _mm_castsi128_ps(q3), sel_bit4);
+
+	// Selector for bit 5 (shift by 26) chooses between the two pairs.
+	const __m128 sel_bit5 = _mm_castsi128_ps(_mm_slli_epi32(idx, 31 - 5));
+	const __m128 chosen = blendv_ps(lower_pair, upper_pair, sel_bit5);
+
+	return vint(_mm_castps_si128(chosen));
+}
+
+// spmd_call<Kernel>(args...): default-constructs the kernel, initialises it with every lane enabled
+// (exec_mask::all_on()), and invokes its _call member with the forwarded arguments.
+#if 0
+// Disabled variant that returns whatever _call returns (decltype(auto) return type).
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(exec_mask::all_on());
+	return kernel._call(std::forward<Args>(args)...);
+}
+#else
+// Active variant: any value returned by _call is discarded.
+template<typename SPMDKernel, typename... Args>
+CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args)
+{
+	SPMDKernel kernel;
+	kernel.init(exec_mask::all_on());
+	kernel._call(std::forward<Args>(args)...);
+}
+#endif
+
+// Resets the kernel's masks for a new invocation: both the current and the kernel-wide execution mask
+// are set to kernel_exec, and the continue mask is cleared. Debug builds also clear the in-loop flag.
+CPPSPMD_FORCE_INLINE void spmd_kernel::init(const spmd_kernel::exec_mask& kernel_exec)
+{
+#ifdef _DEBUG
+	m_in_loop = false;
+#endif
+
+	m_kernel_exec = kernel_exec;
+	m_exec = kernel_exec;
+	m_continue_mask = exec_mask::all_off();
+}
+
+// Masked scatter: for every lane enabled in the current execution mask, writes that lane of src to
+// dst.m_pValue[index], where index is the lane's entry in dst.m_vindex. Disabled lanes write nothing.
+// Returns dst.
+CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref& dst, const vfloat& src)
+{
+	// Spill the values and their destination indices to aligned scratch arrays.
+	CPPSPMD_ALIGN(16) float lane_value[4];
+	_mm_store_ps(lane_value, src.m_value);
+
+	CPPSPMD_ALIGN(16) int lane_index[4];
+	_mm_store_si128((__m128i*)lane_index, dst.m_vindex);
+
+	// One bit per lane, taken from the top bit of each 32-bit lane of the execution mask.
+	const int active = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+	for (int lane = 0; lane < 4; ++lane)
+	{
+		if ((active >> lane) & 1)
+			dst.m_pValue[lane_index[lane]] = lane_value[lane];
+	}
+	return dst;
+}
+
+// Unmasked scatter: writes every lane of src to dst.m_pValue[index], where index is the lane's entry
+// in dst.m_vindex, regardless of the execution mask. Lanes are written in order 0..3. Returns dst.
+CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref& dst, const vfloat& src)
+{
+	CPPSPMD_ALIGN(16) float lane_value[4];
+	_mm_store_ps(lane_value, src.m_value);
+
+	CPPSPMD_ALIGN(16) int lane_index[4];
+	_mm_store_si128((__m128i*)lane_index, dst.m_vindex);
+
+	int lane = 0;
+	while (lane < 4)
+	{
+		dst.m_pValue[lane_index[lane]] = lane_value[lane];
+		++lane;
+	}
+	return dst;
+}
+
+// Rvalue-reference overload of the masked scatter: for every lane enabled in the current execution mask,
+// writes that lane of src to dst.m_pValue[index], with index taken from dst.m_vindex. Returns dst.
+// NOTE(review): the returned reference refers to the caller's temporary; it should not be kept past the full expression.
+CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref&& dst, const vfloat& src)
+{
+	CPPSPMD_ALIGN(16) float lane_value[4];
+	_mm_store_ps(lane_value, src.m_value);
+
+	CPPSPMD_ALIGN(16) int lane_index[4];
+	_mm_store_si128((__m128i*)lane_index, dst.m_vindex);
+
+	// Bit n of enabled is set when lane n is active.
+	const int enabled = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
+	for (int lane = 0; lane < 4; ++lane)
+	{
+		const bool lane_on = (enabled & (1 << lane)) != 0;
+		if (lane_on)
+			dst.m_pValue[lane_index[lane]] = lane_value[lane];
+	}
+	return dst;
+}
+
+// Rvalue-reference overload of the unmasked scatter: writes every lane of src to dst.m_pValue[index],
+// with index taken from dst.m_vindex, ignoring the execution mask. Lanes are written in order 0..3. Returns dst.
+// NOTE(review): the returned reference refers to the caller's temporary; it should not be kept past the full expression.
+CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref&& dst, const vfloat& src)
+{
+	CPPSPMD_ALIGN(16) float lane_value[4];
+	_mm_store_ps(lane_value, src.m_value);
+
+	CPPSPMD_ALIGN(16) int lane_index[4];
+	_mm_store_si128((__m128i*)lane_index, dst.m_vindex);
+
+	for (int lane = 0; lane != 4; ++lane)
+	{
+		dst.m_pValue[lane_index[lane]] = lane_value[lane];
+	}
+	return dst;
+}
+
+#include "cppspmd_flow.h"
+#include "cppspmd_math.h"
+
+} // namespace cppspmd_sse41
+
diff --git a/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
new file mode 100644
index 0000000000..0dfb28b88f
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/cppspmd_type_aliases.h
@@ -0,0 +1,47 @@
+// cppspmd_type_aliases.h
+// Do not include this file directly
+//
+// Copyright 2020-2021 Binomial LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// Guarded so the aliases are declared only once even if this file is pulled in repeatedly.
+#ifndef CPPSPMD_TYPES
+#define CPPSPMD_TYPES
+
+// Re-export the SPMD types from CPPSPMD (defined by the including code; presumably the active backend
+// namespace) under unqualified names in the scope where this file is included.
+using exec_mask = CPPSPMD::exec_mask;
+
+// Integer types: the 16-bit family when CPPSPMD_INT16 is non-zero, otherwise the default integer family.
+#if CPPSPMD_INT16
+using vint16 = CPPSPMD::vint16;
+using int16_lref = CPPSPMD::int16_lref;
+using cint16_vref = CPPSPMD::cint16_vref;
+using int16_vref = CPPSPMD::int16_vref;
+using lint16 = CPPSPMD::lint16;
+using vint16_vref = CPPSPMD::vint16_vref;
+#else
+using vint = CPPSPMD::vint;
+using int_lref = CPPSPMD::int_lref;
+using cint_vref = CPPSPMD::cint_vref;
+using int_vref = CPPSPMD::int_vref;
+using lint = CPPSPMD::lint;
+using vint_vref = CPPSPMD::vint_vref;
+#endif
+
+// Boolean and float types are aliased unconditionally.
+using vbool = CPPSPMD::vbool;
+using vfloat = CPPSPMD::vfloat;
+using float_lref = CPPSPMD::float_lref;
+using float_vref = CPPSPMD::float_vref;
+using vfloat_vref = CPPSPMD::vfloat_vref;
+
+#endif // CPPSPMD_TYPES
diff --git a/thirdparty/basis_universal/encoder/jpgd.cpp b/thirdparty/basis_universal/encoder/jpgd.cpp
new file mode 100644
index 0000000000..460834409d
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/jpgd.cpp
@@ -0,0 +1,3241 @@
+// jpgd.cpp - C++ class for JPEG decompression. Written by Richard Geldreich <richgel99@gmail.com> between 1994-2020.
+// Supports progressive and baseline sequential JPEG image files, and the most common chroma subsampling factors: Y, H1V1, H2V1, H1V2, and H2V2.
+// Supports box and linear chroma upsampling.
+//
+// Released under two licenses. You are free to choose which license you want:
+// License 1: 
+// Public Domain
+//
+// License 2:
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Alex Evans: Linear memory allocator (taken from jpge.h).
+// v1.04, May. 19, 2012: Code tweaks to fix VS2008 static code analysis warnings
+// v2.00, March 20, 2020: Fuzzed with zzuf and afl. Fixed several issues, converted most assert()'s to run-time checks. Added chroma upsampling. Removed freq. domain upsampling. gcc/clang warnings.
+//
+#ifdef _MSC_VER
+#ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL
+#if defined(_DEBUG) || defined(DEBUG)
+#define _ITERATOR_DEBUG_LEVEL 1
+#define _SECURE_SCL 1
+#else
+#define _SECURE_SCL 0
+#define _ITERATOR_DEBUG_LEVEL 0
+#endif
+#endif
+#endif
+
+#include "jpgd.h"
+#include <string.h>
+#include <algorithm>
+#include <assert.h>
+
+#ifdef _MSC_VER
+#pragma warning (disable : 4611) // warning C4611: interaction between '_setjmp' and C++ object destruction is non-portable
+#endif
+
+#define JPGD_TRUE (1)
+#define JPGD_FALSE (0)
+
+#define JPGD_MAX(a,b) (((a)>(b)) ? (a) : (b))
+#define JPGD_MIN(a,b) (((a)<(b)) ? (a) : (b))
+
+namespace jpgd {
+	// Allocation hooks for this file: thin wrappers over malloc()/free() (used by alloc() and free_all_blocks() below).
+	static inline void* jpgd_malloc(size_t nSize) { return malloc(nSize); }
+	static inline void jpgd_free(void* p) { free(p); }
+
+	// DCT coefficients are stored in this sequence.
+	static int g_ZAG[64] = { 0,1,8,16,9,2,3,10,17,24,32,25,18,11,4,5,12,19,26,33,40,48,41,34,27,20,13,6,7,14,21,28,35,42,49,56,57,50,43,36,29,22,15,23,30,37,44,51,58,59,52,45,38,31,39,46,53,60,61,54,47,55,62,63 };
+
+	enum JPEG_MARKER
+	{
+		M_SOF0 = 0xC0, M_SOF1 = 0xC1, M_SOF2 = 0xC2, M_SOF3 = 0xC3, M_SOF5 = 0xC5, M_SOF6 = 0xC6, M_SOF7 = 0xC7, M_JPG = 0xC8,
+		M_SOF9 = 0xC9, M_SOF10 = 0xCA, M_SOF11 = 0xCB, M_SOF13 = 0xCD, M_SOF14 = 0xCE, M_SOF15 = 0xCF, M_DHT = 0xC4, M_DAC = 0xCC,
+		M_RST0 = 0xD0, M_RST1 = 0xD1, M_RST2 = 0xD2, M_RST3 = 0xD3, M_RST4 = 0xD4, M_RST5 = 0xD5, M_RST6 = 0xD6, M_RST7 = 0xD7,
+		M_SOI = 0xD8, M_EOI = 0xD9, M_SOS = 0xDA, M_DQT = 0xDB, M_DNL = 0xDC, M_DRI = 0xDD, M_DHP = 0xDE, M_EXP = 0xDF,
+		M_APP0 = 0xE0, M_APP15 = 0xEF, M_JPG0 = 0xF0, M_JPG13 = 0xFD, M_COM = 0xFE, M_TEM = 0x01, M_ERROR = 0x100, RST0 = 0xD0
+	};
+
+	enum JPEG_SUBSAMPLING { JPGD_GRAYSCALE = 0, JPGD_YH1V1, JPGD_YH2V1, JPGD_YH1V2, JPGD_YH2V2 };
+
+#define CONST_BITS  13
+#define PASS1_BITS  2
+#define SCALEDONE ((int32)1)
+
+#define FIX_0_298631336  ((int32)2446)        /* FIX(0.298631336) */
+#define FIX_0_390180644  ((int32)3196)        /* FIX(0.390180644) */
+#define FIX_0_541196100  ((int32)4433)        /* FIX(0.541196100) */
+#define FIX_0_765366865  ((int32)6270)        /* FIX(0.765366865) */
+#define FIX_0_899976223  ((int32)7373)        /* FIX(0.899976223) */
+#define FIX_1_175875602  ((int32)9633)        /* FIX(1.175875602) */
+#define FIX_1_501321110  ((int32)12299)       /* FIX(1.501321110) */
+#define FIX_1_847759065  ((int32)15137)       /* FIX(1.847759065) */
+#define FIX_1_961570560  ((int32)16069)       /* FIX(1.961570560) */
+#define FIX_2_053119869  ((int32)16819)       /* FIX(2.053119869) */
+#define FIX_2_562915447  ((int32)20995)       /* FIX(2.562915447) */
+#define FIX_3_072711026  ((int32)25172)       /* FIX(3.072711026) */
+
+#define DESCALE(x,n)  (((x) + (SCALEDONE << ((n)-1))) >> (n))
+#define DESCALE_ZEROSHIFT(x,n)  (((x) + (128 << (n)) + (SCALEDONE << ((n)-1))) >> (n))
+
+#define MULTIPLY(var, cnst)  ((var) * (cnst))
+
+#define CLAMP(i) ((static_cast<uint>(i) > 255) ? (((~i) >> 31) & 0xFF) : (i))
+	// Left shift performed on the uint32_t representation of val and converted back to int (avoids shifting a signed value directly).
+	static inline int left_shifti(int val, uint32_t bits)
+	{
+		return (int)((uint32_t)val << bits);
+	}
+
+	// 1D IDCT over one row of 8 coefficients, specialized at compile time on NONZERO_COLS: only pSrc[0..NONZERO_COLS-1] are read, the remaining inputs are taken as 0. Writes pTemp[0..7].
+	template <int NONZERO_COLS>
+	struct Row
+	{
+		static void idct(int* pTemp, const jpgd_block_t* pSrc)
+		{
+			// ACCESS_COL() will be optimized at compile time to either an array access, or 0. Good compilers will then optimize out muls against 0.
+#define ACCESS_COL(x) (((x) < NONZERO_COLS) ? (int)pSrc[x] : 0)
+			// Even-indexed inputs: 2 and 6 first, then 0 and 4.
+			const int z2 = ACCESS_COL(2), z3 = ACCESS_COL(6);
+
+			const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+			const int tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
+			const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+
+			const int tmp0 = left_shifti(ACCESS_COL(0) + ACCESS_COL(4), CONST_BITS);
+			const int tmp1 = left_shifti(ACCESS_COL(0) - ACCESS_COL(4), CONST_BITS);
+
+			const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+			// Odd-indexed inputs: 7, 5, 3 and 1.
+			const int atmp0 = ACCESS_COL(7), atmp1 = ACCESS_COL(5), atmp2 = ACCESS_COL(3), atmp3 = ACCESS_COL(1);
+
+			const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+			const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+			const int az1 = MULTIPLY(bz1, -FIX_0_899976223);
+			const int az2 = MULTIPLY(bz2, -FIX_2_562915447);
+			const int az3 = MULTIPLY(bz3, -FIX_1_961570560) + bz5;
+			const int az4 = MULTIPLY(bz4, -FIX_0_390180644) + bz5;
+
+			const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+			const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+			const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+			const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+			// Only CONST_BITS - PASS1_BITS bits of scale are removed (with rounding), so outputs keep PASS1_BITS extra bits.
+			pTemp[0] = DESCALE(tmp10 + btmp3, CONST_BITS - PASS1_BITS);
+			pTemp[7] = DESCALE(tmp10 - btmp3, CONST_BITS - PASS1_BITS);
+			pTemp[1] = DESCALE(tmp11 + btmp2, CONST_BITS - PASS1_BITS);
+			pTemp[6] = DESCALE(tmp11 - btmp2, CONST_BITS - PASS1_BITS);
+			pTemp[2] = DESCALE(tmp12 + btmp1, CONST_BITS - PASS1_BITS);
+			pTemp[5] = DESCALE(tmp12 - btmp1, CONST_BITS - PASS1_BITS);
+			pTemp[3] = DESCALE(tmp13 + btmp0, CONST_BITS - PASS1_BITS);
+			pTemp[4] = DESCALE(tmp13 - btmp0, CONST_BITS - PASS1_BITS);
+		}
+	};
+
+	// Row<0>: a row with no non-zero coefficients. idct() deliberately does nothing, so pTemp is left unwritten.
+	template <>
+	struct Row<0>
+	{
+		static void idct(int* /*pTemp*/, const jpgd_block_t* /*pSrc*/)
+		{
+			// Intentionally empty.
+		}
+	};
+
+	// Row<1>: only the first coefficient of the row is read, so all eight outputs are that value shifted left by PASS1_BITS.
+	template <>
+	struct Row<1>
+	{
+		// pTemp receives the eight outputs; of pSrc only element 0 is read.
+		static void idct(int* pTemp, const jpgd_block_t* pSrc)
+		{
+			// Same scale as the general Row<N> output, which keeps PASS1_BITS extra bits.
+			const int dc_scaled = left_shifti(pSrc[0], PASS1_BITS);
+
+			for (int col = 0; col < 8; col++)
+			{
+				pTemp[col] = dc_scaled;
+			}
+		}
+	};
+
+
+	// 1D IDCT down one column of the pass-1 output (pTemp, stride 8), specialized at compile time on NONZERO_ROWS: only the first NONZERO_ROWS rows are read, the rest are taken as 0. Writes 8 clamped bytes to pDst_ptr at stride 8.
+	template <int NONZERO_ROWS>
+	struct Col
+	{
+		static void idct(uint8* pDst_ptr, const int* pTemp)
+		{
+			// ACCESS_ROW() will be optimized at compile time to either an array access, or 0.
+#define ACCESS_ROW(x) (((x) < NONZERO_ROWS) ? pTemp[x * 8] : 0)
+			// Even-indexed inputs: 2 and 6 first, then 0 and 4.
+			const int z2 = ACCESS_ROW(2);
+			const int z3 = ACCESS_ROW(6);
+
+			const int z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
+			const int tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
+			const int tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
+
+			const int tmp0 = left_shifti(ACCESS_ROW(0) + ACCESS_ROW(4), CONST_BITS);
+			const int tmp1 = left_shifti(ACCESS_ROW(0) - ACCESS_ROW(4), CONST_BITS);
+
+			const int tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3, tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
+			// Odd-indexed inputs: 7, 5, 3 and 1.
+			const int atmp0 = ACCESS_ROW(7), atmp1 = ACCESS_ROW(5), atmp2 = ACCESS_ROW(3), atmp3 = ACCESS_ROW(1);
+
+			const int bz1 = atmp0 + atmp3, bz2 = atmp1 + atmp2, bz3 = atmp0 + atmp2, bz4 = atmp1 + atmp3;
+			const int bz5 = MULTIPLY(bz3 + bz4, FIX_1_175875602);
+
+			const int az1 = MULTIPLY(bz1, -FIX_0_899976223);
+			const int az2 = MULTIPLY(bz2, -FIX_2_562915447);
+			const int az3 = MULTIPLY(bz3, -FIX_1_961570560) + bz5;
+			const int az4 = MULTIPLY(bz4, -FIX_0_390180644) + bz5;
+
+			const int btmp0 = MULTIPLY(atmp0, FIX_0_298631336) + az1 + az3;
+			const int btmp1 = MULTIPLY(atmp1, FIX_2_053119869) + az2 + az4;
+			const int btmp2 = MULTIPLY(atmp2, FIX_3_072711026) + az2 + az3;
+			const int btmp3 = MULTIPLY(atmp3, FIX_1_501321110) + az1 + az4;
+			// DESCALE_ZEROSHIFT removes CONST_BITS + PASS1_BITS + 3 bits of scale with rounding and adds 128; CLAMP then limits the result to 0..255.
+			int i = DESCALE_ZEROSHIFT(tmp10 + btmp3, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 0] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp10 - btmp3, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 7] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp11 + btmp2, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 1] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp11 - btmp2, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 6] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp12 + btmp1, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 2] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp12 - btmp1, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 5] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp13 + btmp0, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 3] = (uint8)CLAMP(i);
+
+			i = DESCALE_ZEROSHIFT(tmp13 - btmp0, CONST_BITS + PASS1_BITS + 3);
+			pDst_ptr[8 * 4] = (uint8)CLAMP(i);
+		}
+	};
+
+	// Col<1>: only the first row of the column is read, so the whole output column is a single clamped value.
+	template <>
+	struct Col<1>
+	{
+		static void idct(uint8* pDst_ptr, const int* pTemp)
+		{
+			// Remove PASS1_BITS + 3 bits of scale with rounding, add the 128 offset, then clamp to 0..255.
+			int level = DESCALE_ZEROSHIFT(pTemp[0], PASS1_BITS + 3);
+			const uint8 pixel = (uint8)CLAMP(level);
+
+			// Same byte for all eight rows of this column (row stride is 8 bytes).
+			for (int row = 0; row < 8; row++)
+			{
+				pDst_ptr[row * 8] = pixel;
+			}
+		}
+	};
+
+	static const uint8 s_idct_row_table[] =
+	{
+	  1,0,0,0,0,0,0,0, 2,0,0,0,0,0,0,0, 2,1,0,0,0,0,0,0, 2,1,1,0,0,0,0,0, 2,2,1,0,0,0,0,0, 3,2,1,0,0,0,0,0, 4,2,1,0,0,0,0,0, 4,3,1,0,0,0,0,0,
+	  4,3,2,0,0,0,0,0, 4,3,2,1,0,0,0,0, 4,3,2,1,1,0,0,0, 4,3,2,2,1,0,0,0, 4,3,3,2,1,0,0,0, 4,4,3,2,1,0,0,0, 5,4,3,2,1,0,0,0, 6,4,3,2,1,0,0,0,
+	  6,5,3,2,1,0,0,0, 6,5,4,2,1,0,0,0, 6,5,4,3,1,0,0,0, 6,5,4,3,2,0,0,0, 6,5,4,3,2,1,0,0, 6,5,4,3,2,1,1,0, 6,5,4,3,2,2,1,0, 6,5,4,3,3,2,1,0,
+	  6,5,4,4,3,2,1,0, 6,5,5,4,3,2,1,0, 6,6,5,4,3,2,1,0, 7,6,5,4,3,2,1,0, 8,6,5,4,3,2,1,0, 8,7,5,4,3,2,1,0, 8,7,6,4,3,2,1,0, 8,7,6,5,3,2,1,0,
+	  8,7,6,5,4,2,1,0, 8,7,6,5,4,3,1,0, 8,7,6,5,4,3,2,0, 8,7,6,5,4,3,2,1, 8,7,6,5,4,3,2,2, 8,7,6,5,4,3,3,2, 8,7,6,5,4,4,3,2, 8,7,6,5,5,4,3,2,
+	  8,7,6,6,5,4,3,2, 8,7,7,6,5,4,3,2, 8,8,7,6,5,4,3,2, 8,8,8,6,5,4,3,2, 8,8,8,7,5,4,3,2, 8,8,8,7,6,4,3,2, 8,8,8,7,6,5,3,2, 8,8,8,7,6,5,4,2,
+	  8,8,8,7,6,5,4,3, 8,8,8,7,6,5,4,4, 8,8,8,7,6,5,5,4, 8,8,8,7,6,6,5,4, 8,8,8,7,7,6,5,4, 8,8,8,8,7,6,5,4, 8,8,8,8,8,6,5,4, 8,8,8,8,8,7,5,4,
+	  8,8,8,8,8,7,6,4, 8,8,8,8,8,7,6,5, 8,8,8,8,8,7,6,6, 8,8,8,8,8,7,7,6, 8,8,8,8,8,8,7,6, 8,8,8,8,8,8,8,6, 8,8,8,8,8,8,8,7, 8,8,8,8,8,8,8,8,
+	};
+
+	static const uint8 s_idct_col_table[] = 
+	{ 
+		1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 
+		7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 
+	};
+
+	// Scalar "fast pathing" IDCT. Reads the 64 coefficients at pSrc_ptr (8 rows of 8) and writes 64 bytes of samples to pDst_ptr (8 rows of 8).
+	// block_max_zag must be in [1, 64] (asserted); via s_idct_row_table / s_idct_col_table it selects how many columns/rows each 1D pass reads.
+	// pSrc_ptr is not modified; pDst_ptr must have room for 64 bytes.
+	static void idct(const jpgd_block_t* pSrc_ptr, uint8* pDst_ptr, int block_max_zag)
+	{
+		assert(block_max_zag >= 1);
+		assert(block_max_zag <= 64);
+
+		if (block_max_zag <= 1)
+		{
+			// Only pSrc_ptr[0] contributes, so all 64 output bytes share one value: the term is rounded and divided by 8,
+			// offset by 128 and clamped to 0..255. memset() stores it without the int-typed writes through a uint8*
+			// (aliasing/alignment hazard) and without the signed (k << 16) that the previous code relied on.
+			int k = ((pSrc_ptr[0] + 4) >> 3) + 128;
+			k = CLAMP(k);
+			memset(pDst_ptr, k, 64);
+			return;
+		}
+
+		int temp[64]; // pass-1 results; rows dispatched to Row<0> stay unwritten
+
+		const jpgd_block_t* pSrc = pSrc_ptr;
+		int* pTemp = temp;
+
+		// Pass 1: one 1D IDCT per row. pRow_tab holds, for each of the 8 rows, how many leading columns to read.
+		// Row<N>::idct reads pSrc[0..N-1] and writes pTemp[0..7]; Row<0> writes nothing.
+		const uint8* pRow_tab = &s_idct_row_table[(block_max_zag - 1) * 8];
+		int i;
+		for (i = 8; i > 0; i--, pRow_tab++)
+		{
+			switch (*pRow_tab)
+			{
+			case 0: Row<0>::idct(pTemp, pSrc); break;
+			case 1: Row<1>::idct(pTemp, pSrc); break;
+			case 2: Row<2>::idct(pTemp, pSrc); break;
+			case 3: Row<3>::idct(pTemp, pSrc); break;
+			case 4: Row<4>::idct(pTemp, pSrc); break;
+			case 5: Row<5>::idct(pTemp, pSrc); break;
+			case 6: Row<6>::idct(pTemp, pSrc); break;
+			case 7: Row<7>::idct(pTemp, pSrc); break;
+			case 8: Row<8>::idct(pTemp, pSrc); break;
+			}
+
+			pSrc += 8;
+			pTemp += 8;
+		}
+
+		// Pass 2: one 1D IDCT per column of temp; every column reads the same number of leading rows.
+		// Col<N>::idct reads pTemp at stride 8 and stores 8 clamped bytes at pDst_ptr, also at stride 8.
+		pTemp = temp;
+
+		const int nonzero_rows = s_idct_col_table[block_max_zag - 1];
+		for (i = 8; i > 0; i--)
+		{
+			switch (nonzero_rows)
+			{
+			case 1: Col<1>::idct(pDst_ptr, pTemp); break;
+			case 2: Col<2>::idct(pDst_ptr, pTemp); break;
+			case 3: Col<3>::idct(pDst_ptr, pTemp); break;
+			case 4: Col<4>::idct(pDst_ptr, pTemp); break;
+			case 5: Col<5>::idct(pDst_ptr, pTemp); break;
+			case 6: Col<6>::idct(pDst_ptr, pTemp); break;
+			case 7: Col<7>::idct(pDst_ptr, pTemp); break;
+			case 8: Col<8>::idct(pDst_ptr, pTemp); break;
+			}
+
+			pTemp++;
+			pDst_ptr++;
+		}
+	}
+
+	// Returns the next byte of the input stream, refilling the input buffer through prep_in_buffer() when it is empty.
+	// Once no more data can be read it returns 0xFF and 0xD9 alternately (an endless run of EOI markers), using m_tem_flag to pick which.
+	inline uint jpeg_decoder::get_char()
+	{
+		if (m_in_buf_left == 0)
+		{
+			// Buffer exhausted: ask the stream for more.
+			prep_in_buffer();
+
+			if (m_in_buf_left == 0)
+			{
+				// Still nothing: synthesize padding. m_tem_flag selects which half of the 0xFF 0xD9 pair comes next.
+				const int send_second_half = m_tem_flag;
+				m_tem_flag ^= 1;
+
+				return send_second_half ? 0xD9 : 0xFF;
+			}
+		}
+
+		// Normal case: consume one buffered byte.
+		const uint next_byte = *m_pIn_buf_ofs;
+		m_pIn_buf_ofs++;
+		m_in_buf_left--;
+
+		return next_byte;
+	}
+
+	// Same as get_char(), but also reports through *pPadding_flag whether the returned byte is synthesized end-of-stream padding (true) or real input (false).
+	inline uint jpeg_decoder::get_char(bool* pPadding_flag)
+	{
+		if (m_in_buf_left == 0)
+		{
+			prep_in_buffer();
+			if (m_in_buf_left == 0)
+			{
+				*pPadding_flag = true;
+				// Alternate between 0xFF and 0xD9; m_tem_flag remembers which one is due.
+				const int send_second_half = m_tem_flag;
+				m_tem_flag ^= 1;
+				return send_second_half ? 0xD9 : 0xFF;
+			}
+		}
+
+		// A real byte is available.
+		*pPadding_flag = false;
+
+		const uint next_byte = *m_pIn_buf_ofs;
+		m_pIn_buf_ofs++;
+		m_in_buf_left--;
+
+		return next_byte;
+	}
+
+	// Pushes byte q back so it becomes the next byte read. This can step m_pIn_buf_ofs below the start of the input buffer; per the original author another array is placed there to absorb that.
+	inline void jpeg_decoder::stuff_char(uint8 q)
+	{
+		--m_pIn_buf_ofs;
+		*m_pIn_buf_ofs = q;
+		++m_in_buf_left;
+	}
+
+	// Retrieves one character from the input stream, but does not read past markers: once a marker (0xFF followed by a
+	// non-zero byte) or the end of the input is reached, every call returns 0xFF and the marker bytes stay in the stream.
+	inline uint8 jpeg_decoder::get_octet()
+	{
+		bool is_padding;
+		int c = get_char(&is_padding);
+
+		// Anything but 0xFF is ordinary data.
+		if (c != 0xFF)
+			return static_cast<uint8>(c);
+
+		// A synthesized 0xFF (end of input) is returned as-is.
+		if (is_padding)
+			return 0xFF;
+
+		// Look at the byte after the 0xFF. If the input ended right there, put the real 0xFF back.
+		c = get_char(&is_padding);
+		if (is_padding)
+		{
+			stuff_char(0xFF);
+			return 0xFF;
+		}
+		// 0xFF 0x00 is a stuffed data byte and is consumed; 0xFF xx is a marker, so both bytes are pushed back.
+		if (c != 0x00)
+		{
+			stuff_char(static_cast<uint8>(c));
+			stuff_char(0xFF);
+		}
+		return 0xFF;
+	}
+
+	// Retrieves a variable number of bits from the input stream (0 bits yields 0). Does not recognize markers.
+	inline uint jpeg_decoder::get_bits(int num_bits)
+	{
+		if (num_bits == 0)
+			return 0;
+
+		// The requested bits are the top num_bits of m_bit_buf (treated as 32 bits wide).
+		const uint result = m_bit_buf >> (32 - num_bits);
+
+		m_bits_left -= num_bits;
+		if (m_bits_left > 0)
+		{
+			m_bit_buf <<= num_bits;
+			return result;
+		}
+
+		// Ran out of bits: shift by what was still available, splice two fresh bytes into the low 16 bits, then finish the shift.
+		num_bits += m_bits_left;
+		m_bit_buf <<= num_bits;
+		const uint hi = get_char();
+		const uint lo = get_char();
+		m_bit_buf = (m_bit_buf & 0xFFFF0000) | (hi << 8) | lo;
+		m_bit_buf <<= -m_bits_left;
+		m_bits_left += 16;
+		assert(m_bits_left >= 0);
+		return result;
+	}
+
+	// Retrieves a variable number of bits (at most 16, asserted) from the input stream. Markers will not be read into the input bit buffer. Instead, an infinite number of all 1's will be returned when a marker is encountered.
+	inline uint jpeg_decoder::get_bits_no_markers(int num_bits)
+	{
+		if (num_bits == 0)
+			return 0;
+
+		assert(num_bits <= 16);
+
+		const uint result = m_bit_buf >> (32 - num_bits);
+
+		m_bits_left -= num_bits;
+		if (m_bits_left > 0)
+		{
+			m_bit_buf <<= num_bits;
+			return result;
+		}
+
+		num_bits += m_bits_left;
+		m_bit_buf <<= num_bits;
+		// Two plain buffered bytes can be copied directly; otherwise go through get_octet(), which handles 0xFF and buffer refills.
+		const bool can_copy_directly = (m_in_buf_left >= 2) && (m_pIn_buf_ofs[0] != 0xFF) && (m_pIn_buf_ofs[1] != 0xFF);
+		if (can_copy_directly)
+		{
+			m_bit_buf |= ((uint)m_pIn_buf_ofs[0] << 8) | m_pIn_buf_ofs[1];
+			m_in_buf_left -= 2;
+			m_pIn_buf_ofs += 2;
+		}
+		else
+		{
+			const uint hi = get_octet();
+			const uint lo = get_octet();
+			m_bit_buf |= (hi << 8) | lo;
+		}
+		m_bit_buf <<= -m_bits_left;
+		m_bits_left += 16;
+		assert(m_bits_left >= 0);
+		return result;
+	}
+
+	// Decodes a Huffman encoded symbol with table pH and consumes its bits from the bit buffer. A null pH or a tree walk that leaves the table aborts decoding with JPGD_DECODE_ERROR.
+	inline int jpeg_decoder::huff_decode(huff_tables* pH)
+	{
+		if (!pH)
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		int symbol;
+		// Check first 8-bits: do we have a complete symbol?
+		if ((symbol = pH->look_up[m_bit_buf >> 24]) < 0)
+		{
+			// Decode more bits, use a tree traversal to find symbol.
+			int ofs = 23;
+			do
+			{
+				// Validate ofs before it is used as a shift count: shifting by a negative amount is undefined behaviour.
+				if (ofs < 0)
+					stop_decoding(JPGD_DECODE_ERROR);
+				unsigned int idx = -(int)(symbol + ((m_bit_buf >> ofs) & 1));
+				if (idx >= JPGD_HUFF_TREE_MAX_LENGTH)
+					stop_decoding(JPGD_DECODE_ERROR);
+				symbol = pH->tree[idx];
+				ofs--;
+			} while (symbol < 0);
+
+			get_bits_no_markers(8 + (23 - ofs));
+		}
+		else
+		{
+			assert(symbol < JPGD_HUFF_CODE_SIZE_MAX_LENGTH);
+			get_bits_no_markers(pH->code_size[symbol]);
+		}
+
+		return symbol;
+	}
+
+	// Decodes a Huffman encoded symbol with table pH and also fetches the extra bits that follow it (count = low 4 bits of the symbol) into extra_bits. Returns the symbol; a null pH or a tree walk that leaves the table aborts decoding.
+	inline int jpeg_decoder::huff_decode(huff_tables* pH, int& extra_bits)
+	{
+		int symbol;
+
+		if (!pH)
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		// Check first 8-bits: do we have a complete symbol?
+		if ((symbol = pH->look_up2[m_bit_buf >> 24]) < 0)
+		{
+			// Use a tree traversal to find symbol.
+			int ofs = 23;
+			do
+			{
+				// Validate ofs before it is used as a shift count: shifting by a negative amount is undefined behaviour.
+				if (ofs < 0)
+					stop_decoding(JPGD_DECODE_ERROR);
+				unsigned int idx = -(int)(symbol + ((m_bit_buf >> ofs) & 1));
+				if (idx >= JPGD_HUFF_TREE_MAX_LENGTH)
+					stop_decoding(JPGD_DECODE_ERROR);
+				symbol = pH->tree[idx];
+				ofs--;
+			} while (symbol < 0);
+
+			get_bits_no_markers(8 + (23 - ofs));
+
+			extra_bits = get_bits_no_markers(symbol & 0xF);
+		}
+		else
+		{
+			if (symbol & 0x8000)
+			{
+				// Bit 15 set: the extra bits are stored in the table entry itself (bits 16 and up); only the bit count in bits 8..11 is consumed.
+				assert(((symbol >> 8) & 31) <= 15);
+				get_bits_no_markers((symbol >> 8) & 15);
+				extra_bits = symbol >> 16;
+			}
+			else
+			{
+				int code_size = (symbol >> 8) & 31;
+				int num_extra_bits = symbol & 0xF;
+				int bits = code_size + num_extra_bits;
+
+				if (bits <= 16)
+					extra_bits = get_bits_no_markers(bits) & ((1 << num_extra_bits) - 1);
+				else
+				{
+					get_bits_no_markers(code_size);
+					extra_bits = get_bits_no_markers(num_extra_bits);
+				}
+			}
+
+			symbol &= 0xFF;
+		}
+
+		return symbol;
+	}
+
+	// Tables and macro used to fully decode the DPCM differences.
+	static const int s_extend_test[16] = { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };
+	static const int s_extend_offset[16] = { 0, -1, -3, -7, -15, -31, -63, -127, -255, -511, -1023, -2047, -4095, -8191, -16383, -32767 };
+	//static const int s_extend_mask[] = { 0, (1 << 0), (1 << 1), (1 << 2), (1 << 3), (1 << 4), (1 << 5), (1 << 6), (1 << 7), (1 << 8), (1 << 9), (1 << 10), (1 << 11), (1 << 12), (1 << 13), (1 << 14), (1 << 15), (1 << 16) };
+
+#define JPGD_HUFF_EXTEND(x, s) (((x) < s_extend_test[s & 15]) ? ((x) + s_extend_offset[s & 15]) : (x))
+
+	// Unconditionally frees every mem_block on the m_pMem_blocks list (the memory handed out by alloc()) and clears m_pStream.
+	void jpeg_decoder::free_all_blocks()
+	{
+		m_pStream = nullptr;
+		// Pop blocks off the head of the list until it is empty.
+		while (m_pMem_blocks)
+		{
+			mem_block* const pDoomed = m_pMem_blocks;
+			m_pMem_blocks = pDoomed->m_pNext;
+			jpgd_free(pDoomed);
+		}
+	}
+
+	// This method handles all errors. It will never return.
+	// It could easily be changed to use C++ exceptions.
+	JPGD_NORETURN void jpeg_decoder::stop_decoding(jpgd_status status)
+	{
+		m_error_code = status;
+		free_all_blocks();
+		longjmp(m_jmp_state, status);
+	}
+
+	// Bump-allocates nSize bytes (at least 1, rounded up to a multiple of 4) from the decoder's mem_block list; zero requests cleared memory. Failure ends in stop_decoding(JPGD_NOTENOUGHMEM), which does not return.
+	void* jpeg_decoder::alloc(size_t nSize, bool zero)
+	{
+		// Sizes stay in size_t: the earlier (int)nSize cast truncated requests of 2 GiB or more. Also refuse sizes whose rounding below would wrap.
+		if (nSize > static_cast<size_t>(-1) - sizeof(mem_block) - 4096) stop_decoding(JPGD_NOTENOUGHMEM);
+		nSize = (JPGD_MAX(nSize, 1) + 3) & ~static_cast<size_t>(3);
+		char* rv = nullptr;
+		for (mem_block* b = m_pMem_blocks; b; b = b->m_pNext)
+		{
+			if (nSize <= (b->m_size - b->m_used_count))
+			{
+				rv = b->m_data + b->m_used_count;
+				b->m_used_count += nSize;
+				break;
+			}
+		}
+		if (!rv)
+		{
+			const size_t capacity = JPGD_MAX(static_cast<size_t>(32768 - 256), (nSize + 2047) & ~static_cast<size_t>(2047));
+			mem_block* b = (mem_block*)jpgd_malloc(sizeof(mem_block) + capacity);
+			if (!b)
+				stop_decoding(JPGD_NOTENOUGHMEM);
+			b->m_pNext = m_pMem_blocks;
+			m_pMem_blocks = b;
+			b->m_used_count = nSize;
+			b->m_size = capacity;
+			rv = b->m_data;
+		}
+		if (zero) memset(rv, 0, nSize);
+		return rv;
+	}
+
+	void jpeg_decoder::word_clear(void* p, uint16 c, uint n)
+	{
+		// Stores n copies of the 16-bit value c at p, low byte first, one byte at a time.
+		uint8* pOut = static_cast<uint8*>(p);
+		const uint8 lo = c & 0xFF;
+		const uint8 hi = (c >> 8) & 0xFF;
+		for (; n != 0; n--, pOut += 2)
+		{
+			pOut[0] = lo;
+			pOut[1] = hi;
+		}
+	}
+
+	// Refill the input buffer from m_pStream.
+	// Keeps calling the stream's read() until the buffer holds JPGD_IN_BUF_SIZE bytes or the stream sets m_eof_flag; a read() result of -1 aborts decoding.
+	// If m_eof_flag is already set on entry, the buffer is only reset to empty.
+	void jpeg_decoder::prep_in_buffer()
+	{
+		m_pIn_buf_ofs = m_in_buf;
+		m_in_buf_left = 0;
+
+		if (m_eof_flag)
+			return;
+
+		for (;;)
+		{
+			const int got = m_pStream->read(m_in_buf + m_in_buf_left, JPGD_IN_BUF_SIZE - m_in_buf_left, &m_eof_flag);
+			if (got == -1)
+				stop_decoding(JPGD_STREAM_READ);
+
+			m_in_buf_left += got;
+			if ((m_in_buf_left >= JPGD_IN_BUF_SIZE) || m_eof_flag)
+				break;
+		}
+		m_total_bytes_read += m_in_buf_left;
+
+		// Follow the data with 64 0xFF 0xD9 (M_EOI) pairs; prevents the decompressor from going off the rails if the stream is invalid.
+		word_clear(m_pIn_buf_ofs + m_in_buf_left, 0xD9FF, 64);
+	}
+
+	// Read a Huffman code table (DHT) segment; one segment may define several tables.
+	// Each table is an index byte, 16 per-length code counts and then that many symbol values; the result is copied to m_huff_num[] / m_huff_val[].
+	void jpeg_decoder::read_dht_marker()
+	{
+		int i, index, count;
+		uint8 huff_num[17];
+		uint8 huff_val[256];
+
+		uint num_left = get_bits(16);
+		if (num_left < 2)
+			stop_decoding(JPGD_BAD_DHT_MARKER);
+
+		num_left -= 2;
+
+		while (num_left)
+		{
+			index = get_bits(8);
+
+			// huff_num[1..16] are the code counts per bit length; entry 0 is unused and kept at 0.
+			huff_num[0] = 0;
+			count = 0;
+			for (i = 1; i <= 16; i++)
+			{
+				huff_num[i] = static_cast<uint8>(get_bits(8));
+				count += huff_num[i];
+			}
+
+			if (count > 255)
+				stop_decoding(JPGD_BAD_DHT_COUNTS);
+
+			bool symbol_present[256];
+			memset(symbol_present, 0, sizeof(symbol_present));
+			// Only the first count entries come from the stream; clear the rest so the 256-byte copy below never copies indeterminate bytes.
+			memset(huff_val, 0, sizeof(huff_val));
+
+			for (i = 0; i < count; i++)
+			{
+				const int s = get_bits(8);
+
+				// Check for obviously bogus tables.
+				if (symbol_present[s])
+					stop_decoding(JPGD_BAD_DHT_COUNTS);
+
+				huff_val[i] = static_cast<uint8_t>(s);
+				symbol_present[s] = true;
+			}
+
+			i = 1 + 16 + count;
+			if (num_left < (uint)i)
+				stop_decoding(JPGD_BAD_DHT_MARKER);
+
+			num_left -= i;
+
+			// The high nibble is the table class (0 = DC, 1 = AC); reject anything else. The earlier test, (index & 0x10) > 0x10, could never be true.
+			if ((index & 0xF0) > 0x10)
+				stop_decoding(JPGD_BAD_DHT_INDEX);
+
+			index = (index & 0x0F) + ((index & 0x10) >> 4) * (JPGD_MAX_HUFF_TABLES >> 1);
+			if (index >= JPGD_MAX_HUFF_TABLES)
+				stop_decoding(JPGD_BAD_DHT_INDEX);
+
+			if (!m_huff_num[index])
+				m_huff_num[index] = (uint8*)alloc(17);
+			if (!m_huff_val[index])
+				m_huff_val[index] = (uint8*)alloc(256);
+
+			// NOTE(review): index was remapped and range-checked above, so (index & 0x10) is zero unless JPGD_MAX_HUFF_TABLES exceeds 16; confirm whether the flag was meant to come from the original byte.
+			m_huff_ac[index] = (index & 0x10) != 0;
+			memcpy(m_huff_num[index], huff_num, 17);
+			memcpy(m_huff_val[index], huff_val, 256);
+		}
+	}
+
+	// Read a quantization table (DQT) segment; one segment may define several tables.
+	// Each table is a header byte (high nibble: precision flag, low nibble: table number) followed by 64 entries
+	// of one byte, or of two bytes (high byte first) when the precision flag is non-zero.
+	void jpeg_decoder::read_dqt_marker()
+	{
+		uint num_left = get_bits(16);
+
+		// The length field includes its own two bytes.
+		if (num_left < 2)
+			stop_decoding(JPGD_BAD_DQT_MARKER);
+
+		num_left -= 2;
+
+		while (num_left != 0)
+		{
+			// Header byte of the next table.
+			const int header = get_bits(8);
+			const int prec = header >> 4;
+			const int n = header & 0x0F;
+
+			if (n >= JPGD_MAX_QUANT_TABLES)
+				stop_decoding(JPGD_BAD_DQT_TABLE);
+
+			// Table storage is allocated the first time a table number is seen and reused on redefinition.
+			if (!m_quant[n])
+				m_quant[n] = (jpgd_quant_t*)alloc(64 * sizeof(jpgd_quant_t));
+
+			// read quantization entries, in zag order
+			for (int k = 0; k < 64; k++)
+			{
+				uint entry = get_bits(8);
+
+				if (prec)
+					entry = (entry << 8) + get_bits(8);
+
+				m_quant[n][k] = static_cast<jpgd_quant_t>(entry);
+			}
+
+			// Bytes used by this table: the header byte plus 64 or 128 bytes of entries.
+			const uint consumed = prec ? (64 + 1 + 64) : (64 + 1);
+
+			// Note that the length is only checked after the entries were read.
+			if (num_left < consumed)
+				stop_decoding(JPGD_BAD_DQT_LENGTH);
+
+			num_left -= consumed;
+		}
+	}
+
+	// Read the start of frame (SOF) marker.
+	// Stores the image size, the component count and each component's id, sampling factors and quantization table number; unsupported or inconsistent values abort decoding.
+	void jpeg_decoder::read_sof_marker()
+	{
+		const uint num_left = get_bits(16);
+		/* precision: sorry, only 8-bit precision is supported */
+		const uint precision = get_bits(8);
+		if (precision != 8)
+			stop_decoding(JPGD_BAD_PRECISION);
+
+		m_image_y_size = get_bits(16);
+		if ((m_image_y_size < 1) || (m_image_y_size > JPGD_MAX_HEIGHT))
+			stop_decoding(JPGD_BAD_HEIGHT);
+
+		m_image_x_size = get_bits(16);
+		if ((m_image_x_size < 1) || (m_image_x_size > JPGD_MAX_WIDTH))
+			stop_decoding(JPGD_BAD_WIDTH);
+
+		m_comps_in_frame = get_bits(8);
+		if (m_comps_in_frame > JPGD_MAX_COMPONENTS)
+			stop_decoding(JPGD_TOO_MANY_COMPONENTS);
+
+		// The segment length must be exactly 8 header bytes plus 3 bytes per component.
+		const uint expected_len = (uint)(m_comps_in_frame * 3 + 8);
+		if (num_left != expected_len)
+			stop_decoding(JPGD_BAD_SOF_LENGTH);
+
+		for (int comp = 0; comp < m_comps_in_frame; comp++)
+		{
+			m_comp_ident[comp] = get_bits(8);
+			m_comp_h_samp[comp] = get_bits(4);
+			m_comp_v_samp[comp] = get_bits(4);
+
+			// Only sampling factors of 1 or 2 are accepted in either direction.
+			const bool h_bad = !m_comp_h_samp[comp] || (m_comp_h_samp[comp] > 2);
+			const bool v_bad = !m_comp_v_samp[comp] || (m_comp_v_samp[comp] > 2);
+			if (h_bad || v_bad)
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+
+			m_comp_quant[comp] = get_bits(8);
+			if (m_comp_quant[comp] >= JPGD_MAX_QUANT_TABLES)
+				stop_decoding(JPGD_DECODE_ERROR);
+		}
+	}
+
+	// Used to skip unrecognized markers: reads the 16-bit segment length and discards the
+	// remaining (length - 2) bytes of the segment.
+	void jpeg_decoder::skip_variable_marker()
+	{
+		uint num_left = get_bits(16);
+
+		// The length field counts its own two bytes, so anything below 2 is malformed.
+		if (num_left < 2)
+			stop_decoding(JPGD_BAD_VARIABLE_MARKER);
+
+		num_left -= 2;
+
+		// Discard the payload one byte at a time.
+		for (; num_left != 0; num_left--)
+		{
+			get_bits(8);
+		}
+	}
+
+	// Read a define restart interval (DRI) marker: the segment length must be exactly 4; the next 16 bits become m_restart_interval.
+	void jpeg_decoder::read_dri_marker()
+	{
+		const uint seg_len = get_bits(16);
+		if (seg_len != 4)
+			stop_decoding(JPGD_BAD_DRI_LENGTH);
+		m_restart_interval = get_bits(16);
+	}
+
+	// Read a start of scan (SOS) marker: the components taking part in the scan, their DC/AC Huffman table numbers, and the spectral / successive-approximation parameters.
+	void jpeg_decoder::read_sos_marker()
+	{
+		uint num_left;
+		int i, ci, n, c, cc;
+
+		num_left = get_bits(16);
+
+		n = get_bits(8);
+
+		m_comps_in_scan = n;
+		// Three bytes are accounted for so far: the 2-byte length and the component count.
+		num_left -= 3;
+
+		if ((num_left != (uint)(n * 2 + 3)) || (n < 1) || (n > JPGD_MAX_COMPS_IN_SCAN))
+			stop_decoding(JPGD_BAD_SOS_LENGTH);
+		// One (component id, table selector) byte pair per component in the scan.
+		for (i = 0; i < n; i++)
+		{
+			cc = get_bits(8);
+			c = get_bits(8);
+			num_left -= 2;
+			// Map the id to its index among the frame components recorded by read_sof_marker().
+			for (ci = 0; ci < m_comps_in_frame; ci++)
+				if (cc == m_comp_ident[ci])
+					break;
+
+			if (ci >= m_comps_in_frame)
+				stop_decoding(JPGD_BAD_SOS_COMP_ID);
+
+			if (ci >= JPGD_MAX_COMPONENTS)
+				stop_decoding(JPGD_DECODE_ERROR);
+
+			m_comp_list[i] = ci;
+			// Selector byte: high nibble = DC table, low nibble = AC table; AC tables occupy the upper half of the table index range.
+			m_comp_dc_tab[ci] = (c >> 4) & 15;
+			m_comp_ac_tab[ci] = (c & 15) + (JPGD_MAX_HUFF_TABLES >> 1);
+
+			if (m_comp_dc_tab[ci] >= JPGD_MAX_HUFF_TABLES)
+				stop_decoding(JPGD_DECODE_ERROR);
+
+			if (m_comp_ac_tab[ci] >= JPGD_MAX_HUFF_TABLES)
+				stop_decoding(JPGD_DECODE_ERROR);
+		}
+
+		m_spectral_start = get_bits(8);
+		m_spectral_end = get_bits(8);
+		m_successive_high = get_bits(4);
+		m_successive_low = get_bits(4);
+		// Outside progressive mode the spectral range just read is overridden with the full 0..63 range.
+		if (!m_progressive_flag)
+		{
+			m_spectral_start = 0;
+			m_spectral_end = 63;
+		}
+
+		num_left -= 3;
+
+		/* read past whatever is num_left */
+		while (num_left)
+		{
+			get_bits(8);
+			num_left--;
+		}
+	}
+
+	// Finds the next marker and returns its code (the byte that follows the 0xFF prefix).
+	// Bytes before the first 0xFF are discarded, runs of 0xFF are collapsed, and an 0xFF 0x00 pair restarts the search.
+	int jpeg_decoder::next_marker()
+	{
+		uint c;
+		uint bytes = 0;
+
+		for (;;)
+		{
+			// Consume input up to and including the next 0xFF; bytes counts every byte read by this step.
+			bytes++;
+			while (get_bits(8) != 0xFF)
+				bytes++;
+
+			// Skip any further 0xFF bytes; c ends up as the first byte that is not 0xFF.
+			do
+			{
+				c = get_bits(8);
+			} while (c == 0xFF);
+
+			// bytes is not used further; a count above 1 means there were extra bytes before the marker (not good).
+			// A zero byte after 0xFF is not a marker code, so keep searching.
+			if (c != 0)
+				return c;
+		}
+	}
+
+	// Process markers. Returns when an SOFx, SOI, EOI, or SOS marker is encountered, handing that marker code back to the caller.
+	// DHT, DQT and DRI segments met on the way are parsed; other segments that carry a length field are skipped.
+	// DAC and the parameterless markers listed below abort decoding through stop_decoding().
+	int jpeg_decoder::process_markers()
+	{
+		// The loop only ends through the return below or through stop_decoding().
+		while (true)
+		{
+			const int marker = next_marker();
+
+			switch (marker)
+			{
+			// Markers the caller has to deal with itself.
+			// (M_JPG is deliberately absent from this group; it is rejected further down.)
+			case M_SOF0:
+			case M_SOF1:
+			case M_SOF2:
+			case M_SOF3:
+			case M_SOF5:
+			case M_SOF6:
+			case M_SOF7:
+			case M_SOF9:
+			case M_SOF10:
+			case M_SOF11:
+			case M_SOF13:
+			case M_SOF14:
+			case M_SOF15:
+			case M_SOI:
+			case M_EOI:
+			case M_SOS:
+				return marker;
+
+			// Huffman table definitions.
+			case M_DHT:
+				read_dht_marker();
+				break;
+
+			// Quantization table definitions.
+			case M_DQT:
+				read_dqt_marker();
+				break;
+
+			// Restart interval definition.
+			case M_DRI:
+				read_dri_marker();
+				break;
+
+			// DAC: no arithmetic coding support.
+			// (stop_decoding() does not return; the break only keeps the switch well-formed.)
+			case M_DAC:
+				stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+				break;
+
+			// Markers without parameters that are not expected at this point.
+			case M_JPG:
+			case M_RST0:
+			case M_RST1:
+			case M_RST2:
+			case M_RST3:
+			case M_RST4:
+			case M_RST5:
+			case M_RST6:
+			case M_RST7:
+			case M_TEM:
+				stop_decoding(JPGD_UNEXPECTED_MARKER);
+				break;
+
+			// Must be DNL, DHP, EXP, APPn, JPGn, COM, or RESn. APP0 is not special-cased:
+			// there is no need to read the JFIF marker, so it is skipped like the rest.
+			default:
+				skip_variable_marker();
+				break;
+			}
+		}
+	}
+
+	// Finds the start of image (SOI) marker.
+	// Returns at once if the stream begins with 0xFF M_SOI. Otherwise scans at most 4095 further bytes for that pair and
+	// aborts with JPGD_NOT_JPEG if it is not found, if an EOI pair is seen first, or if the byte after SOI is not 0xFF.
+	void jpeg_decoder::locate_soi_marker()
+	{
+		uint prev = get_bits(8);
+		uint cur = get_bits(8);
+
+		/* ok if it's a normal JPEG file without a special header */
+		if ((prev == 0xFF) && (cur == M_SOI))
+			return;
+
+		// Otherwise slide a two-byte window over the input, with a bounded budget.
+		uint budget = 4096;
+		while (true)
+		{
+			budget--;
+			if (budget == 0)
+				stop_decoding(JPGD_NOT_JPEG);
+
+			prev = cur;
+			cur = get_bits(8);
+
+			if (prev != 0xFF)
+				continue;
+
+			if (cur == M_SOI)
+				break;
+
+			// get_bits will keep returning M_EOI if we read past the end
+			if (cur == M_EOI)
+				stop_decoding(JPGD_NOT_JPEG);
+		}
+
+		// Check the next character after marker: if it's not 0xFF, it can't be the start of the next marker, so the file is bad.
+		// The byte is only peeked at in the top 8 bits of m_bit_buf; it is not consumed.
+		const uint following = (m_bit_buf >> 24) & 0xFF;
+
+		if (following != 0xFF)
+			stop_decoding(JPGD_NOT_JPEG);
+	}
+
+	// Find a start of frame (SOF) marker: locates SOI, processes the markers that follow, then parses the SOF segment.
+	// SOF0 and SOF1 are read as they are, SOF2 additionally sets m_progressive_flag.
+	// SOF9 and every other marker returned by process_markers() abort decoding.
+	void jpeg_decoder::locate_sof_marker()
+	{
+		locate_soi_marker();
+
+		const int marker = process_markers();
+
+		// Dispatch on the marker that stopped process_markers().
+		if (marker == M_SOF2)
+		{
+			// Progressive frame: the flag is set before the segment is parsed.
+			m_progressive_flag = JPGD_TRUE;
+			read_sof_marker();
+		}
+		else if ((marker == M_SOF0) || (marker == M_SOF1))
+		{
+			// Baseline DCT / extended sequential DCT.
+			read_sof_marker();
+		}
+		else if (marker == M_SOF9)
+		{
+			// Arithmetic coding is not supported.
+			stop_decoding(JPGD_NO_ARITHMITIC_SUPPORT);
+		}
+		else
+		{
+			// Any other frame type, or a marker that is not an SOF at all.
+			stop_decoding(JPGD_UNSUPPORTED_MARKER);
+		}
+	}
+
+
+	// Find a start of scan (SOS) marker.
+	// Returns JPGD_FALSE if EOI is reached instead, JPGD_TRUE once an SOS segment has been read; any other marker aborts decoding.
+	int jpeg_decoder::locate_sos_marker()
+	{
+		const int marker = process_markers();
+
+		// End of image: there is no further scan.
+		if (marker == M_EOI)
+			return JPGD_FALSE;
+
+		if (marker != M_SOS)
+			stop_decoding(JPGD_UNEXPECTED_MARKER);
+
+		read_sos_marker();
+		return JPGD_TRUE;
+	}
+
+	// Reset everything to default/uninitialized state, attach pStream and flags, then load the input buffer and prime the bit buffer. m_pMem_blocks is simply set to null here; nothing is freed by this function.
+	void jpeg_decoder::init(jpeg_decoder_stream* pStream, uint32_t flags)
+	{
+		m_flags = flags;
+		m_pMem_blocks = nullptr;
+		m_error_code = JPGD_SUCCESS;
+		m_ready_flag = false;
+		m_image_x_size = m_image_y_size = 0;
+		m_pStream = pStream;
+		m_progressive_flag = JPGD_FALSE;
+		// Huffman and quantization table pointers start out null; read_dht_marker() / read_dqt_marker() allocate them on demand.
+		memset(m_huff_ac, 0, sizeof(m_huff_ac));
+		memset(m_huff_num, 0, sizeof(m_huff_num));
+		memset(m_huff_val, 0, sizeof(m_huff_val));
+		memset(m_quant, 0, sizeof(m_quant));
+
+		m_scan_type = 0;
+		m_comps_in_frame = 0;
+		// Per-component frame information.
+		memset(m_comp_h_samp, 0, sizeof(m_comp_h_samp));
+		memset(m_comp_v_samp, 0, sizeof(m_comp_v_samp));
+		memset(m_comp_quant, 0, sizeof(m_comp_quant));
+		memset(m_comp_ident, 0, sizeof(m_comp_ident));
+		memset(m_comp_h_blocks, 0, sizeof(m_comp_h_blocks));
+		memset(m_comp_v_blocks, 0, sizeof(m_comp_v_blocks));
+		// Per-scan information.
+		m_comps_in_scan = 0;
+		memset(m_comp_list, 0, sizeof(m_comp_list));
+		memset(m_comp_dc_tab, 0, sizeof(m_comp_dc_tab));
+		memset(m_comp_ac_tab, 0, sizeof(m_comp_ac_tab));
+
+		m_spectral_start = 0;
+		m_spectral_end = 0;
+		m_successive_low = 0;
+		m_successive_high = 0;
+		m_max_mcu_x_size = 0;
+		m_max_mcu_y_size = 0;
+		m_blocks_per_mcu = 0;
+		m_max_blocks_per_row = 0;
+		m_mcus_per_row = 0;
+		m_mcus_per_col = 0;
+
+		memset(m_mcu_org, 0, sizeof(m_mcu_org));
+
+		m_total_lines_left = 0;
+		m_mcu_lines_left = 0;
+		m_num_buffered_scanlines = 0;
+		m_real_dest_bytes_per_scan_line = 0;
+		m_dest_bytes_per_scan_line = 0;
+		m_dest_bytes_per_pixel = 0;
+
+		memset(m_pHuff_tabs, 0, sizeof(m_pHuff_tabs));
+
+		memset(m_dc_coeffs, 0, sizeof(m_dc_coeffs));
+		memset(m_ac_coeffs, 0, sizeof(m_ac_coeffs));
+		memset(m_block_y_mcu, 0, sizeof(m_block_y_mcu));
+
+		m_eob_run = 0;
+		// Empty input buffer, no end of stream seen yet, and the padding toggle used by get_char() reset.
+		m_pIn_buf_ofs = m_in_buf;
+		m_in_buf_left = 0;
+		m_eof_flag = false;
+		m_tem_flag = 0;
+
+		memset(m_in_buf_pad_start, 0, sizeof(m_in_buf_pad_start));
+		memset(m_in_buf, 0, sizeof(m_in_buf));
+		memset(m_in_buf_pad_end, 0, sizeof(m_in_buf_pad_end));
+
+		m_restart_interval = 0;
+		m_restarts_left = 0;
+		m_next_restart_num = 0;
+
+		m_max_mcus_per_row = 0;
+		m_max_blocks_per_mcu = 0;
+		m_max_mcus_per_col = 0;
+
+		memset(m_last_dc_val, 0, sizeof(m_last_dc_val));
+		m_pMCU_coefficients = nullptr;
+		m_pSample_buf = nullptr;
+		m_pSample_buf_prev = nullptr;
+		m_sample_buf_prev_valid = false;
+
+		m_total_bytes_read = 0;
+
+		m_pScan_line_0 = nullptr;
+		m_pScan_line_1 = nullptr;
+
+		// Ready the input buffer.
+		prep_in_buffer();
+
+		// Prime the bit buffer: starting from an empty buffer, the two 16-bit reads below shift the first four input bytes into m_bit_buf.
+		m_bits_left = 16;
+		m_bit_buf = 0;
+
+		get_bits(16);
+		get_bits(16);
+		// Every block starts out with the maximum zag count of 64.
+		for (int i = 0; i < JPGD_MAX_BLOCKS_PER_MCU; i++)
+			m_mcu_block_max_zag[i] = 64;
+	}
+
+#define SCALEBITS 16 // number of fractional bits in the fixed-point color conversion constants
+#define ONE_HALF  ((int) 1 << (SCALEBITS-1)) // 0.5 in SCALEBITS fixed point; added before a >> SCALEBITS to round
+#define FIX(x)    ((int) ((x) * (1L<<SCALEBITS) + 0.5f)) // scales real constant x by 2^SCALEBITS, adding 0.5 before the truncating cast to int
+
+	// Builds the 256-entry chroma tables used by the YCbCr -> RGB converters. m_crr/m_cbb hold final integer
+	// offsets; m_crg/m_cbg stay in fixed point (m_cbg carries ONE_HALF) until the converters sum and shift them.
+	void jpeg_decoder::create_look_ups()
+	{
+		for (int chroma = 0; chroma < 256; ++chroma)
+		{
+			m_crr[chroma] = (FIX(1.40200f) * (chroma - 128) + ONE_HALF) >> SCALEBITS;
+			m_cbb[chroma] = (FIX(1.77200f) * (chroma - 128) + ONE_HALF) >> SCALEBITS;
+			m_crg[chroma] = (-FIX(0.71414f)) * (chroma - 128);
+			m_cbg[chroma] = (-FIX(0.34414f)) * (chroma - 128) + ONE_HALF;
+		}
+	}
+
+	// Pushes the bytes that were read into the bit buffer during initial marker
+	// scanning back onto the input stream, then re-primes the bit buffer.
+	void jpeg_decoder::fix_in_buffer()
+	{
+		// The bit buffer must hold a whole number of bytes here.
+		assert((m_bits_left & 7) == 0);
+
+		// Bytes 2 and 3 of m_bit_buf are always stuffed back; byte 1 only when m_bits_left >= 8, byte 0 only when it is 16.
+		int first_shift = 16;
+		if (m_bits_left == 16)
+			first_shift = 0;
+		else if (m_bits_left >= 8)
+			first_shift = 8;
+		for (int shift = first_shift; shift <= 24; shift += 8)
+			stuff_char((uint8)((m_bit_buf >> shift) & 0xFF));
+
+		m_bits_left = 16;
+		get_bits_no_markers(16);
+		get_bits_no_markers(16);
+	}
+
+	// Runs the IDCT over every block of one MCU, reading coefficients from m_pMCU_coefficients
+	// and writing 64 samples per block into m_pSample_buf. mcu_row selects the MCU's slot in that buffer.
+	void jpeg_decoder::transform_mcu(int mcu_row)
+	{
+		// Index of this MCU's first block within the sample buffer.
+		const int first_block = mcu_row * m_blocks_per_mcu;
+		if (first_block >= m_max_blocks_per_row)
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		jpgd_block_t* pCoeffs = m_pMCU_coefficients;
+		uint8* pSamples = m_pSample_buf + first_block * 64;
+
+		for (int b = 0; b < m_blocks_per_mcu; b++, pCoeffs += 64, pSamples += 64)
+			idct(pCoeffs, pSamples, m_mcu_block_max_zag[b]);
+	}
+
+	// Loads and dequantizes the next row of (already decoded) coefficients, then runs the IDCT on each MCU.
+	// Progressive images only.
+	void jpeg_decoder::load_next_row()
+	{
+		int i;
+		jpgd_block_t* p;
+		jpgd_quant_t* q;
+		int mcu_row, mcu_block, row_block = 0; // row_block is only incremented, never read, in this function
+		int component_num, component_id;
+		int block_x_mcu[JPGD_MAX_COMPONENTS]; // per-component block x position within this row
+
+		memset(block_x_mcu, 0, JPGD_MAX_COMPONENTS * sizeof(int));
+
+		for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++) // despite the name, mcu_row counts MCUs across the row
+		{
+			int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0; // block position inside the current MCU
+
+			for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
+			{
+				component_id = m_mcu_org[mcu_block];
+				if (m_comp_quant[component_id] >= JPGD_MAX_QUANT_TABLES)
+					stop_decoding(JPGD_DECODE_ERROR);
+
+				q = m_quant[m_comp_quant[component_id]];
+
+				p = m_pMCU_coefficients + 64 * mcu_block; // destination: this block's 64 coefficients in the MCU work buffer
+				// Fetch the stored AC and DC blocks at this block's (x, y) position.
+				jpgd_block_t* pAC = coeff_buf_getp(m_ac_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+				jpgd_block_t* pDC = coeff_buf_getp(m_dc_coeffs[component_id], block_x_mcu[component_id] + block_x_mcu_ofs, m_block_y_mcu[component_id] + block_y_mcu_ofs);
+				p[0] = pDC[0];
+				memcpy(&p[1], &pAC[1], 63 * sizeof(jpgd_block_t));
+				// Find the last non-zero coefficient in g_ZAG order (i stops at 0 if all of 1..63 are zero).
+				for (i = 63; i > 0; i--)
+					if (p[g_ZAG[i]])
+						break;
+
+				m_mcu_block_max_zag[mcu_block] = i + 1; // passed to idct() by transform_mcu()
+				// Dequantize the non-zero coefficients: p is indexed through g_ZAG, q directly by i.
+				for (; i >= 0; i--)
+					if (p[g_ZAG[i]])
+						p[g_ZAG[i]] = static_cast<jpgd_block_t>(p[g_ZAG[i]] * q[i]);
+
+				row_block++;
+
+				if (m_comps_in_scan == 1)
+					block_x_mcu[component_id]++; // one block per MCU: just step to the next block
+				else
+				{
+					if (++block_x_mcu_ofs == m_comp_h_samp[component_id]) // walk the component's h_samp x v_samp blocks inside the MCU
+					{
+						block_x_mcu_ofs = 0;
+
+						if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
+						{
+							block_y_mcu_ofs = 0;
+							// All blocks of this component in the MCU are done: move to the next MCU's columns.
+							block_x_mcu[component_id] += m_comp_h_samp[component_id];
+						}
+					}
+				}
+			}
+
+			transform_mcu(mcu_row);
+		}
+		// Advance each scanned component's block row for the next call.
+		if (m_comps_in_scan == 1)
+			m_block_y_mcu[m_comp_list[0]]++;
+		else
+		{
+			for (component_num = 0; component_num < m_comps_in_scan; component_num++)
+			{
+				component_id = m_comp_list[component_num];
+
+				m_block_y_mcu[component_id] += m_comp_v_samp[component_id];
+			}
+		}
+	}
+
+	// Restart interval processing: finds the expected RSTn marker in the input, then resets the DC prediction values, m_eob_run, restart counters and bit buffer.
+	void jpeg_decoder::process_restart()
+	{
+		int i;
+		int c = 0;
+
+		// Align to a byte boundary
+		// FIXME: Is this really necessary? get_bits_no_markers() never reads in markers!
+		//get_bits_no_markers(m_bits_left & 7);
+
+		// Let's scan a little bit to find the marker, but not _too_ far.
+		// 1536 is a "fudge factor" that determines how much to scan.
+		for (i = 1536; i > 0; i--)
+			if (get_char() == 0xFF)
+				break;
+
+		if (i == 0) // scan budget used up without seeing 0xFF
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+		// Skip any further 0xFF bytes with what is left of the same budget; c ends up as the first non-0xFF byte.
+		for (; i > 0; i--)
+			if ((c = get_char()) != 0xFF)
+				break;
+
+		if (i == 0)
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+		// Is it the expected marker? If not, something bad happened.
+		if (c != (m_next_restart_num + M_RST0))
+			stop_decoding(JPGD_BAD_RESTART_MARKER);
+
+		// Reset each component's DC prediction values.
+		memset(&m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
+
+		m_eob_run = 0;
+
+		m_restarts_left = m_restart_interval;
+
+		m_next_restart_num = (m_next_restart_num + 1) & 7; // restart numbers cycle through 0..7
+
+		// Get the bit buffer going again...
+
+		m_bits_left = 16;
+		get_bits_no_markers(16);
+		get_bits_no_markers(16);
+	}
+
+	static inline int dequantize_ac(int c, int q) { return c * q; } // dequantizes one coefficient: value times its quantizer
+
+	// Decodes and dequantizes the next row of coefficients, running the IDCT on each MCU as soon as its blocks are decoded (called by decode_next_mcu_row() when m_progressive_flag is not set).
+	void jpeg_decoder::decode_next_row()
+	{
+		int row_block = 0; // only incremented, never read, in this function
+
+		for (int mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++) // despite the name, mcu_row counts MCUs across the row
+		{
+			if ((m_restart_interval) && (m_restarts_left == 0))
+				process_restart();
+
+			jpgd_block_t* p = m_pMCU_coefficients;
+			for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++, p += 64)
+			{
+				int component_id = m_mcu_org[mcu_block];
+				if (m_comp_quant[component_id] >= JPGD_MAX_QUANT_TABLES)
+					stop_decoding(JPGD_DECODE_ERROR);
+
+				jpgd_quant_t* q = m_quant[m_comp_quant[component_id]];
+				// DC coefficient: decoded difference is added to the component's previous DC value.
+				int r, s;
+				s = huff_decode(m_pHuff_tabs[m_comp_dc_tab[component_id]], r);
+				if (s >= 16)
+					stop_decoding(JPGD_DECODE_ERROR);
+
+				s = JPGD_HUFF_EXTEND(r, s);
+
+				m_last_dc_val[component_id] = (s += m_last_dc_val[component_id]);
+
+				p[0] = static_cast<jpgd_block_t>(s * q[0]);
+				// Positions below prev_num_set may still hold values from the block decoded into this slot previously.
+				int prev_num_set = m_mcu_block_max_zag[mcu_block];
+
+				huff_tables* pH = m_pHuff_tabs[m_comp_ac_tab[component_id]];
+
+				int k;
+				for (k = 1; k < 64; k++) // k is the position in g_ZAG order
+				{
+					int extra_bits;
+					s = huff_decode(pH, extra_bits);
+
+					r = s >> 4; // high nibble: number of zero coefficients to skip
+					s &= 15; // low nibble: size passed to JPGD_HUFF_EXTEND (0 means no coefficient)
+
+					if (s)
+					{
+						if (r)
+						{
+							if ((k + r) > 63)
+								stop_decoding(JPGD_DECODE_ERROR);
+
+							if (k < prev_num_set) // clear stale values in the skipped run
+							{
+								int n = JPGD_MIN(r, prev_num_set - k);
+								int kt = k;
+								while (n--)
+									p[g_ZAG[kt++]] = 0;
+							}
+
+							k += r;
+						}
+
+						s = JPGD_HUFF_EXTEND(extra_bits, s);
+
+						if (k >= 64)
+							stop_decoding(JPGD_DECODE_ERROR);
+
+						p[g_ZAG[k]] = static_cast<jpgd_block_t>(dequantize_ac(s, q[k])); //s * q[k];
+					}
+					else
+					{
+						if (r == 15) // size 0 with run 15: a run of 16 zero coefficients
+						{
+							if ((k + 16) > 64)
+								stop_decoding(JPGD_DECODE_ERROR);
+
+							if (k < prev_num_set) // clear stale values in the skipped run
+							{
+								int n = JPGD_MIN(16, prev_num_set - k);
+								int kt = k;
+								while (n--)
+								{
+									if (kt > 63)
+										stop_decoding(JPGD_DECODE_ERROR);
+									p[g_ZAG[kt++]] = 0;
+								}
+							}
+
+							k += 16 - 1; // - 1 because the loop counter is k
+
+							if (p[g_ZAG[k & 63]] != 0)
+								stop_decoding(JPGD_DECODE_ERROR);
+						}
+						else
+							break; // size 0 with any other run ends the block
+					}
+				}
+				// Clear whatever stale values remain between the end of this block and prev_num_set.
+				if (k < prev_num_set)
+				{
+					int kt = k;
+					while (kt < prev_num_set)
+						p[g_ZAG[kt++]] = 0;
+				}
+
+				m_mcu_block_max_zag[mcu_block] = k; // remembered for the next block decoded into this slot and passed to idct()
+
+				row_block++;
+			}
+
+			transform_mcu(mcu_row);
+
+			m_restarts_left--;
+		}
+	}
+
+	// YCbCr H1V1 (1x1:1:1, 3 m_blocks per MCU) to RGB.
+	// Converts the current row of every MCU into 4-byte pixels (the 4th byte is always 255) in m_pScan_line_0.
+	void jpeg_decoder::H1V1Convert()
+	{
+		// Each block row is 8 samples wide; pick the row currently being output.
+		const int row_ofs = (m_max_mcu_y_size - m_mcu_lines_left) * 8;
+		const uint8* pSrc = m_pSample_buf + row_ofs;
+		uint8* pDst = m_pScan_line_0;
+
+		// Within an MCU the Y, Cb and Cr blocks follow each other, 64 samples apart.
+		for (int mcu = 0; mcu < m_max_mcus_per_row; mcu++, pSrc += 64 * 3)
+		{
+			for (int x = 0; x < 8; x++, pDst += 4)
+			{
+				const int luma = pSrc[x];
+				const int cb = pSrc[64 + x];
+				const int cr = pSrc[128 + x];
+
+				pDst[0] = clamp(luma + m_crr[cr]);
+				pDst[1] = clamp(luma + ((m_crg[cr] + m_cbg[cb]) >> 16));
+				pDst[2] = clamp(luma + m_cbb[cb]);
+				// Fourth byte is a constant 255.
+				pDst[3] = 255;
+			}
+		}
+	}
+
+	// YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB: writes one scanline of 4-byte pixels (4th byte 255) to m_pScan_line_0.
+	void jpeg_decoder::H2V1Convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d0 = m_pScan_line_0;
+		uint8* y = m_pSample_buf + row * 8; // current row of the MCU's first block (Y)
+		uint8* c = m_pSample_buf + 2 * 64 + row * 8; // same row of the third block, read as Cb; Cr is read 64 samples further on
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			for (int l = 0; l < 2; l++) // the two Y blocks of the MCU
+			{
+				for (int j = 0; j < 4; j++) // each chroma sample is shared by two horizontally adjacent Y samples
+				{
+					int cb = c[0];
+					int cr = c[64];
+
+					int rc = m_crr[cr];
+					int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					int bc = m_cbb[cb];
+
+					int yy = y[j << 1];
+					d0[0] = clamp(yy + rc);
+					d0[1] = clamp(yy + gc);
+					d0[2] = clamp(yy + bc);
+					d0[3] = 255;
+
+					yy = y[(j << 1) + 1];
+					d0[4] = clamp(yy + rc);
+					d0[5] = clamp(yy + gc);
+					d0[6] = clamp(yy + bc);
+					d0[7] = 255;
+
+					d0 += 8;
+
+					c++;
+				}
+				y += 64;
+			}
+
+			y += 64 * 4 - 64 * 2; // y already moved past 2 blocks; skip the remaining 2 to reach the next MCU
+			c += 64 * 4 - 8; // c already moved 8 samples; land on the same row in the next MCU
+		}
+	}
+
+	// YCbCr H2V1 (2x1:1:1, 4 m_blocks per MCU) to RGB, blending the two nearest chroma samples horizontally with 3:1 / 1:3 weights.
+	void jpeg_decoder::H2V1ConvertFiltered()
+	{
+		const uint BLOCKS_PER_MCU = 4;
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d0 = m_pScan_line_0;
+
+		const int half_image_x_size = (m_image_x_size >> 1) - 1; // upper clamp for the chroma x index
+		const int row_x8 = row * 8;
+
+		for (int x = 0; x < m_image_x_size; x++)
+		{
+			int y = m_pSample_buf[check_sample_buf_ofs((x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7) + row_x8)]; // 16 pixels per MCU; bit 3 of x picks the second Y block
+			// Two neighbouring chroma columns, clamped to [0, half_image_x_size].
+			int c_x0 = (x - 1) >> 1;
+			int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size);
+			c_x0 = JPGD_MAX(c_x0, 0);
+
+			int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7) + row_x8 + 128; // +128 skips the two Y blocks; Cr is 64 further
+			int cb0 = m_pSample_buf[check_sample_buf_ofs(a)];
+			int cr0 = m_pSample_buf[check_sample_buf_ofs(a + 64)];
+
+			int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7) + row_x8 + 128;
+			int cb1 = m_pSample_buf[check_sample_buf_ofs(b)];
+			int cr1 = m_pSample_buf[check_sample_buf_ofs(b + 64)];
+
+			int w0 = (x & 1) ? 3 : 1;
+			int w1 = (x & 1) ? 1 : 3;
+
+			int cb = (cb0 * w0 + cb1 * w1 + 2) >> 2; // weights sum to 4; +2 rounds
+			int cr = (cr0 * w0 + cr1 * w1 + 2) >> 2;
+
+			int rc = m_crr[cr];
+			int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+			int bc = m_cbb[cb];
+
+			d0[0] = clamp(y + rc);
+			d0[1] = clamp(y + gc);
+			d0[2] = clamp(y + bc);
+			d0[3] = 255;
+
+			d0 += 4;
+		}
+	}
+
+	// YCbCr H1V2 (1x2:1:1, 4 m_blocks per MCU) to RGB: writes two scanlines at once (m_pScan_line_0 and m_pScan_line_1) that share one chroma row.
+	void jpeg_decoder::H1V2Convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d0 = m_pScan_line_0;
+		uint8* d1 = m_pScan_line_1;
+		uint8* y;
+		uint8* c;
+		// Rows 0..7 come from the first Y block, later rows from the second.
+		if (row < 8)
+			y = m_pSample_buf + row * 8;
+		else
+			y = m_pSample_buf + 64 * 1 + (row & 7) * 8;
+
+		c = m_pSample_buf + 64 * 2 + (row >> 1) * 8; // one chroma row per two output rows; Cb here, Cr 64 further
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			for (int j = 0; j < 8; j++)
+			{
+				int cb = c[0 + j];
+				int cr = c[64 + j];
+
+				int rc = m_crr[cr];
+				int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+				int bc = m_cbb[cb];
+
+				int yy = y[j];
+				d0[0] = clamp(yy + rc);
+				d0[1] = clamp(yy + gc);
+				d0[2] = clamp(yy + bc);
+				d0[3] = 255;
+
+				yy = y[8 + j]; // the Y sample one row below
+				d1[0] = clamp(yy + rc);
+				d1[1] = clamp(yy + gc);
+				d1[2] = clamp(yy + bc);
+				d1[3] = 255;
+
+				d0 += 4;
+				d1 += 4;
+			}
+
+			y += 64 * 4;
+			c += 64 * 4;
+		}
+	}
+
+	// YCbCr H1V2 (1x2:1:1, 4 m_blocks per MCU) to RGB, blending the two nearest chroma rows vertically with 3:1 / 1:3 weights. Writes one scanline to m_pScan_line_0.
+	void jpeg_decoder::H1V2ConvertFiltered()
+	{
+		const uint BLOCKS_PER_MCU = 4;
+		int y = m_image_y_size - m_total_lines_left; // index of the scanline being produced
+		int row = y & 15;
+
+		const int half_image_y_size = (m_image_y_size >> 1) - 1; // upper clamp for the chroma row index
+
+		uint8* d0 = m_pScan_line_0;
+
+		const int w0 = (row & 1) ? 3 : 1;
+		const int w1 = (row & 1) ? 1 : 3;
+		// NOTE(review): for y == 0, c_y0 is -1 and is not clamped (H2V1ConvertFiltered clamps c_x0 to 0), so (c_y0 & 7) below selects row 7 - confirm this is intended.
+		int c_y0 = (y - 1) >> 1;
+		int c_y1 = JPGD_MIN(c_y0 + 1, half_image_y_size);
+
+		const uint8_t* p_YSamples = m_pSample_buf;
+		const uint8_t* p_C0Samples = m_pSample_buf;
+		if ((c_y0 >= 0) && (((row & 15) == 0) || ((row & 15) == 15)) && (m_total_lines_left > 1)) // rows 0 and 15 take their upper chroma row (and row 15 its Y) from m_pSample_buf_prev
+		{
+			assert(y > 0);
+			assert(m_sample_buf_prev_valid);
+
+			if ((row & 15) == 15)
+				p_YSamples = m_pSample_buf_prev;
+
+			p_C0Samples = m_pSample_buf_prev;
+		}
+
+		const int y_sample_base_ofs = ((row & 8) ? 64 : 0) + (row & 7) * 8; // bit 3 of row picks the second Y block
+		const int y0_base = (c_y0 & 7) * 8 + 128; // +128 skips the two Y blocks; Cr is 64 further
+		const int y1_base = (c_y1 & 7) * 8 + 128;
+
+		for (int x = 0; x < m_image_x_size; x++)
+		{
+			const int base_ofs = (x >> 3) * BLOCKS_PER_MCU * 64 + (x & 7); // 8 pixels per MCU horizontally
+
+			int y_sample = p_YSamples[check_sample_buf_ofs(base_ofs + y_sample_base_ofs)];
+
+			int a = base_ofs + y0_base;
+			int cb0_sample = p_C0Samples[check_sample_buf_ofs(a)];
+			int cr0_sample = p_C0Samples[check_sample_buf_ofs(a + 64)];
+
+			int b = base_ofs + y1_base;
+			int cb1_sample = m_pSample_buf[check_sample_buf_ofs(b)];
+			int cr1_sample = m_pSample_buf[check_sample_buf_ofs(b + 64)];
+
+			int cb = (cb0_sample * w0 + cb1_sample * w1 + 2) >> 2; // weights sum to 4; +2 rounds
+			int cr = (cr0_sample * w0 + cr1_sample * w1 + 2) >> 2;
+
+			int rc = m_crr[cr];
+			int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+			int bc = m_cbb[cb];
+
+			d0[0] = clamp(y_sample + rc);
+			d0[1] = clamp(y_sample + gc);
+			d0[2] = clamp(y_sample + bc);
+			d0[3] = 255;
+
+			d0 += 4;
+		}
+	}
+
+	// YCbCr H2V2 (2x2:1:1, 6 m_blocks per MCU) to RGB: writes two scanlines at once (m_pScan_line_0 and m_pScan_line_1); each chroma sample covers a 2x2 group of Y samples.
+	void jpeg_decoder::H2V2Convert()
+	{
+		int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d0 = m_pScan_line_0;
+		uint8* d1 = m_pScan_line_1;
+		uint8* y;
+		uint8* c;
+		// Rows 0..7 come from the first pair of Y blocks, later rows from the second pair (offset 64 * 2).
+		if (row < 8)
+			y = m_pSample_buf + row * 8;
+		else
+			y = m_pSample_buf + 64 * 2 + (row & 7) * 8;
+
+		c = m_pSample_buf + 64 * 4 + (row >> 1) * 8; // chroma follows the four Y blocks; Cb here, Cr 64 further
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			for (int l = 0; l < 2; l++) // left and right Y block of the pair
+			{
+				for (int j = 0; j < 8; j += 2)
+				{
+					int cb = c[0];
+					int cr = c[64];
+
+					int rc = m_crr[cr];
+					int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					int bc = m_cbb[cb];
+
+					int yy = y[j];
+					d0[0] = clamp(yy + rc);
+					d0[1] = clamp(yy + gc);
+					d0[2] = clamp(yy + bc);
+					d0[3] = 255;
+
+					yy = y[j + 1];
+					d0[4] = clamp(yy + rc);
+					d0[5] = clamp(yy + gc);
+					d0[6] = clamp(yy + bc);
+					d0[7] = 255;
+
+					yy = y[j + 8]; // same column, one row below
+					d1[0] = clamp(yy + rc);
+					d1[1] = clamp(yy + gc);
+					d1[2] = clamp(yy + bc);
+					d1[3] = 255;
+
+					yy = y[j + 8 + 1];
+					d1[4] = clamp(yy + rc);
+					d1[5] = clamp(yy + gc);
+					d1[6] = clamp(yy + bc);
+					d1[7] = 255;
+
+					d0 += 8;
+					d1 += 8;
+
+					c++;
+				}
+				y += 64;
+			}
+
+			y += 64 * 6 - 64 * 2; // y already moved past 2 blocks; skip the remaining 4 to reach the next MCU
+			c += 64 * 6 - 8; // c already moved 8 samples; land on the same row in the next MCU
+		}
+	}
+
+	uint32_t jpeg_decoder::H2V2ConvertFiltered() // YCbCr H2V2 to RGB with chroma blended from its four nearest samples; returns how many scanlines were produced (2: m_pScan_line_0 and m_pScan_line_1, or 1: m_pScan_line_0 only)
+	{
+		const uint BLOCKS_PER_MCU = 6;
+		int y = m_image_y_size - m_total_lines_left; // index of the scanline being produced
+		int row = y & 15;
+
+		const int half_image_y_size = (m_image_y_size >> 1) - 1; // upper clamp for the chroma row index
+
+		uint8* d0 = m_pScan_line_0;
+
+		int c_y0 = (y - 1) >> 1;
+		int c_y1 = JPGD_MIN(c_y0 + 1, half_image_y_size);
+
+		const uint8_t* p_YSamples = m_pSample_buf;
+		const uint8_t* p_C0Samples = m_pSample_buf;
+		if ((c_y0 >= 0) && (((row & 15) == 0) || ((row & 15) == 15)) && (m_total_lines_left > 1)) // rows 0 and 15 take their upper chroma row (and row 15 its Y) from m_pSample_buf_prev
+		{
+			assert(y > 0);
+			assert(m_sample_buf_prev_valid);
+
+			if ((row & 15) == 15)
+				p_YSamples = m_pSample_buf_prev;
+
+			p_C0Samples = m_pSample_buf_prev;
+		}
+
+		const int y_sample_base_ofs = ((row & 8) ? 128 : 0) + (row & 7) * 8; // bit 3 of row picks the second pair of Y blocks
+		const int y0_base = (c_y0 & 7) * 8 + 256; // +256 skips the four Y blocks; Cr is 64 further
+		const int y1_base = (c_y1 & 7) * 8 + 256;
+
+		const int half_image_x_size = (m_image_x_size >> 1) - 1; // upper clamp for the chroma column index
+		// Blend weights indexed [row & 1][x & 1], applied in the order 00, 01, 10, 11; every set sums to 16.
+		static const uint8_t s_muls[2][2][4] =
+		{
+			{ { 1, 3, 3, 9 }, { 3, 9, 1, 3 }, },
+			{ { 3, 1, 9, 3 }, { 9, 3, 3, 1 } }
+		};
+
+		if (((row & 15) >= 1) && ((row & 15) <= 14)) // interior of the MCU: produce this row and the next, which use the same two chroma rows
+		{
+			assert((row & 1) == 1);
+			assert(((y + 1 - 1) >> 1) == c_y0);
+
+			assert(p_YSamples == m_pSample_buf);
+			assert(p_C0Samples == m_pSample_buf);
+
+			uint8* d1 = m_pScan_line_1;
+			const int y_sample_base_ofs1 = (((row + 1) & 8) ? 128 : 0) + ((row + 1) & 7) * 8; // Y offset of the following row
+
+			for (int x = 0; x < m_image_x_size; x++)
+			{
+				int k = (x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7);
+				int y_sample0 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs)];
+				int y_sample1 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs1)];
+				// Two neighbouring chroma columns, clamped to [0, half_image_x_size].
+				int c_x0 = (x - 1) >> 1;
+				int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size);
+				c_x0 = JPGD_MAX(c_x0, 0);
+
+				int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7);
+				int cb00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base)];
+				int cr00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base + 64)];
+
+				int cb01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base)];
+				int cr01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base + 64)];
+
+				int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7);
+				int cb10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base)];
+				int cr10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base + 64)];
+
+				int cb11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base)];
+				int cr11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base + 64)];
+
+				{ // pixel x of the first row
+					const uint8_t* pMuls = &s_muls[row & 1][x & 1][0];
+					int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+					int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+					int rc = m_crr[cr];
+					int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					int bc = m_cbb[cb];
+
+					d0[0] = clamp(y_sample0 + rc);
+					d0[1] = clamp(y_sample0 + gc);
+					d0[2] = clamp(y_sample0 + bc);
+					d0[3] = 255;
+
+					d0 += 4;
+				}
+
+				{ // pixel x of the second row
+					const uint8_t* pMuls = &s_muls[(row + 1) & 1][x & 1][0];
+					int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+					int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+					int rc = m_crr[cr];
+					int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+					int bc = m_cbb[cb];
+
+					d1[0] = clamp(y_sample1 + rc);
+					d1[1] = clamp(y_sample1 + gc);
+					d1[2] = clamp(y_sample1 + bc);
+					d1[3] = 255;
+
+					d1 += 4;
+				}
+
+				if (((x & 1) == 1) && (x < m_image_x_size - 1)) // the next (even) pixel uses the same four chroma samples, so convert it now and skip it in the loop
+				{
+					const int nx = x + 1;
+					assert(c_x0 == (nx - 1) >> 1);
+
+					k = (nx >> 4) * BLOCKS_PER_MCU * 64 + ((nx & 8) ? 64 : 0) + (nx & 7);
+					y_sample0 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs)];
+					y_sample1 = p_YSamples[check_sample_buf_ofs(k + y_sample_base_ofs1)];
+
+					{ // pixel nx of the first row
+						const uint8_t* pMuls = &s_muls[row & 1][nx & 1][0];
+						int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+						int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+						int rc = m_crr[cr];
+						int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+						int bc = m_cbb[cb];
+
+						d0[0] = clamp(y_sample0 + rc);
+						d0[1] = clamp(y_sample0 + gc);
+						d0[2] = clamp(y_sample0 + bc);
+						d0[3] = 255;
+
+						d0 += 4;
+					}
+
+					{ // pixel nx of the second row
+						const uint8_t* pMuls = &s_muls[(row + 1) & 1][nx & 1][0];
+						int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+						int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+						int rc = m_crr[cr];
+						int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+						int bc = m_cbb[cb];
+
+						d1[0] = clamp(y_sample1 + rc);
+						d1[1] = clamp(y_sample1 + gc);
+						d1[2] = clamp(y_sample1 + bc);
+						d1[3] = 255;
+
+						d1 += 4;
+					}
+
+					++x;
+				}
+			}
+
+			return 2;
+		}
+		else // rows 0 and 15: produce a single scanline
+		{
+			for (int x = 0; x < m_image_x_size; x++)
+			{
+				int y_sample = p_YSamples[check_sample_buf_ofs((x >> 4) * BLOCKS_PER_MCU * 64 + ((x & 8) ? 64 : 0) + (x & 7) + y_sample_base_ofs)];
+
+				int c_x0 = (x - 1) >> 1;
+				int c_x1 = JPGD_MIN(c_x0 + 1, half_image_x_size);
+				c_x0 = JPGD_MAX(c_x0, 0);
+
+				int a = (c_x0 >> 3) * BLOCKS_PER_MCU * 64 + (c_x0 & 7);
+				int cb00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base)];
+				int cr00_sample = p_C0Samples[check_sample_buf_ofs(a + y0_base + 64)];
+
+				int cb01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base)];
+				int cr01_sample = m_pSample_buf[check_sample_buf_ofs(a + y1_base + 64)];
+
+				int b = (c_x1 >> 3) * BLOCKS_PER_MCU * 64 + (c_x1 & 7);
+				int cb10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base)];
+				int cr10_sample = p_C0Samples[check_sample_buf_ofs(b + y0_base + 64)];
+
+				int cb11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base)];
+				int cr11_sample = m_pSample_buf[check_sample_buf_ofs(b + y1_base + 64)];
+
+				const uint8_t* pMuls = &s_muls[row & 1][x & 1][0];
+				int cb = (cb00_sample * pMuls[0] + cb01_sample * pMuls[1] + cb10_sample * pMuls[2] + cb11_sample * pMuls[3] + 8) >> 4;
+				int cr = (cr00_sample * pMuls[0] + cr01_sample * pMuls[1] + cr10_sample * pMuls[2] + cr11_sample * pMuls[3] + 8) >> 4;
+
+				int rc = m_crr[cr];
+				int gc = ((m_crg[cr] + m_cbg[cb]) >> 16);
+				int bc = m_cbb[cb];
+
+				d0[0] = clamp(y_sample + rc);
+				d0[1] = clamp(y_sample + gc);
+				d0[2] = clamp(y_sample + bc);
+				d0[3] = 255;
+
+				d0 += 4;
+			}
+
+			return 1;
+		}
+	}
+
+	// Y (1 block per MCU) to 8-bit grayscale: copies the current 8-sample row of every MCU's block from m_pSample_buf to m_pScan_line_0.
+	void jpeg_decoder::gray_convert()
+	{
+		const int row = m_max_mcu_y_size - m_mcu_lines_left;
+		uint8* d = m_pScan_line_0;
+		const uint8* s = m_pSample_buf + row * 8;
+
+		for (int i = m_max_mcus_per_row; i > 0; i--)
+		{
+			// memcpy instead of storing through uint* casts: both buffers are byte arrays, so the
+			// casts broke the strict-aliasing rule and assumed uint alignment. Still 8 bytes per block row.
+			memcpy(d, s, 8);
+			s += 64;
+			d += 8;
+		}
+	}
+
+	// Find end of image (EOI) marker, so we can return to the user the exact size of the input stream.
+	void jpeg_decoder::find_eoi()
+	{
+		const bool sequential = !m_progressive_flag;
+		// Only non-progressive images read ahead here; otherwise just the byte count is adjusted.
+		if (sequential)
+		{
+			// Prime the bit buffer.
+			m_bits_left = 16;
+			get_bits(16);
+			get_bits(16);
+
+			// The next marker _should_ be EOI; the value returned is not inspected.
+			process_markers();
+		}
+
+		// Whatever is still left in the input buffer is taken off the running total.
+		m_total_bytes_read -= m_in_buf_left;
+	}
+
+	int jpeg_decoder::decode_next_mcu_row() // decodes one MCU row into m_pSample_buf; returns 0 on success or JPGD_FAILED
+	{
+		if (setjmp(m_jmp_state)) // non-zero when control comes back through m_jmp_state (presumably a longjmp from stop_decoding() - confirm)
+			return JPGD_FAILED;
+
+		const bool chroma_y_filtering = (m_flags & cFlagLinearChromaFiltering) && ((m_scan_type == JPGD_YH2V2) || (m_scan_type == JPGD_YH1V2));
+		if (chroma_y_filtering)
+		{
+			std::swap(m_pSample_buf, m_pSample_buf_prev); // keep the previous MCU row available to the vertically filtering converters
+
+			m_sample_buf_prev_valid = true;
+		}
+
+		if (m_progressive_flag)
+			load_next_row();
+		else
+			decode_next_row();
+
+		// Find the EOI marker if that was the last row.
+		if (m_total_lines_left <= m_max_mcu_y_size)
+			find_eoi();
+
+		m_mcu_lines_left = m_max_mcu_y_size; // a full MCU row of scanlines is now available for conversion
+		return 0;
+	}
+
+	int jpeg_decoder::decode(const void** pScan_line, uint* pScan_line_len) // produces one scanline per call: *pScan_line points at it, *pScan_line_len is its byte length. Returns JPGD_SUCCESS, JPGD_DONE when no lines are left, or JPGD_FAILED
+	{
+		if ((m_error_code) || (!m_ready_flag))
+			return JPGD_FAILED;
+
+		if (m_total_lines_left == 0)
+			return JPGD_DONE;
+
+		const bool chroma_y_filtering = (m_flags & cFlagLinearChromaFiltering) && ((m_scan_type == JPGD_YH2V2) || (m_scan_type == JPGD_YH1V2));
+
+		bool get_another_mcu_row = false;
+		bool got_mcu_early = false;
+		if (chroma_y_filtering)
+		{
+			if (m_total_lines_left == m_image_y_size) // very first scanline
+				get_another_mcu_row = true;
+			else if ((m_mcu_lines_left == 1) && (m_total_lines_left > 1)) // last line of this MCU row: fetch the next row now, since the filter also reads it
+			{
+				get_another_mcu_row = true;
+				got_mcu_early = true;
+			}
+		}
+		else
+		{
+			get_another_mcu_row = (m_mcu_lines_left == 0);
+		}
+
+		if (get_another_mcu_row)
+		{
+			int status = decode_next_mcu_row();
+			if (status != 0)
+				return status;
+		}
+
+		switch (m_scan_type) // there is no default case: *pScan_line is left untouched for any other m_scan_type
+		{
+		case JPGD_YH2V2:
+		{
+			if (m_flags & cFlagLinearChromaFiltering)
+			{
+				if (m_num_buffered_scanlines == 1) // second line of a pair converted on the previous call
+				{
+					*pScan_line = m_pScan_line_1;
+				}
+				else if (m_num_buffered_scanlines == 0)
+				{
+					m_num_buffered_scanlines = H2V2ConvertFiltered(); // 1 or 2 lines produced
+					*pScan_line = m_pScan_line_0;
+				}
+
+				m_num_buffered_scanlines--;
+			}
+			else
+			{
+				if ((m_mcu_lines_left & 1) == 0) // convert two lines at once, hand out the second on the next call
+				{
+					H2V2Convert();
+					*pScan_line = m_pScan_line_0;
+				}
+				else
+					*pScan_line = m_pScan_line_1;
+			}
+
+			break;
+		}
+		case JPGD_YH2V1:
+		{
+			if (m_flags & cFlagLinearChromaFiltering)
+				H2V1ConvertFiltered();
+			else
+				H2V1Convert();
+			*pScan_line = m_pScan_line_0;
+			break;
+		}
+		case JPGD_YH1V2:
+		{
+			if (chroma_y_filtering)
+			{
+				H1V2ConvertFiltered();
+				*pScan_line = m_pScan_line_0;
+			}
+			else
+			{
+				if ((m_mcu_lines_left & 1) == 0) // convert two lines at once, hand out the second on the next call
+				{
+					H1V2Convert();
+					*pScan_line = m_pScan_line_0;
+				}
+				else
+					*pScan_line = m_pScan_line_1;
+			}
+
+			break;
+		}
+		case JPGD_YH1V1:
+		{
+			H1V1Convert();
+			*pScan_line = m_pScan_line_0;
+			break;
+		}
+		case JPGD_GRAYSCALE:
+		{
+			gray_convert();
+			*pScan_line = m_pScan_line_0;
+
+			break;
+		}
+		}
+
+		*pScan_line_len = m_real_dest_bytes_per_scan_line;
+
+		if (!got_mcu_early) // decode_next_mcu_row() has just reset m_mcu_lines_left when the row was fetched early
+		{
+			m_mcu_lines_left--;
+		}
+
+		m_total_lines_left--;
+
+		return JPGD_SUCCESS;
+	}
+
+	// Creates the tables needed for efficient Huffman decoding: fills *pH from the code-length counts in m_huff_num[index] and the symbol values in m_huff_val[index].
+	void jpeg_decoder::make_huff_table(int index, huff_tables* pH)
+	{
+		int p, i, l, si;
+		uint8 huffsize[258];
+		uint huffcode[258];
+		uint code;
+		uint subtree;
+		int code_size;
+		int lastp;
+		int nextfreeentry;
+		int currententry;
+
+		pH->ac_table = m_huff_ac[index] != 0;
+
+		p = 0;
+		// Expand the per-length counts into one code length per code, shortest first.
+		for (l = 1; l <= 16; l++)
+		{
+			for (i = 1; i <= m_huff_num[index][l]; i++)
+			{
+				if (p >= 257)
+					stop_decoding(JPGD_DECODE_ERROR);
+				huffsize[p++] = static_cast<uint8>(l);
+			}
+		}
+
+		assert(p < 258);
+		huffsize[p] = 0; // terminator for the loop below
+
+		lastp = p;
+
+		code = 0;
+		si = huffsize[0];
+		p = 0;
+		// Assign consecutive code values within each length, shifting left by one bit per extra bit of length.
+		while (huffsize[p])
+		{
+			while (huffsize[p] == si)
+			{
+				if (p >= 257)
+					stop_decoding(JPGD_DECODE_ERROR);
+				huffcode[p++] = code;
+				code++;
+			}
+
+			code <<= 1;
+			si++;
+		}
+
+		memset(pH->look_up, 0, sizeof(pH->look_up));
+		memset(pH->look_up2, 0, sizeof(pH->look_up2));
+		memset(pH->tree, 0, sizeof(pH->tree));
+		memset(pH->code_size, 0, sizeof(pH->code_size));
+
+		nextfreeentry = -1; // tree nodes are referred to by negative numbers; node n lives at tree[-n - 1]
+
+		p = 0;
+
+		while (p < lastp)
+		{
+			i = m_huff_val[index][p];
+
+			code = huffcode[p];
+			code_size = huffsize[p];
+
+			assert(i < JPGD_HUFF_CODE_SIZE_MAX_LENGTH);
+			pH->code_size[i] = static_cast<uint8>(code_size);
+
+			if (code_size <= 8) // short code: fill every 8-bit look-up slot that starts with it
+			{
+				code <<= (8 - code_size);
+
+				for (l = 1 << (8 - code_size); l > 0; l--)
+				{
+					if (code >= 256)
+						stop_decoding(JPGD_DECODE_ERROR);
+
+					pH->look_up[code] = i;
+
+					bool has_extrabits = false;
+					int extra_bits = 0;
+					int num_extra_bits = i & 15;
+
+					int bits_to_fetch = code_size;
+					if (num_extra_bits)
+					{
+						int total_codesize = code_size + num_extra_bits;
+						if (total_codesize <= 8) // code and its extra bits both fit in the 8-bit index, so the extra bits can be stored in the entry
+						{
+							has_extrabits = true;
+							extra_bits = ((1 << num_extra_bits) - 1) & (code >> (8 - total_codesize));
+
+							if (extra_bits > 0x7FFF)
+								stop_decoding(JPGD_DECODE_ERROR);
+
+							bits_to_fetch += num_extra_bits;
+						}
+					}
+					// look_up2 packs: symbol in the low byte, bits to fetch from bit 8, flag 0x8000 plus the extra bits from bit 16 when present.
+					if (!has_extrabits)
+						pH->look_up2[code] = i | (bits_to_fetch << 8);
+					else
+						pH->look_up2[code] = i | 0x8000 | (extra_bits << 16) | (bits_to_fetch << 8);
+
+					code++;
+				}
+			}
+			else // long code: its first 8 bits select a look-up slot that leads into the tree
+			{
+				subtree = (code >> (code_size - 8)) & 0xFF;
+
+				currententry = pH->look_up[subtree];
+
+				if (currententry == 0)
+				{
+					pH->look_up[subtree] = currententry = nextfreeentry;
+					pH->look_up2[subtree] = currententry = nextfreeentry;
+
+					nextfreeentry -= 2;
+				}
+
+				code <<= (16 - (code_size - 8)); // bring the first bit after the 8-bit prefix up to bit 15
+
+				for (l = code_size; l > 9; l--)
+				{
+					if ((code & 0x8000) == 0)
+						currententry--;
+
+					unsigned int idx = -currententry - 1;
+
+					if (idx >= JPGD_HUFF_TREE_MAX_LENGTH)
+						stop_decoding(JPGD_DECODE_ERROR);
+
+					if (pH->tree[idx] == 0)
+					{
+						pH->tree[idx] = nextfreeentry;
+
+						currententry = nextfreeentry;
+
+						nextfreeentry -= 2;
+					}
+					else
+					{
+						currententry = pH->tree[idx];
+					}
+
+					code <<= 1;
+				}
+
+				if ((code & 0x8000) == 0)
+					currententry--;
+				// Unsigned like idx above: a positive currententry (slot already holding a short code's symbol) would give a negative index that a signed compare lets through.
+				const unsigned int leaf_idx = -currententry - 1;
+				if (leaf_idx >= JPGD_HUFF_TREE_MAX_LENGTH)
+					stop_decoding(JPGD_DECODE_ERROR);
+				pH->tree[leaf_idx] = i;
+			}
+
+			p++;
+		}
+	}
+
+	// Reports JPGD_UNDEFINED_QUANT_TABLE if any component of the current scan selects a quantization table whose m_quant entry is still null.
+	void jpeg_decoder::check_quant_tables()
+	{
+		for (int scan_idx = 0; scan_idx < m_comps_in_scan; ++scan_idx)
+			if (!m_quant[m_comp_quant[m_comp_list[scan_idx]]])
+				stop_decoding(JPGD_UNDEFINED_QUANT_TABLE);
+	}
+
+	// Verifies that every Huffman table the current scan refers to is defined, then (re)builds the decoding tables of all defined Huffman tables.
+	void jpeg_decoder::check_huff_tables()
+	{
+		const bool needs_dc = (m_spectral_start == 0);
+		const bool needs_ac = (m_spectral_end > 0);
+		for (int n = 0; n < m_comps_in_scan; n++)
+		{
+			if (needs_dc && !m_huff_num[m_comp_dc_tab[m_comp_list[n]]])
+				stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+			if (needs_ac && !m_huff_num[m_comp_ac_tab[m_comp_list[n]]])
+				stop_decoding(JPGD_UNDEFINED_HUFF_TABLE);
+		}
+
+		for (int t = 0; t < JPGD_MAX_HUFF_TABLES; t++)
+		{
+			if (!m_huff_num[t]) continue; // table slot not defined
+			if (!m_pHuff_tabs[t])
+				m_pHuff_tabs[t] = (huff_tables*)alloc(sizeof(huff_tables));
+			make_huff_table(t, m_pHuff_tabs[t]);
+		}
+	}
+
+	// Determines the component order inside each MCU.
+	// Also calcs how many MCU's are on each row, etc. Returns false if the MCU needs more blocks than m_max_blocks_per_mcu or an m_mcu_org entry is out of range.
+	bool jpeg_decoder::calc_mcu_block_order()
+	{
+		int component_num, component_id;
+		int max_h_samp = 0, max_v_samp = 0;
+		// Largest horizontal and vertical sampling factors over all frame components.
+		for (component_id = 0; component_id < m_comps_in_frame; component_id++)
+		{
+			if (m_comp_h_samp[component_id] > max_h_samp)
+				max_h_samp = m_comp_h_samp[component_id];
+
+			if (m_comp_v_samp[component_id] > max_v_samp)
+				max_v_samp = m_comp_v_samp[component_id];
+		}
+		// Per component: image size scaled by samp / max_samp (rounded up), then rounded up to whole 8x8 blocks.
+		for (component_id = 0; component_id < m_comps_in_frame; component_id++)
+		{
+			m_comp_h_blocks[component_id] = ((((m_image_x_size * m_comp_h_samp[component_id]) + (max_h_samp - 1)) / max_h_samp) + 7) / 8;
+			m_comp_v_blocks[component_id] = ((((m_image_y_size * m_comp_v_samp[component_id]) + (max_v_samp - 1)) / max_v_samp) + 7) / 8;
+		}
+
+		if (m_comps_in_scan == 1) // single-component scan: one block per MCU, so MCU counts equal that component's block counts
+		{
+			m_mcus_per_row = m_comp_h_blocks[m_comp_list[0]];
+			m_mcus_per_col = m_comp_v_blocks[m_comp_list[0]];
+		}
+		else
+		{
+			m_mcus_per_row = (((m_image_x_size + 7) / 8) + (max_h_samp - 1)) / max_h_samp;
+			m_mcus_per_col = (((m_image_y_size + 7) / 8) + (max_v_samp - 1)) / max_v_samp;
+		}
+
+		if (m_comps_in_scan == 1)
+		{
+			m_mcu_org[0] = m_comp_list[0];
+
+			m_blocks_per_mcu = 1;
+		}
+		else
+		{
+			m_blocks_per_mcu = 0;
+
+			for (component_num = 0; component_num < m_comps_in_scan; component_num++)
+			{
+				int num_blocks;
+
+				component_id = m_comp_list[component_num];
+
+				num_blocks = m_comp_h_samp[component_id] * m_comp_v_samp[component_id];
+				// NOTE(review): m_mcu_org is written before m_blocks_per_mcu is checked below - confirm the sampling factors are bounded elsewhere so this cannot run past the array.
+				while (num_blocks--)
+					m_mcu_org[m_blocks_per_mcu++] = component_id;
+			}
+		}
+
+		if (m_blocks_per_mcu > m_max_blocks_per_mcu)
+			return false;
+
+		for (int mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
+		{
+			int comp_id = m_mcu_org[mcu_block];
+			if (comp_id >= JPGD_MAX_QUANT_TABLES) // component ids are compared against the quant-table limit here
+				return false;
+		}
+
+		return true;
+	}
+
+	// Prepares the decoder for the next scan.
+	// Returns JPGD_FALSE when locate_sos_marker() or calc_mcu_block_order() reports failure;
+	// otherwise validates the scan's tables, resets the per-scan state and returns JPGD_TRUE.
+	int jpeg_decoder::init_scan()
+	{
+		if (!locate_sos_marker() || !calc_mcu_block_order())
+			return JPGD_FALSE;
+
+		check_huff_tables();
+		check_quant_tables();
+
+		// The running DC values start again from zero in every scan.
+		memset(m_last_dc_val, 0, m_comps_in_frame * sizeof(uint));
+		m_eob_run = 0;
+
+		if (m_restart_interval != 0)
+		{
+			m_next_restart_num = 0;
+			m_restarts_left = m_restart_interval;
+		}
+
+		fix_in_buffer();
+
+		return JPGD_TRUE;
+	}
+
+	// Starts a frame. Determines if the number of components or sampling factors
+	// are supported.
+	// Supported layouts: one component sampled 1x1 (grayscale), or three components where
+	// components 1 and 2 are sampled 1x1 and component 0 is sampled 1x1, 2x1, 1x2 or 2x2.
+	// Anything else ends decoding through stop_decoding(). On success the scan-line buffers,
+	// the single-MCU coefficient buffer and the sample buffers are allocated.
+	void jpeg_decoder::init_frame()
+	{
+		int i;
+
+		if (m_comps_in_frame == 1)
+		{
+			if ((m_comp_h_samp[0] != 1) || (m_comp_v_samp[0] != 1))
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+
+			m_scan_type = JPGD_GRAYSCALE;
+			m_max_blocks_per_mcu = 1;
+			m_max_mcu_x_size = 8;
+			m_max_mcu_y_size = 8;
+		}
+		else if (m_comps_in_frame == 3)
+		{
+			if (((m_comp_h_samp[1] != 1) || (m_comp_v_samp[1] != 1)) ||
+				((m_comp_h_samp[2] != 1) || (m_comp_v_samp[2] != 1)))
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+
+			if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 1))
+			{
+				m_scan_type = JPGD_YH1V1;
+
+				m_max_blocks_per_mcu = 3;
+				m_max_mcu_x_size = 8;
+				m_max_mcu_y_size = 8;
+			}
+			else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 1))
+			{
+				m_scan_type = JPGD_YH2V1;
+				m_max_blocks_per_mcu = 4;
+				m_max_mcu_x_size = 16;
+				m_max_mcu_y_size = 8;
+			}
+			else if ((m_comp_h_samp[0] == 1) && (m_comp_v_samp[0] == 2))
+			{
+				m_scan_type = JPGD_YH1V2;
+				m_max_blocks_per_mcu = 4;
+				m_max_mcu_x_size = 8;
+				m_max_mcu_y_size = 16;
+			}
+			else if ((m_comp_h_samp[0] == 2) && (m_comp_v_samp[0] == 2))
+			{
+				m_scan_type = JPGD_YH2V2;
+				m_max_blocks_per_mcu = 6;
+				m_max_mcu_x_size = 16;
+				m_max_mcu_y_size = 16;
+			}
+			else
+				stop_decoding(JPGD_UNSUPPORTED_SAMP_FACTORS);
+		}
+		else
+			stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+
+		// Number of MCUs needed to cover the image, rounding partial MCUs up.
+		m_max_mcus_per_row = (m_image_x_size + (m_max_mcu_x_size - 1)) / m_max_mcu_x_size;
+		m_max_mcus_per_col = (m_image_y_size + (m_max_mcu_y_size - 1)) / m_max_mcu_y_size;
+
+		// These values are for the *destination* pixels: after conversion.
+		if (m_scan_type == JPGD_GRAYSCALE)
+			m_dest_bytes_per_pixel = 1;
+		else
+			m_dest_bytes_per_pixel = 4;
+
+		// Round the width up to a multiple of 16 pixels. ~15 is used instead of a 16-bit mask so the
+		// rounding stays correct for any int width rather than silently truncating widths >= 65536.
+		m_dest_bytes_per_scan_line = ((m_image_x_size + 15) & ~15) * m_dest_bytes_per_pixel;
+
+		m_real_dest_bytes_per_scan_line = (m_image_x_size * m_dest_bytes_per_pixel);
+
+		// Initialize two scan line buffers.
+		// The second one is only allocated for the scan types with 2x vertical sampling.
+		m_pScan_line_0 = (uint8*)alloc(m_dest_bytes_per_scan_line, true);
+		if ((m_scan_type == JPGD_YH1V2) || (m_scan_type == JPGD_YH2V2))
+			m_pScan_line_1 = (uint8*)alloc(m_dest_bytes_per_scan_line, true);
+
+		m_max_blocks_per_row = m_max_mcus_per_row * m_max_blocks_per_mcu;
+
+		// Should never happen
+		if (m_max_blocks_per_row > JPGD_MAX_BLOCKS_PER_ROW)
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		// Allocate the coefficient buffer, enough for one MCU
+		m_pMCU_coefficients = (jpgd_block_t*)alloc(m_max_blocks_per_mcu * 64 * sizeof(jpgd_block_t));
+
+		for (i = 0; i < m_max_blocks_per_mcu; i++)
+			m_mcu_block_max_zag[i] = 64;
+
+		// 64 sample bytes per block for one full row of blocks.
+		m_pSample_buf = (uint8*)alloc(m_max_blocks_per_row * 64);
+		m_pSample_buf_prev = (uint8*)alloc(m_max_blocks_per_row * 64);
+
+		m_total_lines_left = m_image_y_size;
+
+		m_mcu_lines_left = 0;
+
+		create_look_ups();
+	}
+
+	// The coeff_buf series of methods originally stored the coefficients
+	// into a "virtual" file which was located in EMS, XMS, or a disk file. A cache
+	// was used to make this process more efficient. Now, we can store the entire
+	// thing in RAM.
+	//
+	// Allocates a zero-filled coefficient buffer holding block_num_x * block_num_y blocks, each
+	// made of block_len_x * block_len_y jpgd_block_t values, and returns its descriptor.
+	// Both the descriptor and the data come from alloc().
+	jpeg_decoder::coeff_buf* jpeg_decoder::coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y)
+	{
+		coeff_buf* cb = (coeff_buf*)alloc(sizeof(coeff_buf));
+
+		cb->block_num_x = block_num_x;
+		cb->block_num_y = block_num_y;
+		cb->block_len_x = block_len_x;
+		cb->block_len_y = block_len_y;
+		cb->block_size = (block_len_x * block_len_y) * sizeof(jpgd_block_t);
+
+		// Compute the total size in size_t. The product does not fit in an int for the largest
+		// images the decoder accepts (e.g. 128 bytes * 4096 * 4096 blocks == 2^31), so plain int
+		// arithmetic would overflow before the value ever reached alloc().
+		const size_t total_bytes = static_cast<size_t>(cb->block_size) * static_cast<size_t>(block_num_x) * static_cast<size_t>(block_num_y);
+
+		cb->pData = (uint8*)alloc(total_bytes, true);
+		return cb;
+	}
+
+	// Returns a pointer to the coefficients of block (block_x, block_y) inside cb.
+	// Blocks are laid out row by row, block_num_x blocks per row. An index at or beyond the
+	// buffer's dimensions ends decoding with JPGD_DECODE_ERROR.
+	inline jpgd_block_t* jpeg_decoder::coeff_buf_getp(coeff_buf* cb, int block_x, int block_y)
+	{
+		const bool in_range = (block_x < cb->block_num_x) && (block_y < cb->block_num_y);
+		if (!in_range)
+			stop_decoding(JPGD_DECODE_ERROR);
+
+		const int row_bytes = cb->block_size * cb->block_num_x;
+		uint8* pBlock = cb->pData + block_x * cb->block_size + block_y * row_bytes;
+
+		return (jpgd_block_t*)pBlock;
+	}
+
+	// The following methods decode the various types of m_blocks encountered
+	// in progressively encoded images.
+	// First-pass DC decode for one block of a progressive image.
+	// Huffman-decodes the size of the DC difference, reads that many extra bits, adds the result
+	// to the component's running DC value and stores it, shifted left by m_successive_low, in
+	// the block's slot of the DC coefficient buffer.
+	void jpeg_decoder::decode_block_dc_first(jpeg_decoder* pD, int component_id, int block_x, int block_y)
+	{
+		int s, r;
+		jpgd_block_t* p = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+
+		// Validate the table selector before indexing m_pHuff_tabs, as the AC block decoders do.
+		unsigned int idx = pD->m_comp_dc_tab[component_id];
+		if (idx >= JPGD_MAX_HUFF_TABLES)
+			pD->stop_decoding(JPGD_DECODE_ERROR);
+
+		if ((s = pD->huff_decode(pD->m_pHuff_tabs[idx])) != 0)
+		{
+			// A difference size of 16 bits or more is rejected as a corrupt stream.
+			if (s >= 16)
+				pD->stop_decoding(JPGD_DECODE_ERROR);
+
+			r = pD->get_bits_no_markers(s);
+			s = JPGD_HUFF_EXTEND(r, s);
+		}
+
+		pD->m_last_dc_val[component_id] = (s += pD->m_last_dc_val[component_id]);
+
+		p[0] = static_cast<jpgd_block_t>(s << pD->m_successive_low);
+	}
+
+	// DC refinement pass for one block: reads a single bit and, when it is set, ORs the bit at
+	// position m_successive_low into the block's stored DC coefficient.
+	void jpeg_decoder::decode_block_dc_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y)
+	{
+		if (!pD->get_bits_no_markers(1))
+			return;
+
+		jpgd_block_t* pCoeff = pD->coeff_buf_getp(pD->m_dc_coeffs[component_id], block_x, block_y);
+		const int refine_bit = 1 << pD->m_successive_low;
+
+		pCoeff[0] |= refine_bit;
+	}
+
+	// First-pass AC decode for one block of a progressive image.
+	// Decodes the coefficients in zig-zag positions m_spectral_start..m_spectral_end and stores
+	// each non-zero value, shifted left by m_successive_low, in the block's slot of the AC
+	// coefficient buffer. Blocks covered by a pending m_eob_run are skipped without reading data.
+	void jpeg_decoder::decode_block_ac_first(jpeg_decoder* pD, int component_id, int block_x, int block_y)
+	{
+		int k, s, r;
+
+		// A previously decoded run still covers this block: consume one count and leave it untouched.
+		if (pD->m_eob_run)
+		{
+			pD->m_eob_run--;
+			return;
+		}
+
+		jpgd_block_t* p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+
+		for (k = pD->m_spectral_start; k <= pD->m_spectral_end; k++)
+		{
+			unsigned int idx = pD->m_comp_ac_tab[component_id];
+			if (idx >= JPGD_MAX_HUFF_TABLES)
+				pD->stop_decoding(JPGD_DECODE_ERROR);
+
+			s = pD->huff_decode(pD->m_pHuff_tabs[idx]);
+
+			// Split the decoded symbol: high nibble -> r, low nibble -> s.
+			r = s >> 4;
+			s &= 15;
+
+			if (s)
+			{
+				// Skip r positions, then read an s-bit value for the coefficient at k.
+				if ((k += r) > 63)
+					pD->stop_decoding(JPGD_DECODE_ERROR);
+
+				r = pD->get_bits_no_markers(s);
+				s = JPGD_HUFF_EXTEND(r, s);
+
+				p[g_ZAG[k]] = static_cast<jpgd_block_t>(s << pD->m_successive_low);
+			}
+			else
+			{
+				if (r == 15)
+				{
+					// s == 0 with r == 15: advance 15 positions here; the loop's k++ adds one more.
+					if ((k += 15) > 63)
+						pD->stop_decoding(JPGD_DECODE_ERROR);
+				}
+				else
+				{
+					// s == 0 with r < 15: start a run of (1 << r) blocks plus r extra bits read from the stream.
+					pD->m_eob_run = 1 << r;
+
+					if (r)
+						pD->m_eob_run += pD->get_bits_no_markers(r);
+
+					// The current block is the first block of that run.
+					pD->m_eob_run--;
+
+					break;
+				}
+			}
+		}
+	}
+
+	// AC refinement pass for one block of a progressive image.
+	// Walks zig-zag positions m_spectral_start..m_spectral_end of the block's stored AC
+	// coefficients. Coefficients that are already non-zero may receive a correction of p1 or m1
+	// (one bit read per coefficient); newly non-zero coefficients are placed after skipping r
+	// zero-valued positions. While m_eob_run is non-zero only the correction step is performed.
+	void jpeg_decoder::decode_block_ac_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y)
+	{
+		int s, k, r;
+
+		// p1 is +1 and m1 is -1, both shifted to bit position m_successive_low.
+		int p1 = 1 << pD->m_successive_low;
+
+		//int m1 = (-1) << pD->m_successive_low;
+		int m1 = static_cast<int>((UINT32_MAX << pD->m_successive_low));
+
+		jpgd_block_t* p = pD->coeff_buf_getp(pD->m_ac_coeffs[component_id], block_x, block_y);
+		if (pD->m_spectral_end > 63)
+			pD->stop_decoding(JPGD_DECODE_ERROR);
+
+		k = pD->m_spectral_start;
+
+		if (pD->m_eob_run == 0)
+		{
+			for (; k <= pD->m_spectral_end; k++)
+			{
+				unsigned int idx = pD->m_comp_ac_tab[component_id];
+				if (idx >= JPGD_MAX_HUFF_TABLES)
+					pD->stop_decoding(JPGD_DECODE_ERROR);
+
+				s = pD->huff_decode(pD->m_pHuff_tabs[idx]);
+
+				// Split the decoded symbol: high nibble -> r, low nibble -> s.
+				r = s >> 4;
+				s &= 15;
+
+				if (s)
+				{
+					// In a refinement scan the only valid non-zero size is 1.
+					if (s != 1)
+						pD->stop_decoding(JPGD_DECODE_ERROR);
+
+					// One bit selects the value of the new coefficient: p1 or m1.
+					if (pD->get_bits_no_markers(1))
+						s = p1;
+					else
+						s = m1;
+				}
+				else
+				{
+					if (r != 15)
+					{
+						// s == 0 with r < 15: start a run of (1 << r) blocks plus r extra bits; the
+						// rest of this block is handled by the m_eob_run branch below.
+						pD->m_eob_run = 1 << r;
+
+						if (r)
+							pD->m_eob_run += pD->get_bits_no_markers(r);
+
+						break;
+					}
+				}
+
+				// Advance k, correcting non-zero coefficients on the way, until r zero-valued
+				// positions have been passed or the end of the band is reached.
+				do
+				{
+					jpgd_block_t* this_coef = p + g_ZAG[k & 63];
+
+					if (*this_coef != 0)
+					{
+						if (pD->get_bits_no_markers(1))
+						{
+							// Apply the correction only if this bit position is not already set.
+							if ((*this_coef & p1) == 0)
+							{
+								if (*this_coef >= 0)
+									*this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
+								else
+									*this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
+							}
+						}
+					}
+					else
+					{
+						if (--r < 0)
+							break;
+					}
+
+					k++;
+
+				} while (k <= pD->m_spectral_end);
+
+				// Store the newly non-zero coefficient at the position where the walk stopped.
+				if ((s) && (k < 64))
+				{
+					p[g_ZAG[k]] = static_cast<jpgd_block_t>(s);
+				}
+			}
+		}
+
+		if (pD->m_eob_run > 0)
+		{
+			// Inside a run: no new coefficients, only corrections to the remaining non-zero ones.
+			for (; k <= pD->m_spectral_end; k++)
+			{
+				jpgd_block_t* this_coef = p + g_ZAG[k & 63]; // logical AND to shut up static code analysis
+
+				if (*this_coef != 0)
+				{
+					if (pD->get_bits_no_markers(1))
+					{
+						if ((*this_coef & p1) == 0)
+						{
+							if (*this_coef >= 0)
+								*this_coef = static_cast<jpgd_block_t>(*this_coef + p1);
+							else
+								*this_coef = static_cast<jpgd_block_t>(*this_coef + m1);
+						}
+					}
+				}
+			}
+
+			pD->m_eob_run--;
+		}
+	}
+
+	// Decode a scan in a progressively encoded image.
+	// Visits every block of every MCU in the scan and calls decode_block_func on it with the
+	// block's component index and its (x, y) block coordinates within that component.
+	void jpeg_decoder::decode_scan(pDecode_block_func decode_block_func)
+	{
+		int mcu_row, mcu_col, mcu_block;
+		// Block coordinates of the current MCU's top-left block, tracked per component.
+		int block_x_mcu[JPGD_MAX_COMPONENTS], block_y_mcu[JPGD_MAX_COMPONENTS];
+
+		memset(block_y_mcu, 0, sizeof(block_y_mcu));
+
+		// Note the naming: mcu_col counts rows of MCUs (up to m_mcus_per_col) and mcu_row counts
+		// the MCUs within one such row (up to m_mcus_per_row).
+		for (mcu_col = 0; mcu_col < m_mcus_per_col; mcu_col++)
+		{
+			int component_num, component_id;
+
+			memset(block_x_mcu, 0, sizeof(block_x_mcu));
+
+			for (mcu_row = 0; mcu_row < m_mcus_per_row; mcu_row++)
+			{
+				// Offset of the current block inside the MCU for the component being walked.
+				int block_x_mcu_ofs = 0, block_y_mcu_ofs = 0;
+
+				if ((m_restart_interval) && (m_restarts_left == 0))
+					process_restart();
+
+				for (mcu_block = 0; mcu_block < m_blocks_per_mcu; mcu_block++)
+				{
+					component_id = m_mcu_org[mcu_block];
+
+					decode_block_func(this, component_id, block_x_mcu[component_id] + block_x_mcu_ofs, block_y_mcu[component_id] + block_y_mcu_ofs);
+
+					if (m_comps_in_scan == 1)
+						block_x_mcu[component_id]++;
+					else
+					{
+						// Step through the component's h_samp x v_samp blocks row by row; once all
+						// of them are done, move this component's x position to the next MCU.
+						if (++block_x_mcu_ofs == m_comp_h_samp[component_id])
+						{
+							block_x_mcu_ofs = 0;
+
+							if (++block_y_mcu_ofs == m_comp_v_samp[component_id])
+							{
+								block_y_mcu_ofs = 0;
+								block_x_mcu[component_id] += m_comp_h_samp[component_id];
+							}
+						}
+					}
+				}
+
+				// Decremented for every MCU, whether or not a restart interval is in use.
+				m_restarts_left--;
+			}
+
+			// Advance each scanned component's y position to the next row of MCUs.
+			if (m_comps_in_scan == 1)
+				block_y_mcu[m_comp_list[0]]++;
+			else
+			{
+				for (component_num = 0; component_num < m_comps_in_scan; component_num++)
+				{
+					component_id = m_comp_list[component_num];
+					block_y_mcu[component_id] += m_comp_v_samp[component_id];
+				}
+			}
+		}
+	}
+
+	// Decodes all scans of a progressively encoded image into the per-component DC and AC
+	// coefficient buffers, then restores the scan description to "all frame components" and
+	// recomputes the MCU block order for it.
+	// Four-component frames are rejected, as are scans with inconsistent spectral-selection or
+	// successive-approximation parameters, and streams with more than 1000 scans.
+	void jpeg_decoder::init_progressive()
+	{
+		if (m_comps_in_frame == 4)
+			stop_decoding(JPGD_UNSUPPORTED_COLORSPACE);
+
+		// Allocate the coefficient buffers: one DC value and one 8x8 AC block per image block.
+		for (int comp = 0; comp < m_comps_in_frame; comp++)
+		{
+			m_dc_coeffs[comp] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[comp], m_max_mcus_per_col * m_comp_v_samp[comp], 1, 1);
+			m_ac_coeffs[comp] = coeff_buf_open(m_max_mcus_per_row * m_comp_h_samp[comp], m_max_mcus_per_col * m_comp_v_samp[comp], 8, 8);
+		}
+
+		// See https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf
+		const uint32_t MAX_SCANS_TO_PROCESS = 1000;
+		uint32_t total_scans = 0;
+
+		// Keep going until init_scan() reports that no further scan could be started.
+		while (init_scan())
+		{
+			const bool dc_only_scan = (m_spectral_start == 0);
+			const bool refinement_scan = (m_successive_high != 0);
+
+			if ((m_spectral_start > m_spectral_end) || (m_spectral_end > 63))
+				stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+			if (dc_only_scan)
+			{
+				if (m_spectral_end)
+					stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+			}
+			else if (m_comps_in_scan != 1)  /* AC scans can only contain one component */
+				stop_decoding(JPGD_BAD_SOS_SPECTRAL);
+
+			if (refinement_scan && (m_successive_low != m_successive_high - 1))
+				stop_decoding(JPGD_BAD_SOS_SUCCESSIVE);
+
+			// Pick the block decoder matching the scan type (DC/AC, first/refinement).
+			pDecode_block_func decode_block_func;
+			if (dc_only_scan)
+				decode_block_func = refinement_scan ? decode_block_dc_refine : decode_block_dc_first;
+			else
+				decode_block_func = refinement_scan ? decode_block_ac_refine : decode_block_ac_first;
+
+			decode_scan(decode_block_func);
+
+			m_bits_left = 16;
+			get_bits(16);
+			get_bits(16);
+
+			if (++total_scans > MAX_SCANS_TO_PROCESS)
+				stop_decoding(JPGD_TOO_MANY_SCANS);
+		}
+
+		m_comps_in_scan = m_comps_in_frame;
+
+		for (int comp = 0; comp < m_comps_in_frame; comp++)
+			m_comp_list[comp] = comp;
+
+		if (!calc_mcu_block_order())
+			stop_decoding(JPGD_DECODE_ERROR);
+	}
+
+	// Sets up decoding of a non-progressive image: the first scan must start successfully,
+	// otherwise decoding ends with JPGD_UNEXPECTED_MARKER.
+	void jpeg_decoder::init_sequential()
+	{
+		if (init_scan())
+			return;
+
+		stop_decoding(JPGD_UNEXPECTED_MARKER);
+	}
+
+	// Initializes the frame, then runs the progressive or sequential setup depending on
+	// m_progressive_flag.
+	void jpeg_decoder::decode_start()
+	{
+		init_frame();
+
+		if (!m_progressive_flag)
+		{
+			init_sequential();
+			return;
+		}
+
+		init_progressive();
+	}
+
+	// Resets the decoder for pStream with the given flags (init), then calls
+	// locate_sof_marker() on the stream.
+	void jpeg_decoder::decode_init(jpeg_decoder_stream* pStream, uint32_t flags)
+	{
+		init(pStream, flags);
+		locate_sof_marker();
+	}
+
+	// Constructs a decoder for pStream and runs decode_init() on it.
+	// The setjmp() establishes the recovery point for this call; stop_decoding() is declared
+	// noreturn and presumably longjmps back here (its body is not in view), in which case the
+	// constructor simply returns. Callers are expected to check get_error_code() afterwards.
+	jpeg_decoder::jpeg_decoder(jpeg_decoder_stream* pStream, uint32_t flags)
+	{
+		if (setjmp(m_jmp_state))
+			return;
+		decode_init(pStream, flags);
+	}
+
+	// Prepares the decoder for decode() calls.
+	// Returns JPGD_SUCCESS if decoding is ready (including when it was already made ready by an
+	// earlier call) and JPGD_FAILED if an error code is already set or setup fails.
+	int jpeg_decoder::begin_decoding()
+	{
+		if (m_ready_flag)
+			return JPGD_SUCCESS;
+
+		if (m_error_code)
+			return JPGD_FAILED;
+
+		// Recovery point for this call: a non-zero return from setjmp means control came back
+		// via longjmp (presumably from stop_decoding(), whose body is not in view).
+		if (setjmp(m_jmp_state))
+			return JPGD_FAILED;
+
+		decode_start();
+
+		m_ready_flag = true;
+
+		return JPGD_SUCCESS;
+	}
+
+	// Releases the decoder's memory blocks through free_all_blocks().
+	jpeg_decoder::~jpeg_decoder()
+	{
+		free_all_blocks();
+	}
+
+	// Creates a stream with no file attached and both status flags cleared.
+	jpeg_decoder_file_stream::jpeg_decoder_file_stream() :
+		m_pFile(nullptr),
+		m_eof_flag(false),
+		m_error_flag(false)
+	{
+	}
+
+	void jpeg_decoder_file_stream::close()
+	{
+		if (m_pFile)
+		{
+			fclose(m_pFile);
+			m_pFile = nullptr;
+		}
+
+		m_eof_flag = false;
+		m_error_flag = false;
+	}
+
+	// Closes the file (if one is open) when the stream is destroyed.
+	jpeg_decoder_file_stream::~jpeg_decoder_file_stream()
+	{
+		close();
+	}
+
+	// Opens Pfilename for binary reading, closing any previously attached file first.
+	// Returns true if the file was opened, false otherwise.
+	bool jpeg_decoder_file_stream::open(const char* Pfilename)
+	{
+		close();
+
+		m_eof_flag = false;
+		m_error_flag = false;
+
+		// MSVC builds use fopen_s; every other compiler uses plain fopen.
+#if defined(_MSC_VER)
+		m_pFile = nullptr;
+		fopen_s(&m_pFile, Pfilename, "rb");
+#else
+		m_pFile = fopen(Pfilename, "rb");
+#endif
+		return m_pFile != nullptr;
+	}
+
+	// Reads up to max_bytes_to_read bytes from the file into pBuf.
+	// Returns the number of bytes read, or -1 when no file is open or a read error has occurred
+	// (now or on an earlier call). *pEOF_flag is set to true once the end of the file has been
+	// reached; it is never written otherwise.
+	int jpeg_decoder_file_stream::read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag)
+	{
+		if (m_pFile == nullptr)
+			return -1;
+
+		if (m_eof_flag)
+		{
+			*pEOF_flag = true;
+			return 0;
+		}
+
+		if (m_error_flag)
+			return -1;
+
+		const int num_read = static_cast<int>(fread(pBuf, 1, max_bytes_to_read, m_pFile));
+		if (num_read >= max_bytes_to_read)
+			return num_read;
+
+		// Short read: either an I/O error or the end of the file.
+		if (ferror(m_pFile))
+		{
+			m_error_flag = true;
+			return -1;
+		}
+
+		m_eof_flag = true;
+		*pEOF_flag = true;
+
+		return num_read;
+	}
+
+	// Attaches the stream to the size-byte buffer at pSrc_data and rewinds to its start.
+	// The buffer is not copied. Always returns true.
+	bool jpeg_decoder_mem_stream::open(const uint8* pSrc_data, uint size)
+	{
+		close();
+
+		m_size = size;
+		m_ofs = 0;
+		m_pSrc_data = pSrc_data;
+
+		return true;
+	}
+
+	// Copies up to max_bytes_to_read bytes from the current position of the buffer into pBuf and
+	// advances the position. Returns the number of bytes copied, or -1 if no buffer is attached.
+	// *pEOF_flag is set to true when fewer bytes remained than were requested, false otherwise.
+	int jpeg_decoder_mem_stream::read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag)
+	{
+		*pEOF_flag = false;
+
+		if (m_pSrc_data == nullptr)
+			return -1;
+
+		const uint avail = m_size - m_ofs;
+		const bool request_exceeds_data = ((uint)max_bytes_to_read > avail);
+
+		if (request_exceeds_data)
+		{
+			*pEOF_flag = true;
+			max_bytes_to_read = avail;
+		}
+
+		const uint8* pFrom = m_pSrc_data + m_ofs;
+		memcpy(pBuf, pFrom, max_bytes_to_read);
+		m_ofs += max_bytes_to_read;
+
+		return max_bytes_to_read;
+	}
+
+	// Decodes a whole JPEG image from pStream into a newly allocated buffer.
+	//
+	// pStream      - source of the compressed data.
+	// width/height - receive the image dimensions.
+	// actual_comps - receives the number of components in the file (1 or 3); set to 0 on early failure.
+	// req_comps    - layout of the returned pixels: 1 (grayscale), 3 (RGB) or 4 (RGBA).
+	// flags        - passed through to the jpeg_decoder constructor.
+	//
+	// Returns a buffer of width * height * req_comps bytes obtained from jpgd_malloc(), or nullptr
+	// on invalid arguments, decode failure, or if the buffer cannot be allocated.
+	unsigned char* decompress_jpeg_image_from_stream(jpeg_decoder_stream* pStream, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags)
+	{
+		if (!actual_comps)
+			return nullptr;
+		*actual_comps = 0;
+
+		if ((!pStream) || (!width) || (!height) || (!req_comps))
+			return nullptr;
+
+		if ((req_comps != 1) && (req_comps != 3) && (req_comps != 4))
+			return nullptr;
+
+		jpeg_decoder decoder(pStream, flags);
+		if (decoder.get_error_code() != JPGD_SUCCESS)
+			return nullptr;
+
+		const int image_width = decoder.get_width(), image_height = decoder.get_height();
+		*width = image_width;
+		*height = image_height;
+		*actual_comps = decoder.get_num_components();
+
+		if (decoder.begin_decoding() != JPGD_SUCCESS)
+			return nullptr;
+
+		// All output sizes and offsets are computed in size_t: width * req_comps * height does not
+		// fit in an int for large images (e.g. 32768 x 32768 RGBA), and an overflowed int here
+		// would produce an undersized buffer that the row copies below then write past.
+		const size_t dst_bpl = static_cast<size_t>(image_width) * static_cast<size_t>(req_comps);
+		const size_t num_rows = static_cast<size_t>(image_height);
+
+		// Also refuse sizes that do not fit in size_t itself (possible where size_t is 32 bits).
+		if ((num_rows != 0) && (dst_bpl > (SIZE_MAX / num_rows)))
+			return nullptr;
+
+		uint8* pImage_data = (uint8*)jpgd_malloc(dst_bpl * num_rows);
+		if (!pImage_data)
+			return nullptr;
+
+		for (int y = 0; y < image_height; y++)
+		{
+			const uint8* pScan_line;
+			uint scan_line_len;
+			if (decoder.decode((const void**)&pScan_line, &scan_line_len) != JPGD_SUCCESS)
+			{
+				jpgd_free(pImage_data);
+				return nullptr;
+			}
+
+			uint8* pDst = pImage_data + static_cast<size_t>(y) * dst_bpl;
+
+			// The decoder emits 1 byte per pixel for grayscale and 4 bytes per pixel otherwise, so
+			// 1->1 and 3->4 are straight copies; every other combination is converted per pixel.
+			if (((req_comps == 1) && (decoder.get_num_components() == 1)) || ((req_comps == 4) && (decoder.get_num_components() == 3)))
+				memcpy(pDst, pScan_line, dst_bpl);
+			else if (decoder.get_num_components() == 1)
+			{
+				if (req_comps == 3)
+				{
+					for (int x = 0; x < image_width; x++)
+					{
+						uint8 luma = pScan_line[x];
+						pDst[0] = luma;
+						pDst[1] = luma;
+						pDst[2] = luma;
+						pDst += 3;
+					}
+				}
+				else
+				{
+					for (int x = 0; x < image_width; x++)
+					{
+						uint8 luma = pScan_line[x];
+						pDst[0] = luma;
+						pDst[1] = luma;
+						pDst[2] = luma;
+						pDst[3] = 255;
+						pDst += 4;
+					}
+				}
+			}
+			else if (decoder.get_num_components() == 3)
+			{
+				if (req_comps == 1)
+				{
+					// 16.16 fixed-point luma weights; they sum to 65536 and 32768 rounds to nearest.
+					const int YR = 19595, YG = 38470, YB = 7471;
+					for (int x = 0; x < image_width; x++)
+					{
+						int r = pScan_line[x * 4 + 0];
+						int g = pScan_line[x * 4 + 1];
+						int b = pScan_line[x * 4 + 2];
+						*pDst++ = static_cast<uint8>((r * YR + g * YG + b * YB + 32768) >> 16);
+					}
+				}
+				else
+				{
+					for (int x = 0; x < image_width; x++)
+					{
+						pDst[0] = pScan_line[x * 4 + 0];
+						pDst[1] = pScan_line[x * 4 + 1];
+						pDst[2] = pScan_line[x * 4 + 2];
+						pDst += 3;
+					}
+				}
+			}
+		}
+
+		return pImage_data;
+	}
+
+	// Decodes a JPEG image held in the src_data_size bytes at pSrc_data.
+	// See decompress_jpeg_image_from_stream() for the meaning of the remaining parameters and
+	// of the return value.
+	unsigned char* decompress_jpeg_image_from_memory(const unsigned char* pSrc_data, int src_data_size, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags)
+	{
+		// The memory stream takes an unsigned size, so a negative value would turn into a huge
+		// length and let the decoder read past the buffer. Treat it as an empty buffer instead,
+		// which makes decoding fail cleanly.
+		const uint safe_size = (src_data_size > 0) ? static_cast<uint>(src_data_size) : 0;
+
+		jpgd::jpeg_decoder_mem_stream mem_stream(pSrc_data, safe_size);
+		return decompress_jpeg_image_from_stream(&mem_stream, width, height, actual_comps, req_comps, flags);
+	}
+
+	// Decodes the JPEG file named pSrc_filename.
+	// Returns nullptr if the file cannot be opened; otherwise behaves like
+	// decompress_jpeg_image_from_stream() reading from that file.
+	unsigned char* decompress_jpeg_image_from_file(const char* pSrc_filename, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags)
+	{
+		jpgd::jpeg_decoder_file_stream file_stream;
+
+		const bool opened = file_stream.open(pSrc_filename);
+
+		return opened ? decompress_jpeg_image_from_stream(&file_stream, width, height, actual_comps, req_comps, flags) : nullptr;
+	}
+
+} // namespace jpgd
diff --git a/thirdparty/basis_universal/encoder/jpgd.h b/thirdparty/basis_universal/encoder/jpgd.h
new file mode 100644
index 0000000000..86a7814cae
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/jpgd.h
@@ -0,0 +1,347 @@
+// jpgd.h - C++ class for JPEG decompression.
+// Public domain, Rich Geldreich <richgel99@gmail.com>
+#ifndef JPEG_DECODER_H
+#define JPEG_DECODER_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <setjmp.h>
+#include <assert.h>
+#include <stdint.h>
+
+// JPGD_NORETURN marks a function as never returning, using the MSVC or GCC-style spelling;
+// on any other compiler it expands to nothing.
+#ifdef _MSC_VER
+#define JPGD_NORETURN __declspec(noreturn) 
+#elif defined(__GNUC__)
+#define JPGD_NORETURN __attribute__ ((noreturn))
+#else
+#define JPGD_NORETURN
+#endif
+
+// Element counts of the huff_tables::tree and huff_tables::code_size arrays.
+#define JPGD_HUFF_TREE_MAX_LENGTH 512
+#define JPGD_HUFF_CODE_SIZE_MAX_LENGTH 256
+
+namespace jpgd
+{
+	// Short integer type names used throughout the decoder.
+	typedef unsigned char  uint8;
+	typedef   signed short int16;
+	typedef unsigned short uint16;
+	typedef unsigned int   uint;
+	typedef   signed int   int32;
+
+	// Loads a JPEG image from a memory buffer or a file.
+	// req_comps can be 1 (grayscale), 3 (RGB), or 4 (RGBA).
+	// On return, width/height will be set to the image's dimensions, and actual_comps will be set to either 1 (grayscale) or 3 (RGB).
+	// Notes: For more control over where and how the source data is read, see the decompress_jpeg_image_from_stream() function below, or call the jpeg_decoder class directly.
+	// Requesting an 8 or 32bpp image is currently a little faster than 24bpp because the jpeg_decoder class itself currently always unpacks to either 8 or 32bpp.
+	unsigned char* decompress_jpeg_image_from_memory(const unsigned char* pSrc_data, int src_data_size, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
+	unsigned char* decompress_jpeg_image_from_file(const char* pSrc_filename, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
+
+	// Success/failure error codes.
+	// JPGD_SUCCESS, JPGD_FAILED and JPGD_DONE are the general return values. The remaining
+	// entries are specific error codes numbered consecutively upward from -256; the decoder's
+	// current code is available through jpeg_decoder::get_error_code().
+	enum jpgd_status
+	{
+		JPGD_SUCCESS = 0, JPGD_FAILED = -1, JPGD_DONE = 1,
+		JPGD_BAD_DHT_COUNTS = -256, JPGD_BAD_DHT_INDEX, JPGD_BAD_DHT_MARKER, JPGD_BAD_DQT_MARKER, JPGD_BAD_DQT_TABLE,
+		JPGD_BAD_PRECISION, JPGD_BAD_HEIGHT, JPGD_BAD_WIDTH, JPGD_TOO_MANY_COMPONENTS,
+		JPGD_BAD_SOF_LENGTH, JPGD_BAD_VARIABLE_MARKER, JPGD_BAD_DRI_LENGTH, JPGD_BAD_SOS_LENGTH,
+		JPGD_BAD_SOS_COMP_ID, JPGD_W_EXTRA_BYTES_BEFORE_MARKER, JPGD_NO_ARITHMITIC_SUPPORT, JPGD_UNEXPECTED_MARKER,
+		JPGD_NOT_JPEG, JPGD_UNSUPPORTED_MARKER, JPGD_BAD_DQT_LENGTH, JPGD_TOO_MANY_BLOCKS,
+		JPGD_UNDEFINED_QUANT_TABLE, JPGD_UNDEFINED_HUFF_TABLE, JPGD_NOT_SINGLE_SCAN, JPGD_UNSUPPORTED_COLORSPACE,
+		JPGD_UNSUPPORTED_SAMP_FACTORS, JPGD_DECODE_ERROR, JPGD_BAD_RESTART_MARKER,
+		JPGD_BAD_SOS_SPECTRAL, JPGD_BAD_SOS_SUCCESSIVE, JPGD_STREAM_READ, JPGD_NOTENOUGHMEM, JPGD_TOO_MANY_SCANS
+	};
+
+	// Input stream interface.
+	// Derive from this class to read input data from sources other than files or memory. Set *pEOF_flag to true in read() when no more data is available.
+	// The decoder is rather greedy: it will keep on calling this method until its internal input buffer is full, or until the EOF flag is set.
+	// If the input stream contains data after the JPEG stream's EOI (end of image) marker it will probably be pulled into the internal buffer.
+	// Call the get_total_bytes_read() method to determine the actual size of the JPEG stream after successful decoding.
+	class jpeg_decoder_stream
+	{
+	public:
+		jpeg_decoder_stream() { }
+		virtual ~jpeg_decoder_stream() { }
+
+		// The read() method is called when the internal input buffer is empty.
+		// Parameters:
+		// pBuf - input buffer
+		// max_bytes_to_read - maximum bytes that can be written to pBuf
+		// pEOF_flag - set this to true if at end of stream (no more bytes remaining)
+		// Returns -1 on error, otherwise return the number of bytes actually written to the buffer (which may be 0).
+		// Notes: This method will be called in a loop until you set *pEOF_flag to true or the internal buffer is full.
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag) = 0;
+	};
+
+	// stdio FILE stream class.
+	// Reads the compressed data from a file opened with open(); the file is closed by close()
+	// or by the destructor.
+	class jpeg_decoder_file_stream : public jpeg_decoder_stream
+	{
+		// Copy construction and assignment are declared private: the stream is not meant to be copied.
+		jpeg_decoder_file_stream(const jpeg_decoder_file_stream&);
+		jpeg_decoder_file_stream& operator =(const jpeg_decoder_file_stream&);
+
+		FILE* m_pFile;                 // currently open file, or null
+		bool m_eof_flag, m_error_flag; // set by read() on end of file / read error; cleared by close()
+
+	public:
+		jpeg_decoder_file_stream();
+		virtual ~jpeg_decoder_file_stream();
+
+		// Opens Pfilename for binary reading. Returns true on success.
+		bool open(const char* Pfilename);
+		// Closes the file if one is open and clears the status flags.
+		void close();
+
+		// See jpeg_decoder_stream::read().
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag);
+	};
+
+	// Memory stream class.
+	// Reads the compressed data from a caller-supplied buffer. Only the pointer is stored, so
+	// the buffer must stay valid for as long as the stream is being read.
+	class jpeg_decoder_mem_stream : public jpeg_decoder_stream
+	{
+		const uint8* m_pSrc_data; // start of the source buffer, or NULL when none is attached
+		uint m_ofs, m_size;       // current read offset and total size, in bytes
+
+	public:
+		jpeg_decoder_mem_stream() : m_pSrc_data(NULL), m_ofs(0), m_size(0) { }
+		jpeg_decoder_mem_stream(const uint8* pSrc_data, uint size) : m_pSrc_data(pSrc_data), m_ofs(0), m_size(size) { }
+
+		virtual ~jpeg_decoder_mem_stream() { }
+
+		// Attaches the stream to a new buffer and rewinds to its start. Always returns true.
+		bool open(const uint8* pSrc_data, uint size);
+		// Detaches the stream from its buffer.
+		void close() { m_pSrc_data = NULL; m_ofs = 0; m_size = 0; }
+
+		// See jpeg_decoder_stream::read().
+		virtual int read(uint8* pBuf, int max_bytes_to_read, bool* pEOF_flag);
+	};
+
+	// Loads JPEG file from a jpeg_decoder_stream.
+	// Parameters and return value are the same as for decompress_jpeg_image_from_memory() and
+	// decompress_jpeg_image_from_file().
+	unsigned char* decompress_jpeg_image_from_stream(jpeg_decoder_stream* pStream, int* width, int* height, int* actual_comps, int req_comps, uint32_t flags = 0);
+
+	// Decoder limits and array sizes.
+	enum
+	{
+		JPGD_IN_BUF_SIZE = 8192, JPGD_MAX_BLOCKS_PER_MCU = 10, JPGD_MAX_HUFF_TABLES = 8, JPGD_MAX_QUANT_TABLES = 4,
+		JPGD_MAX_COMPONENTS = 4, JPGD_MAX_COMPS_IN_SCAN = 4, JPGD_MAX_BLOCKS_PER_ROW = 16384, JPGD_MAX_HEIGHT = 32768, JPGD_MAX_WIDTH = 32768
+	};
+
+	// Element types of quantization tables and of coefficient blocks.
+	typedef int16 jpgd_quant_t;
+	typedef int16 jpgd_block_t;
+
+	// JPEG decoder.
+	// Typical use: construct it on a stream, check get_error_code(), call begin_decoding(), then
+	// call decode() once per scan line.
+	class jpeg_decoder
+	{
+	public:
+		enum
+		{
+			cFlagLinearChromaFiltering = 1
+		};
+
+		// Call get_error_code() after constructing to determine if the stream is valid or not. You may call the get_width(), get_height(), etc.
+		// methods after the constructor is called. You may then either destruct the object, or begin decoding the image by calling begin_decoding(), then decode() on each scanline.
+		jpeg_decoder(jpeg_decoder_stream* pStream, uint32_t flags = cFlagLinearChromaFiltering);
+
+		~jpeg_decoder();
+
+		// Call this method after constructing the object to begin decompression.
+		// If JPGD_SUCCESS is returned you may then call decode() on each scanline.
+
+		int begin_decoding();
+
+		// Returns the next scan line.
+		// For grayscale images, pScan_line will point to a buffer containing 8-bit pixels (get_bytes_per_pixel() will return 1).
+		// Otherwise, it will always point to a buffer containing 32-bit RGBA pixels (A will always be 255, and get_bytes_per_pixel() will return 4).
+		// Returns JPGD_SUCCESS if a scan line has been returned.
+		// Returns JPGD_DONE if all scan lines have been returned.
+		// Returns JPGD_FAILED if an error occurred. Call get_error_code() for more info.
+		int decode(const void** pScan_line, uint* pScan_line_len);
+
+		inline jpgd_status get_error_code() const { return m_error_code; }
+
+		inline int get_width() const { return m_image_x_size; }
+		inline int get_height() const { return m_image_y_size; }
+
+		inline int get_num_components() const { return m_comps_in_frame; }
+
+		inline int get_bytes_per_pixel() const { return m_dest_bytes_per_pixel; }
+		inline int get_bytes_per_scan_line() const { return m_image_x_size * get_bytes_per_pixel(); }
+
+		// Returns the total number of bytes actually consumed by the decoder (which should equal the actual size of the JPEG file).
+		inline int get_total_bytes_read() const { return m_total_bytes_read; }
+
+	private:
+		// Copy construction and assignment are declared private: the decoder is not meant to be copied.
+		jpeg_decoder(const jpeg_decoder&);
+		jpeg_decoder& operator =(const jpeg_decoder&);
+
+		// Signature shared by the static decode_block_* functions below:
+		// (decoder, component_id, block_x, block_y).
+		typedef void (*pDecode_block_func)(jpeg_decoder*, int, int, int);
+
+		struct huff_tables
+		{
+			bool ac_table;
+			uint  look_up[256];
+			uint  look_up2[256];
+			uint8 code_size[JPGD_HUFF_CODE_SIZE_MAX_LENGTH];
+			uint  tree[JPGD_HUFF_TREE_MAX_LENGTH];
+		};
+
+		// Coefficient storage for one component of a progressive image: block_num_x * block_num_y
+		// blocks of block_size bytes each, stored row by row in pData.
+		struct coeff_buf
+		{
+			uint8* pData;
+			int block_num_x, block_num_y;
+			int block_len_x, block_len_y;
+			int block_size;
+		};
+
+		// Node of the singly linked list of memory blocks headed by m_pMem_blocks.
+		struct mem_block
+		{
+			mem_block* m_pNext;
+			size_t m_used_count;
+			size_t m_size;
+			char m_data[1];
+		};
+
+		jmp_buf m_jmp_state;
+		uint32_t m_flags;
+		mem_block* m_pMem_blocks;
+		int m_image_x_size;
+		int m_image_y_size;
+		jpeg_decoder_stream* m_pStream;
+
+		int m_progressive_flag;
+
+		uint8 m_huff_ac[JPGD_MAX_HUFF_TABLES];
+		uint8* m_huff_num[JPGD_MAX_HUFF_TABLES];      // pointer to number of Huffman codes per bit size
+		uint8* m_huff_val[JPGD_MAX_HUFF_TABLES];      // pointer to Huffman codes per bit size
+		jpgd_quant_t* m_quant[JPGD_MAX_QUANT_TABLES]; // pointer to quantization tables
+		int m_scan_type;                              // Gray, Yh1v1, Yh1v2, Yh2v1, Yh2v2 (CMYK111, CMYK4114 no longer supported)
+		int m_comps_in_frame;                         // # of components in frame
+		int m_comp_h_samp[JPGD_MAX_COMPONENTS];       // component's horizontal sampling factor
+		int m_comp_v_samp[JPGD_MAX_COMPONENTS];       // component's vertical sampling factor
+		int m_comp_quant[JPGD_MAX_COMPONENTS];        // component's quantization table selector
+		int m_comp_ident[JPGD_MAX_COMPONENTS];        // component's ID
+		int m_comp_h_blocks[JPGD_MAX_COMPONENTS];
+		int m_comp_v_blocks[JPGD_MAX_COMPONENTS];
+		int m_comps_in_scan;                          // # of components in scan
+		int m_comp_list[JPGD_MAX_COMPS_IN_SCAN];      // components in this scan
+		int m_comp_dc_tab[JPGD_MAX_COMPONENTS];       // component's DC Huffman coding table selector
+		int m_comp_ac_tab[JPGD_MAX_COMPONENTS];       // component's AC Huffman coding table selector
+		int m_spectral_start;                         // spectral selection start
+		int m_spectral_end;                           // spectral selection end
+		int m_successive_low;                         // successive approximation low
+		int m_successive_high;                        // successive approximation high
+		int m_max_mcu_x_size;                         // MCU's max. X size in pixels
+		int m_max_mcu_y_size;                         // MCU's max. Y size in pixels
+		int m_blocks_per_mcu;
+		int m_max_blocks_per_row;
+		int m_mcus_per_row, m_mcus_per_col;
+		int m_mcu_org[JPGD_MAX_BLOCKS_PER_MCU];
+		int m_total_lines_left;                       // total # lines left in image
+		int m_mcu_lines_left;                         // total # lines left in this MCU
+		int m_num_buffered_scanlines;
+		int m_real_dest_bytes_per_scan_line;
+		int m_dest_bytes_per_scan_line;               // rounded up
+		int m_dest_bytes_per_pixel;                   // 4 (RGB) or 1 (Y)
+		huff_tables* m_pHuff_tabs[JPGD_MAX_HUFF_TABLES];
+		coeff_buf* m_dc_coeffs[JPGD_MAX_COMPONENTS];
+		coeff_buf* m_ac_coeffs[JPGD_MAX_COMPONENTS];
+		int m_eob_run;
+		int m_block_y_mcu[JPGD_MAX_COMPONENTS];
+		uint8* m_pIn_buf_ofs;
+		int m_in_buf_left;
+		int m_tem_flag;
+
+		uint8 m_in_buf_pad_start[64];
+		uint8 m_in_buf[JPGD_IN_BUF_SIZE + 128];
+		uint8 m_in_buf_pad_end[64];
+
+		int m_bits_left;
+		uint m_bit_buf;
+		int m_restart_interval;
+		int m_restarts_left;
+		int m_next_restart_num;
+		int m_max_mcus_per_row;
+		int m_max_blocks_per_mcu;
+
+		int m_max_mcus_per_col;
+		uint m_last_dc_val[JPGD_MAX_COMPONENTS];
+		jpgd_block_t* m_pMCU_coefficients;
+		int m_mcu_block_max_zag[JPGD_MAX_BLOCKS_PER_MCU];
+		uint8* m_pSample_buf;
+		uint8* m_pSample_buf_prev;
+		int m_crr[256];
+		int m_cbb[256];
+		int m_crg[256];
+		int m_cbg[256];
+		uint8* m_pScan_line_0;
+		uint8* m_pScan_line_1;
+		jpgd_status m_error_code;
+		int m_total_bytes_read;
+
+		bool m_ready_flag;
+		bool m_eof_flag;
+		bool m_sample_buf_prev_valid;
+
+		// Debug-build check (assert) that ofs lies inside the sample buffer; returns ofs unchanged.
+		inline int check_sample_buf_ofs(int ofs) const { assert(ofs >= 0); assert(ofs < m_max_blocks_per_row * 64); return ofs; }
+		void free_all_blocks();
+		JPGD_NORETURN void stop_decoding(jpgd_status status);
+		void* alloc(size_t n, bool zero = false);
+		void word_clear(void* p, uint16 c, uint n);
+		void prep_in_buffer();
+		void read_dht_marker();
+		void read_dqt_marker();
+		void read_sof_marker();
+		void skip_variable_marker();
+		void read_dri_marker();
+		void read_sos_marker();
+		int next_marker();
+		int process_markers();
+		void locate_soi_marker();
+		void locate_sof_marker();
+		int locate_sos_marker();
+		void init(jpeg_decoder_stream* pStream, uint32_t flags);
+		void create_look_ups();
+		void fix_in_buffer();
+		void transform_mcu(int mcu_row);
+		coeff_buf* coeff_buf_open(int block_num_x, int block_num_y, int block_len_x, int block_len_y);
+		inline jpgd_block_t* coeff_buf_getp(coeff_buf* cb, int block_x, int block_y);
+		void load_next_row();
+		void decode_next_row();
+		void make_huff_table(int index, huff_tables* pH);
+		void check_quant_tables();
+		void check_huff_tables();
+		bool calc_mcu_block_order();
+		int init_scan();
+		void init_frame();
+		void process_restart();
+		void decode_scan(pDecode_block_func decode_block_func);
+		void init_progressive();
+		void init_sequential();
+		void decode_start();
+		void decode_init(jpeg_decoder_stream* pStream, uint32_t flags);
+		void H2V2Convert();
+		uint32_t H2V2ConvertFiltered();
+		void H2V1Convert();
+		void H2V1ConvertFiltered();
+		void H1V2Convert();
+		void H1V2ConvertFiltered();
+		void H1V1Convert();
+		void gray_convert();
+		void find_eoi();
+		inline uint get_char();
+		inline uint get_char(bool* pPadding_flag);
+		inline void stuff_char(uint8 q);
+		inline uint8 get_octet();
+		inline uint get_bits(int num_bits);
+		inline uint get_bits_no_markers(int numbits);
+		inline int huff_decode(huff_tables* pH);
+		inline int huff_decode(huff_tables* pH, int& extrabits);
+
+		// Clamps a value between 0-255.
+		// Values outside that range become 0 when negative and 255 when too large, selected by
+		// the sign bit of ~i.
+		static inline uint8 clamp(int i)
+		{
+			if (static_cast<uint>(i) > 255)
+				i = (((~i) >> 31) & 0xFF);
+			return static_cast<uint8>(i);
+		}
+		int decode_next_mcu_row();
+
+		// Per-block decoders for the four kinds of progressive scan; passed to decode_scan().
+		static void decode_block_dc_first(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_dc_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_ac_first(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+		static void decode_block_ac_refine(jpeg_decoder* pD, int component_id, int block_x, int block_y);
+	};
+
+} // namespace jpgd
+
+#endif // JPEG_DECODER_H
diff --git a/thirdparty/basis_universal/encoder/lodepng.cpp b/thirdparty/basis_universal/encoder/lodepng.cpp
new file mode 100644
index 0000000000..63adcf49b6
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/lodepng.cpp
@@ -0,0 +1,6008 @@
+/*
+LodePNG version 20190210
+
+Copyright (c) 2005-2019 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+/*
+The manual and changelog are in the header file "lodepng.h"
+Rename this file to lodepng.cpp to use it for C++, or to lodepng.c to use it for C.
+*/
+
+#ifdef _MSC_VER
+#define _CRT_SECURE_NO_DEPRECATE
+#pragma warning (disable : 4201)
+
+#ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL
+#if defined(_DEBUG) || defined(DEBUG)
+#define _ITERATOR_DEBUG_LEVEL 1
+#define _SECURE_SCL 1
+#else
+#define _SECURE_SCL 0
+#define _ITERATOR_DEBUG_LEVEL 0
+#endif
+#endif
+#endif
+
+#include "lodepng.h"
+
+#include <limits.h> /* LONG_MAX */
+#include <stdio.h> /* file handling */
+#include <stdlib.h> /* allocations */
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1310) /*Visual Studio: A few warning types are not desired here.*/
+#pragma warning( disable : 4244 ) /*implicit conversions: not warned by gcc -Wall -Wextra and requires too much casts*/
+#pragma warning( disable : 4996 ) /*VS does not like fopen, but fopen_s is not standard C so unusable here*/
+#endif /*_MSC_VER */
+
+const char* LODEPNG_VERSION_STRING = "20190210";
+
+/*
+This source file is built up in the following large parts. The code sections
+with the "LODEPNG_COMPILE_" #defines divide this up further in an intermixed way.
+-Tools for C and common code for PNG and Zlib
+-C Code for Zlib (huffman, deflate, ...)
+-C Code for PNG (file format chunks, adam7, PNG filters, color conversions, ...)
+-The C++ wrapper around all of the above
+*/
+
+/*The malloc, realloc and free functions defined here with "lodepng_" in front
+of the name, so that you can easily change them to others related to your
+platform if needed. Everything else in the code calls these. Pass
+-DLODEPNG_NO_COMPILE_ALLOCATORS to the compiler, or comment out
+#define LODEPNG_COMPILE_ALLOCATORS in the header, to disable the ones here and
+define them in your own project's source files without needing to change
+lodepng source code. Don't forget to remove "static" if you copypaste them
+from here.*/
+
+#ifdef LODEPNG_COMPILE_ALLOCATORS
+static void* lodepng_malloc(size_t size) {
+  void* block = NULL;
+#ifdef LODEPNG_MAX_ALLOC
+  /*requests above the configured limit are refused without allocating*/
+  if(size > LODEPNG_MAX_ALLOC) return block;
+#endif
+  block = malloc(size);
+  return block;
+}
+
+static void* lodepng_realloc(void* ptr, size_t new_size) {
+  void* resized = NULL;
+#ifdef LODEPNG_MAX_ALLOC
+  /*over the configured limit: report failure, ptr is left untouched*/
+  if(new_size > LODEPNG_MAX_ALLOC) return resized;
+#endif
+  resized = realloc(ptr, new_size);
+  return resized;
+}
+
+/*Releases memory through the C runtime; counterpart of lodepng_malloc and
+lodepng_realloc so all three can be replaced together.*/
+static void lodepng_free(void* ptr) {
+  free(ptr);
+}
+#else /*LODEPNG_COMPILE_ALLOCATORS*/
+void* lodepng_malloc(size_t size);
+void* lodepng_realloc(void* ptr, size_t new_size);
+void lodepng_free(void* ptr);
+#endif /*LODEPNG_COMPILE_ALLOCATORS*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // Tools for C, and common code for PNG and Zlib.                       // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#define LODEPNG_MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define LODEPNG_MIN(a, b) (((a) < (b)) ? (a) : (b))
+
+/*
+Often in case of an error a value is assigned to a variable and then it breaks
+out of a loop (to go to the cleanup phase of a function). This macro does that.
+It makes the error handling code shorter and more readable.
+
+Example: if(!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83);
+*/
+#define CERROR_BREAK(errorvar, code){\
+  errorvar = code;\
+  break;\
+}
+
+/*version of CERROR_BREAK that assumes the common case where the error variable is named "error"*/
+#define ERROR_BREAK(code) CERROR_BREAK(error, code)
+
+/*Set error var to the error code, and return it.*/
+#define CERROR_RETURN_ERROR(errorvar, code){\
+  errorvar = code;\
+  return code;\
+}
+
+/*Try the code, if it returns error, also return the error.*/
+#define CERROR_TRY_RETURN(call){\
+  unsigned error = call;\
+  if(error) return error;\
+}
+
+/*Set error var to the error code, and return from the void function.*/
+#define CERROR_RETURN(errorvar, code){\
+  errorvar = code;\
+  return;\
+}
+
+/*
+About uivector, ucvector and string:
+-All of them wrap dynamic arrays or text strings in a similar way.
+-LodePNG was originally written in C++. The vectors replace the std::vectors that were used in the C++ version.
+-The string tools are made to avoid problems with compilers that declare things like strncat as deprecated.
+-They're not used in the interface, only internally in this file as static functions.
+-As with many other structs in this file, the init and cleanup functions serve as ctor and dtor.
+*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*dynamic vector of unsigned ints*/
+typedef struct uivector {
+  unsigned* data; /*heap buffer holding the elements; NULL after uivector_init or uivector_cleanup*/
+  size_t size; /*size in number of unsigned ints (elements in use)*/
+  size_t allocsize; /*allocated size in bytes*/
+} uivector;
+
+/*Frees the storage of the uivector that p points to and leaves it empty.*/
+static void uivector_cleanup(void* p) {
+  uivector* vec = (uivector*)p;
+  vec->size = 0;
+  vec->allocsize = 0;
+  lodepng_free(vec->data);
+  vec->data = NULL;
+}
+
+/*Makes sure at least allocsize bytes are allocated.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_reserve(uivector* p, size_t allocsize) {
+  size_t grown;
+  void* block;
+
+  if(allocsize <= p->allocsize) return 1; /*already large enough*/
+
+  /*more than doubling: take exactly what is asked; otherwise over-allocate by half*/
+  if(allocsize > p->allocsize * 2) grown = allocsize;
+  else grown = allocsize * 3 / 2;
+
+  block = lodepng_realloc(p->data, grown);
+  if(!block) return 0; /*error: not enough memory*/
+
+  p->data = (unsigned*)block;
+  p->allocsize = grown;
+  return 1;
+}
+
+/*Sets the number of elements to size, growing the buffer when needed.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_resize(uivector* p, size_t size) {
+  size_t bytes = size * sizeof(unsigned);
+  if(uivector_reserve(p, bytes)) {
+    p->size = size;
+    return 1; /*success*/
+  }
+  return 0;
+}
+
+/*resize and give all new elements the value. Returns 1 on success, 0 on failure.*/
+static unsigned uivector_resizev(uivector* p, size_t size, unsigned value) {
+  size_t pos = p->size; /*first index that is new after the resize*/
+  if(!uivector_resize(p, size)) return 0;
+  while(pos < size) {
+    p->data[pos] = value;
+    ++pos;
+  }
+  return 1;
+}
+
+/*Puts the vector in its empty state: no buffer, no elements.*/
+static void uivector_init(uivector* p) {
+  p->allocsize = 0;
+  p->size = 0;
+  p->data = NULL;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends c at the end of the vector. returns 1 if success, 0 if failure*/
+static unsigned uivector_push_back(uivector* p, unsigned c) {
+  size_t grown = p->size + 1;
+  if(!uivector_resize(p, grown)) return 0;
+  if(!p->data) return 0;
+  p->data[grown - 1] = c;
+  return 1;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*dynamic vector of unsigned chars*/
+typedef struct ucvector {
+  unsigned char* data; /*byte buffer; set to NULL by ucvector_init and ucvector_cleanup*/
+  size_t size; /*used size*/
+  size_t allocsize; /*allocated size*/
+} ucvector;
+
+/*Makes sure at least allocsize bytes are allocated.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_reserve(ucvector* p, size_t allocsize) {
+  size_t grown;
+  void* block;
+
+  if(allocsize <= p->allocsize) return 1; /*already large enough*/
+
+  /*more than doubling: take exactly what is asked; otherwise over-allocate by half*/
+  if(allocsize > p->allocsize * 2) grown = allocsize;
+  else grown = allocsize * 3 / 2;
+
+  block = lodepng_realloc(p->data, grown);
+  if(!block) return 0; /*error: not enough memory*/
+
+  p->data = (unsigned char*)block;
+  p->allocsize = grown;
+  return 1;
+}
+
+/*Sets the used size to size bytes, growing the buffer when needed.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_resize(ucvector* p, size_t size) {
+  size_t bytes = size * sizeof(unsigned char);
+  if(ucvector_reserve(p, bytes)) {
+    p->size = size;
+    return 1; /*success*/
+  }
+  return 0;
+}
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/*Frees the storage of the ucvector that p points to and leaves it empty.*/
+static void ucvector_cleanup(void* p) {
+  ucvector* vec = (ucvector*)p;
+  vec->size = 0;
+  vec->allocsize = 0;
+  lodepng_free(vec->data);
+  vec->data = NULL;
+}
+
+/*Puts the vector in its empty state: no buffer, no bytes.*/
+static void ucvector_init(ucvector* p) {
+  p->allocsize = 0;
+  p->size = 0;
+  p->data = NULL;
+}
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*you can both convert from vector to buffer&size and vice versa. If you use
+init_buffer to take over a buffer and size, it is not needed to use cleanup*/
+static void ucvector_init_buffer(ucvector* p, unsigned char* buffer, size_t size) {
+  p->size = size;
+  p->allocsize = size;
+  p->data = buffer;
+}
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#if (defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)
+/*Appends the byte c at the end of the vector.
+returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_push_back(ucvector* p, unsigned char c) {
+  size_t grown = p->size + 1;
+  if(!ucvector_resize(p, grown)) return 0;
+  p->data[grown - 1] = c;
+  return 1;
+}
+#endif /*(defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*free string pointer and set it to NULL*/
+static void string_cleanup(char** out) {
+  char* text = *out;
+  *out = NULL;
+  lodepng_free(text);
+}
+
+/* dynamically allocates a new string with a copy of the null terminated input text.
+Returns NULL when the allocation fails. */
+static char* alloc_string(const char* in) {
+  size_t length = strlen(in);
+  char* copy = (char*)lodepng_malloc(length + 1);
+  if(!copy) return copy;
+  memcpy(copy, in, length);
+  copy[length] = 0; /*terminator*/
+  return copy;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Reads a 32-bit unsigned integer stored most significant byte first.
+buffer must have at least 4 readable bytes.
+Each byte is converted to unsigned before shifting: a byte >= 0x80 that is
+shifted left by 24 as a promoted (signed) int would overflow the int.
+*/
+unsigned lodepng_read32bitInt(const unsigned char* buffer) {
+  return ((unsigned)buffer[0] << 24) | ((unsigned)buffer[1] << 16) |
+         ((unsigned)buffer[2] << 8) | (unsigned)buffer[3];
+}
+
+#if defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)
+/*Stores value most significant byte first.
+buffer must have at least 4 allocated bytes available*/
+static void lodepng_set32bitInt(unsigned char* buffer, unsigned value) {
+  unsigned shift = 32;
+  unsigned i;
+  for(i = 0; i != 4; ++i) {
+    shift -= 8; /*24, 16, 8, 0*/
+    buffer[i] = (unsigned char)((value >> shift) & 0xff);
+  }
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends value as 4 bytes, most significant first, to the end of buffer.
+The function cannot report failure, so when growing the vector fails it leaves
+the buffer unchanged instead of writing over existing bytes or through NULL.*/
+static void lodepng_add32bitInt(ucvector* buffer, unsigned value) {
+  if(!ucvector_resize(buffer, buffer->size + 4)) return; /*out of memory: nothing appended*/
+  lodepng_set32bitInt(&buffer->data[buffer->size - 4], value);
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / File IO                                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DISK
+
+/* Returns the size of the file as reported by ftell after seeking to its end,
+or a negative value on error. This should be pure C compatible, so no fstat. */
+static long lodepng_filesize(const char* filename) {
+  long size = -1;
+  FILE* file = fopen(filename, "rb");
+  if(!file) return -1;
+
+  if(fseek(file, 0, SEEK_END) == 0) {
+    size = ftell(file);
+    /* It may give LONG_MAX as directory size, this is invalid for us. */
+    if(size == LONG_MAX) size = -1;
+  }
+
+  fclose(file);
+  return size;
+}
+
+/* load file into buffer that already has the correct allocated size. Returns
+0 on success, or error code 78 when the file cannot be opened or does not
+deliver exactly size bytes.*/
+static unsigned lodepng_buffer_file(unsigned char* out, size_t size, const char* filename) {
+  size_t got;
+  FILE* file = fopen(filename, "rb");
+  if(!file) return 78;
+
+  got = fread(out, 1, size, file);
+  fclose(file);
+
+  return (got == size) ? 0 : 78;
+}
+
+/*Loads the whole file into a buffer allocated with lodepng_malloc. The buffer
+is stored in *out and its length in *outsize. Returns 0 on success, 78 when the
+file cannot be read, 83 when the allocation failed.*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename) {
+  size_t bytes;
+  long size = lodepng_filesize(filename);
+  if(size < 0) return 78;
+
+  bytes = (size_t)size;
+  *outsize = bytes;
+  *out = (unsigned char*)lodepng_malloc(bytes);
+  if(size > 0 && !(*out)) return 83; /*the above malloc failed*/
+
+  return lodepng_buffer_file(*out, bytes, filename);
+}
+
+/*write given buffer to the file, overwriting the file, it doesn't append to it.
+Returns 0 on success, or error code 79 when the file cannot be opened for
+writing or not all of the data could be written out.*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename) {
+  size_t written;
+  int closefail;
+  FILE* file = fopen(filename, "wb" );
+  if(!file) return 79;
+  written = fwrite(buffer, 1, buffersize, file);
+  /*fclose flushes buffered output, so a write error may only show up here*/
+  closefail = fclose(file);
+  if(written != buffersize || closefail != 0) return 79;
+  return 0;
+}
+
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of common code and tools. Begin of Zlib related code.            // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Appends one bit to the bitstream. *bitpointer counts the bits written so far:
+whenever it is a multiple of 8 a new zero byte is pushed first, then the bit is
+OR-ed into the last byte at position (*bitpointer & 7) and the count advances.
+The callers in this file pass 0 or 1 for bit.
+TODO: this ignores potential out of memory errors*/
+#define addBitToStream(/*size_t**/ bitpointer, /*ucvector**/ bitstream, /*unsigned char*/ bit){\
+  /*add a new byte at the end*/\
+  if(((*bitpointer) & 7) == 0) ucvector_push_back(bitstream, (unsigned char)0);\
+  /*earlier bit of huffman code is in a lesser significant bit of an earlier byte*/\
+  (bitstream->data[bitstream->size - 1]) |= (bit << ((*bitpointer) & 0x7));\
+  ++(*bitpointer);\
+}
+
+/*Appends the lowest nbits bits of value to the stream, least significant bit first.*/
+static void addBitsToStream(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits) {
+  size_t done = 0;
+  while(done != nbits) {
+    addBitToStream(bitpointer, bitstream, (unsigned char)((value >> done) & 1));
+    ++done;
+  }
+}
+
+/*Appends the lowest nbits bits of value to the stream, starting with the most
+significant of those bits.*/
+static void addBitsToStreamReversed(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits) {
+  size_t left = nbits;
+  while(left != 0) {
+    --left;
+    addBitToStream(bitpointer, bitstream, (unsigned char)((value >> left) & 1));
+  }
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*Evaluates to the bit (0 or 1) at bit index bitpointer of the byte array
+bitstream; within each byte, index 0 is the least significant bit.*/
+#define READBIT(bitpointer, bitstream) ((bitstream[bitpointer >> 3] >> (bitpointer & 0x7)) & (unsigned char)1)
+
+/*Returns the bit at position *bitpointer and advances the position by one.*/
+static unsigned char readBitFromStream(size_t* bitpointer, const unsigned char* bitstream) {
+  size_t pos = *bitpointer;
+  *bitpointer = pos + 1;
+  return (unsigned char)(READBIT(pos, bitstream));
+}
+
+/*Reads nbits bits starting at *bitpointer; the first bit read becomes the
+least significant bit of the result. *bitpointer is advanced past them.*/
+static unsigned readBitsFromStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits) {
+  unsigned value = 0;
+  unsigned shift;
+  for(shift = 0; shift != nbits; ++shift) {
+    size_t pos = *bitpointer;
+    value += ((unsigned)READBIT(pos, bitstream)) << shift;
+    *bitpointer = pos + 1;
+  }
+  return value;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflate - Huffman                                                      / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#define FIRST_LENGTH_CODE_INDEX 257
+#define LAST_LENGTH_CODE_INDEX 285
+/*256 literals, the end code, some length codes, and 2 unused codes*/
+#define NUM_DEFLATE_CODE_SYMBOLS 288
+/*the distance codes have their own symbols, 30 used, 2 unused*/
+#define NUM_DISTANCE_SYMBOLS 32
+/*the code length codes. 0-15: code lengths, 16: copy previous 3-6 times, 17: 3-10 zeros, 18: 11-138 zeros*/
+#define NUM_CODE_LENGTH_CODES 19
+
+/*the base lengths represented by codes 257-285*/
+static const unsigned LENGTHBASE[29]
+  = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59,
+     67, 83, 99, 115, 131, 163, 195, 227, 258};
+
+/*the extra bits used by codes 257-285 (added to base length)*/
+static const unsigned LENGTHEXTRA[29]
+  = {0, 0, 0, 0, 0, 0, 0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+      4,  4,  4,   4,   5,   5,   5,   5,   0};
+
+/*the base backwards distances (the bits of distance codes appear after length codes and use their own huffman tree)*/
+static const unsigned DISTANCEBASE[30]
+  = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
+     769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577};
+
+/*the extra bits of backwards distances (added to base)*/
+static const unsigned DISTANCEEXTRA[30]
+  = {0, 0, 0, 0, 1, 1, 2,  2,  3,  3,  4,  4,  5,  5,   6,   6,   7,   7,   8,
+       8,    9,    9,   10,   10,   11,   11,   12,    12,    13,    13};
+
+/*the order in which "code length alphabet code lengths" are stored, out of this
+the huffman tree of the dynamic huffman tree lengths is generated*/
+static const unsigned CLCL_ORDER[NUM_CODE_LENGTH_CODES]
+  = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Huffman tree struct, containing multiple representations of the tree
+*/
+typedef struct HuffmanTree {
+  unsigned* tree2d; /*decoder table with 2 entries (bit 0 / bit 1) per node, filled by HuffmanTree_make2DTree*/
+  unsigned* tree1d; /*the code of each symbol, filled by HuffmanTree_makeFromLengths2*/
+  unsigned* lengths; /*the lengths of the codes of the 1d-tree*/
+  unsigned maxbitlen; /*maximum number of bits a single code can get*/
+  unsigned numcodes; /*number of symbols in the alphabet = number of codes*/
+} HuffmanTree;
+
+/*function used for debug purposes to draw the tree in ascii art with C++*/
+/*
+static void HuffmanTree_draw(HuffmanTree* tree) {
+  std::cout << "tree. length: " << tree->numcodes << " maxbitlen: " << tree->maxbitlen << std::endl;
+  for(size_t i = 0; i != tree->tree1d.size; ++i) {
+    if(tree->lengths.data[i])
+      std::cout << i << " " << tree->tree1d.data[i] << " " << tree->lengths.data[i] << std::endl;
+  }
+  std::cout << std::endl;
+}*/
+
+/*Clears the three array pointers of the tree; maxbitlen and numcodes are not touched.*/
+static void HuffmanTree_init(HuffmanTree* tree) {
+  tree->lengths = 0;
+  tree->tree1d = 0;
+  tree->tree2d = 0;
+}
+
+/*Frees the three arrays of the tree. The pointers are not reset afterwards.*/
+static void HuffmanTree_cleanup(HuffmanTree* tree) {
+  lodepng_free(tree->lengths);
+  lodepng_free(tree->tree1d);
+  lodepng_free(tree->tree2d);
+}
+
+/*
+the tree representation used by the decoder. Builds tree->tree2d from the codes
+in tree->tree1d and their bit lengths in tree->lengths. return value is error:
+0 = success, 83 = allocation failure, 55 = the codes do not fit in the tree.
+When 55 is returned the partially filled tree2d stays attached to the tree.
+*/
+static unsigned HuffmanTree_make2DTree(HuffmanTree* tree) {
+  unsigned nodefilled = 0; /*up to which node it is filled*/
+  unsigned treepos = 0; /*position in the tree (1 of the numcodes columns)*/
+  unsigned n, i;
+
+  tree->tree2d = (unsigned*)lodepng_malloc(tree->numcodes * 2 * sizeof(unsigned));
+  if(!tree->tree2d) return 83; /*alloc fail*/
+
+  /*
+  convert tree1d[] to tree2d[][]. In the 2D array, a value of 32767 means
+  uninited, a value >= numcodes is an address to another bit, a value < numcodes
+  is a code. The 2 rows are the 2 possible bit values (0 or 1), there are as
+  many columns as codes - 1.
+  A good huffman tree has N * 2 - 1 nodes, of which N - 1 are internal nodes.
+  Here, the internal nodes are stored (what their 0 and 1 option point to).
+  There is only memory for such good tree currently, if there are more nodes
+  (due to too long length codes), error 55 will happen
+  */
+  for(n = 0; n < tree->numcodes * 2; ++n) {
+    tree->tree2d[n] = 32767; /*32767 here means the tree2d isn't filled there yet*/
+  }
+
+  /*insert every code, walking from its most significant bit down to its last bit*/
+  for(n = 0; n < tree->numcodes; ++n) /*the codes*/ {
+    for(i = 0; i != tree->lengths[n]; ++i) /*the bits for this code*/ {
+      unsigned char bit = (unsigned char)((tree->tree1d[n] >> (tree->lengths[n] - i - 1)) & 1);
+      /*oversubscribed, see comment in lodepng_error_text*/
+      if(treepos > 2147483647 || treepos + 2 > tree->numcodes) return 55;
+      if(tree->tree2d[2 * treepos + bit] == 32767) /*not yet filled in*/ {
+        if(i + 1 == tree->lengths[n]) /*last bit*/ {
+          tree->tree2d[2 * treepos + bit] = n; /*put the current code in it*/
+          treepos = 0; /*back to the root for the next code*/
+        } else {
+          /*put address of the next step in here, first that address has to be found of course
+          (it's just nodefilled + 1)...*/
+          ++nodefilled;
+          /*addresses encoded with numcodes added to it*/
+          tree->tree2d[2 * treepos + bit] = nodefilled + tree->numcodes;
+          treepos = nodefilled;
+        }
+      }
+      /*already filled in: follow the stored address (numcodes is subtracted to decode it)*/
+      else treepos = tree->tree2d[2 * treepos + bit] - tree->numcodes;
+    }
+  }
+
+  for(n = 0; n < tree->numcodes * 2; ++n) {
+    if(tree->tree2d[n] == 32767) tree->tree2d[n] = 0; /*remove possible remaining 32767's*/
+  }
+
+  return 0;
+}
+
+/*
+Second step for the ...makeFromLengths and ...makeFromFrequencies functions.
+numcodes, lengths and maxbitlen must already be filled in correctly. return
+value is error.
+Allocates tree->tree1d, assigns each symbol with a non-zero length its code
+from the per-length counters, then builds tree2d with HuffmanTree_make2DTree.
+NOTE(review): blcount and nextcode hold maxbitlen + 1 entries and are indexed by
+tree->lengths[...], so every length is presumably required to be <= maxbitlen.
+*/
+static unsigned HuffmanTree_makeFromLengths2(HuffmanTree* tree) {
+  uivector blcount; /*number of codes of each bit length*/
+  uivector nextcode; /*next code value to hand out for each bit length*/
+  unsigned error = 0;
+  unsigned bits, n;
+
+  uivector_init(&blcount);
+  uivector_init(&nextcode);
+
+  tree->tree1d = (unsigned*)lodepng_malloc(tree->numcodes * sizeof(unsigned));
+  if(!tree->tree1d) error = 83; /*alloc fail*/
+
+  if(!uivector_resizev(&blcount, tree->maxbitlen + 1, 0)
+  || !uivector_resizev(&nextcode, tree->maxbitlen + 1, 0))
+    error = 83; /*alloc fail*/
+
+  if(!error) {
+    /*step 1: count number of instances of each code length*/
+    for(bits = 0; bits != tree->numcodes; ++bits) ++blcount.data[tree->lengths[bits]];
+    /*step 2: generate the nextcode values*/
+    for(bits = 1; bits <= tree->maxbitlen; ++bits) {
+      nextcode.data[bits] = (nextcode.data[bits - 1] + blcount.data[bits - 1]) << 1;
+    }
+    /*step 3: generate all the codes*/
+    for(n = 0; n != tree->numcodes; ++n) {
+      if(tree->lengths[n] != 0) tree->tree1d[n] = nextcode.data[tree->lengths[n]]++;
+    }
+  }
+
+  /*the temporary counters are released on both the success and the error path*/
+  uivector_cleanup(&blcount);
+  uivector_cleanup(&nextcode);
+
+  if(!error) return HuffmanTree_make2DTree(tree);
+  else return error;
+}
+
+/*
+given the code lengths (as stored in the PNG file), generate the tree as defined
+by Deflate. maxbitlen is the maximum bits that a code in the tree can have.
+The lengths in bitlen are copied into a new array stored in the tree.
+return value is error.
+*/
+static unsigned HuffmanTree_makeFromLengths(HuffmanTree* tree, const unsigned* bitlen,
+                                            size_t numcodes, unsigned maxbitlen) {
+  unsigned pos = 0;
+  unsigned* copy = (unsigned*)lodepng_malloc(numcodes * sizeof(unsigned));
+  tree->lengths = copy;
+  if(!copy) return 83; /*alloc fail*/
+  while(pos != numcodes) {
+    copy[pos] = bitlen[pos];
+    ++pos;
+  }
+  tree->maxbitlen = maxbitlen;
+  tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+  return HuffmanTree_makeFromLengths2(tree);
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*BPM: Boundary Package Merge, see "A Fast and Space-Economical Algorithm for Length-Limited Coding",
+Jyrki Katajainen, Alistair Moffat, Andrew Turpin, 1995.*/
+
+/*chain node for boundary package merge*/
+typedef struct BPMNode {
+  int weight; /*the sum of all weights in this chain*/
+  unsigned index; /*index of this leaf node (called "count" in the paper)*/
+  struct BPMNode* tail; /*the next nodes in this chain (null if last)*/
+  int in_use; /*mark set by the garbage collection pass in bpmnode_create for nodes still reachable from a chain*/
+} BPMNode;
+
+/*lists of chains*/
+typedef struct BPMLists {
+  /*memory pool*/
+  unsigned memsize; /*number of nodes in memory*/
+  BPMNode* memory; /*the pool of nodes itself*/
+  unsigned numfree; /*number of valid entries in freelist*/
+  unsigned nextfree; /*index in freelist of the next node to hand out*/
+  BPMNode** freelist; /*pointers to the pool nodes that may be (re)used*/
+  /*two heads of lookahead chains per list*/
+  unsigned listsize; /*number of lists, i.e. entries in chains0 and chains1*/
+  BPMNode** chains0;
+  BPMNode** chains1;
+} BPMLists;
+
+/*creates a new chain node with the given parameters, from the memory in the lists.
+When every entry of the free list has been handed out, the pool is garbage
+collected first: nodes reachable from any chains0/chains1 head are marked in
+use and all other nodes are put back on the free list.
+NOTE(review): assumes the collection always frees at least one node - confirm
+against the pool size chosen by the caller.*/
+static BPMNode* bpmnode_create(BPMLists* lists, int weight, unsigned index, BPMNode* tail) {
+  unsigned i;
+  BPMNode* result;
+
+  /*memory full, so garbage collect*/
+  if(lists->nextfree >= lists->numfree) {
+    /*mark only those that are in use*/
+    for(i = 0; i != lists->memsize; ++i) lists->memory[i].in_use = 0;
+    for(i = 0; i != lists->listsize; ++i) {
+      BPMNode* node;
+      for(node = lists->chains0[i]; node != 0; node = node->tail) node->in_use = 1;
+      for(node = lists->chains1[i]; node != 0; node = node->tail) node->in_use = 1;
+    }
+    /*collect those that are free*/
+    lists->numfree = 0;
+    for(i = 0; i != lists->memsize; ++i) {
+      if(!lists->memory[i].in_use) lists->freelist[lists->numfree++] = &lists->memory[i];
+    }
+    lists->nextfree = 0;
+  }
+
+  /*take the next free node and fill it in; in_use is only meaningful during collection*/
+  result = lists->freelist[lists->nextfree++];
+  result->weight = weight;
+  result->index = index;
+  result->tail = tail;
+  return result;
+}
+
+/*sort the leaves by ascending weight with stable mergesort.
+If the scratch buffer needed for merging cannot be allocated, a stable in-place
+insertion sort is used instead, which gives the same ordering without
+dereferencing a NULL buffer.*/
+static void bpmnode_sort(BPMNode* leaves, size_t num) {
+  BPMNode* mem = (BPMNode*)lodepng_malloc(sizeof(*leaves) * num);
+  size_t width, counter = 0;
+  if(!mem) {
+    size_t i;
+    for(i = 1; i < num; ++i) {
+      BPMNode moving = leaves[i];
+      size_t j = i;
+      /*strict '>' keeps leaves of equal weight in their original order (stable)*/
+      while(j > 0 && leaves[j - 1].weight > moving.weight) {
+        leaves[j] = leaves[j - 1];
+        --j;
+      }
+      leaves[j] = moving;
+    }
+    return;
+  }
+  /*bottom-up merge: runs of length width are merged pairwise, alternating
+  between leaves and mem as source and destination*/
+  for(width = 1; width < num; width *= 2) {
+    BPMNode* a = (counter & 1) ? mem : leaves;
+    BPMNode* b = (counter & 1) ? leaves : mem;
+    size_t p;
+    for(p = 0; p < num; p += 2 * width) {
+      size_t q = (p + width > num) ? num : (p + width);
+      size_t r = (p + 2 * width > num) ? num : (p + 2 * width);
+      size_t i = p, j = q, k;
+      for(k = p; k < r; k++) {
+        /*'<=' takes from the left run on ties, which makes the sort stable*/
+        if(i < q && (j >= r || a[i].weight <= a[j].weight)) b[k] = a[i++];
+        else b[k] = a[j++];
+      }
+    }
+    counter++;
+  }
+  /*after an odd number of passes the sorted data sits in mem*/
+  if(counter & 1) memcpy(leaves, mem, sizeof(*leaves) * num);
+  lodepng_free(mem);
+}
+
+/*Boundary Package Merge step, numpresent is the amount of leaves, and c is the current chain.
+Advances list c by one chain: the previous chains1[c] becomes chains0[c] and a
+new head is created for chains1[c], either from the next leaf or as a package
+of the two heads of list c - 1. num is the counter of the caller's loop and is
+only used to skip the recursion once it is no longer needed.*/
+static void boundaryPM(BPMLists* lists, BPMNode* leaves, size_t numpresent, int c, int num) {
+  unsigned lastindex = lists->chains1[c]->index;
+
+  if(c == 0) {
+    /*the first list only ever takes leaves; stop when they are used up*/
+    if(lastindex >= numpresent) return;
+    lists->chains0[c] = lists->chains1[c];
+    lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, 0);
+  } else {
+    /*sum of the weights of the head nodes of the previous lookahead chains.*/
+    int sum = lists->chains0[c - 1]->weight + lists->chains1[c - 1]->weight;
+    lists->chains0[c] = lists->chains1[c];
+    /*the next leaf is lighter than the package: take the leaf*/
+    if(lastindex < numpresent && sum > leaves[lastindex].weight) {
+      lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, lists->chains1[c]->tail);
+      return;
+    }
+    /*otherwise take the package, whose tail is the head of the previous list*/
+    lists->chains1[c] = bpmnode_create(lists, sum, lastindex, lists->chains1[c - 1]);
+    /*in the end we are only interested in the chain of the last list, so no
+    need to recurse if we're at the last one (this gives measurable speedup)*/
+    if(num + 1 < (int)(2 * numpresent - 2)) {
+      boundaryPM(lists, leaves, numpresent, c - 1, num);
+      boundaryPM(lists, leaves, numpresent, c - 1, num);
+    }
+  }
+}
+
+/*
+Computes Huffman code lengths, limited to maxbitlen bits, for the given symbol
+frequencies using boundary package merge. lengths and frequencies both hold
+numcodes entries. Symbols with frequency 0 get length 0, except where a symbol
+is given length 1 to make sure at least two symbols are present.
+Returns 0 on success, 80 for invalid parameters, 83 on allocation failure.
+NOTE(review): the numpresent <= 1 paths write lengths[1], so this looks like it
+expects numcodes >= 2 - confirm against callers.
+NOTE(review): (1u << maxbitlen) presumably requires maxbitlen to be smaller than
+the number of bits in unsigned.
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+                                      size_t numcodes, unsigned maxbitlen) {
+  unsigned error = 0;
+  unsigned i;
+  size_t numpresent = 0; /*number of symbols with non-zero frequency*/
+  BPMNode* leaves; /*the symbols, only those with > 0 frequency*/
+
+  if(numcodes == 0) return 80; /*error: a tree of 0 symbols is not supposed to be made*/
+  if((1u << maxbitlen) < (unsigned)numcodes) return 80; /*error: represent all symbols*/
+
+  leaves = (BPMNode*)lodepng_malloc(numcodes * sizeof(*leaves));
+  if(!leaves) return 83; /*alloc fail*/
+
+  /*collect the symbols that actually occur; index remembers the original symbol*/
+  for(i = 0; i != numcodes; ++i) {
+    if(frequencies[i] > 0) {
+      leaves[numpresent].weight = (int)frequencies[i];
+      leaves[numpresent].index = i;
+      ++numpresent;
+    }
+  }
+
+  for(i = 0; i != numcodes; ++i) lengths[i] = 0;
+
+  /*ensure at least two present symbols. There should be at least one symbol
+  according to RFC 1951 section 3.2.7. Some decoders incorrectly require two. To
+  make these work as well ensure there are at least two symbols. The
+  Package-Merge code below also doesn't work correctly if there's only one
+  symbol, it'd give it the theoretical 0 bits but in practice zlib wants 1 bit*/
+  if(numpresent == 0) {
+    lengths[0] = lengths[1] = 1; /*note that for RFC 1951 section 3.2.7, only lengths[0] = 1 is needed*/
+  } else if(numpresent == 1) {
+    lengths[leaves[0].index] = 1;
+    lengths[leaves[0].index == 0 ? 1 : 0] = 1;
+  } else {
+    BPMLists lists;
+    BPMNode* node;
+
+    bpmnode_sort(leaves, numpresent);
+
+    lists.listsize = maxbitlen;
+    lists.memsize = 2 * maxbitlen * (maxbitlen + 1);
+    lists.nextfree = 0;
+    lists.numfree = lists.memsize;
+    lists.memory = (BPMNode*)lodepng_malloc(lists.memsize * sizeof(*lists.memory));
+    lists.freelist = (BPMNode**)lodepng_malloc(lists.memsize * sizeof(BPMNode*));
+    lists.chains0 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+    lists.chains1 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+    if(!lists.memory || !lists.freelist || !lists.chains0 || !lists.chains1) error = 83; /*alloc fail*/
+
+    if(!error) {
+      for(i = 0; i != lists.memsize; ++i) lists.freelist[i] = &lists.memory[i];
+
+      /*these two calls take memory[0] and memory[1], the initial heads of every list*/
+      bpmnode_create(&lists, leaves[0].weight, 1, 0);
+      bpmnode_create(&lists, leaves[1].weight, 2, 0);
+
+      for(i = 0; i != lists.listsize; ++i) {
+        lists.chains0[i] = &lists.memory[0];
+        lists.chains1[i] = &lists.memory[1];
+      }
+
+      /*each boundaryPM call adds one chain to the last list, and we need 2 * numpresent - 2 chains.*/
+      for(i = 2; i != 2 * numpresent - 2; ++i) boundaryPM(&lists, leaves, numpresent, (int)maxbitlen - 1, (int)i);
+
+      /*every node in the final chain adds one bit to the first node->index leaves*/
+      for(node = lists.chains1[maxbitlen - 1]; node; node = node->tail) {
+        for(i = 0; i != node->index; ++i) ++lengths[leaves[i].index];
+      }
+    }
+
+    /*freed on the error path too; lodepng_free is called on whichever allocations succeeded*/
+    lodepng_free(lists.memory);
+    lodepng_free(lists.freelist);
+    lodepng_free(lists.chains0);
+    lodepng_free(lists.chains1);
+  }
+
+  lodepng_free(leaves);
+  return error;
+}
+
+/*Create the Huffman tree given the symbol frequencies.
+Trailing symbols with frequency zero are trimmed off, but never below mincodes
+symbols. maxbitlen is the longest code length allowed.
+Returns 0 on success, otherwise an error code (83 = allocation failure).*/
+static unsigned HuffmanTree_makeFromFrequencies(HuffmanTree* tree, const unsigned* frequencies,
+                                                size_t mincodes, size_t numcodes, unsigned maxbitlen) {
+  unsigned error = 0;
+  /*trim zeroes. The bound is tested first so that frequencies[numcodes - 1]
+  is never read when numcodes is 0.*/
+  while(numcodes > mincodes && !frequencies[numcodes - 1]) --numcodes;
+  tree->maxbitlen = maxbitlen;
+  tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+  tree->lengths = (unsigned*)lodepng_realloc(tree->lengths, numcodes * sizeof(unsigned));
+  if(!tree->lengths) return 83; /*alloc fail*/
+  /*initialize all lengths to 0*/
+  memset(tree->lengths, 0, numcodes * sizeof(unsigned));
+
+  error = lodepng_huffman_code_lengths(tree->lengths, frequencies, numcodes, maxbitlen);
+  if(!error) error = HuffmanTree_makeFromLengths2(tree);
+  return error;
+}
+
+/*returns the entry of tree1d (the code) for symbol index; index is not range checked*/
+static unsigned HuffmanTree_getCode(const HuffmanTree* tree, unsigned index) {
+  return tree->tree1d[index];
+}
+
+/*returns the code length in bits stored for symbol index; index is not range checked*/
+static unsigned HuffmanTree_getLength(const HuffmanTree* tree, unsigned index) {
+  return tree->lengths[index];
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*get the literal and length code tree of a deflated block with fixed tree, as per the deflate specification.
+Returns 0 on success or an error code (83 = allocation failure).*/
+static unsigned generateFixedLitLenTree(HuffmanTree* tree) {
+  unsigned error;
+  unsigned symbol;
+  unsigned* bitlen = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+  if(!bitlen) return 83; /*alloc fail*/
+
+  /*288 possible codes: 0-255=literals, 256=endcode, 257-285=lengthcodes, 286-287=unused*/
+  for(symbol = 0; symbol != NUM_DEFLATE_CODE_SYMBOLS; ++symbol) {
+    if(symbol < 144) bitlen[symbol] = 8;
+    else if(symbol < 256) bitlen[symbol] = 9;
+    else if(symbol < 280) bitlen[symbol] = 7;
+    else bitlen[symbol] = 8;
+  }
+
+  error = HuffmanTree_makeFromLengths(tree, bitlen, NUM_DEFLATE_CODE_SYMBOLS, 15);
+  lodepng_free(bitlen); /*the tree keeps its own copy of the lengths*/
+  return error;
+}
+
+/*get the distance code tree of a deflated block with fixed tree, as specified in the deflate specification.
+Returns 0 on success or an error code (83 = allocation failure).*/
+static unsigned generateFixedDistanceTree(HuffmanTree* tree) {
+  unsigned error;
+  unsigned symbol = 0;
+  unsigned* bitlen = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+  if(!bitlen) return 83; /*alloc fail*/
+
+  /*there are 32 distance codes, but 30-31 are unused*/
+  while(symbol != NUM_DISTANCE_SYMBOLS) {
+    bitlen[symbol] = 5;
+    ++symbol;
+  }
+
+  error = HuffmanTree_makeFromLengths(tree, bitlen, NUM_DISTANCE_SYMBOLS, 15);
+  lodepng_free(bitlen); /*the tree keeps its own copy of the lengths*/
+  return error;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Decodes one symbol by walking codetree->tree2d one input bit at a time, advancing *bp
+past every bit that is consumed.
+inbitlength is the length of the complete buffer, in bits (so its byte length times 8).
+Returns the decoded symbol, or (unsigned)(-1) when the input ends before a symbol is
+complete or when the walk leaves the tree.
+*/
+static unsigned huffmanDecodeSymbol(const unsigned char* in, size_t* bp,
+                                    const HuffmanTree* codetree, size_t inbitlength) {
+  const unsigned failure = (unsigned)(-1);
+  unsigned node = 0;
+
+  /*the bit read stays inlined here because this loop is the biggest bottleneck while decoding*/
+  while(*bp < inbitlength) {
+    unsigned bit = READBIT(*bp, in);
+    unsigned entry = codetree->tree2d[(node << 1) + bit];
+    ++(*bp);
+
+    /*entries below numcodes are finished symbols, all others encode the next tree position*/
+    if(entry < codetree->numcodes) return entry;
+    node = entry - codetree->numcodes;
+
+    if(node >= codetree->numcodes) return failure; /*error: it appeared outside the codetree*/
+  }
+
+  return failure; /*error: end of input memory reached without endcode*/
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Inflator (Decompressor)                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*get the tree of a deflated block with fixed tree, as specified in the deflate specification.
+tree_ll receives the literal/length tree, tree_d the distance tree.
+Both generators return an error code (83 when their temporary table cannot be allocated),
+but it is discarded here, so a tree that failed to build is not reported to the caller.*/
+static void getTreeInflateFixed(HuffmanTree* tree_ll, HuffmanTree* tree_d) {
+  /*TODO: check for out of memory errors*/
+  generateFixedLitLenTree(tree_ll);
+  generateFixedDistanceTree(tree_d);
+}
+
+/*get the tree of a deflated block with dynamic tree, the tree itself is also Huffman compressed with a known tree.
+Reads HLIT, HDIST, HCLEN and the code length code lengths from "in" starting at bit position *bp
+(*bp is advanced past everything that is read), uses them to decode the lit/len and distance code
+lengths, and builds tree_ll and tree_d from those. inlength is the size of "in" in bytes.
+Returns 0 on success, otherwise an error code (49/50: input too short, 54: repeat code with nothing
+to repeat, 13/14/15: repeat runs past HLIT + HDIST, 10/11/16: bad code length symbol, 64: end code has
+length 0, 83: alloc fail, or the error of HuffmanTree_makeFromLengths). All temporaries are freed on
+every path that passes the HuffmanTree_init call.*/
+static unsigned getTreeInflateDynamic(HuffmanTree* tree_ll, HuffmanTree* tree_d,
+                                      const unsigned char* in, size_t* bp, size_t inlength) {
+  /*make sure that length values that aren't filled in will be 0, or a wrong tree will be generated*/
+  unsigned error = 0;
+  unsigned n, HLIT, HDIST, HCLEN, i;
+  size_t inbitlength = inlength * 8;
+
+  /*see comments in deflateDynamic for explanation of the context and these variables, it is analogous*/
+  unsigned* bitlen_ll = 0; /*lit,len code lengths*/
+  unsigned* bitlen_d = 0; /*dist code lengths*/
+  /*code length code lengths ("clcl"), the bit lengths of the huffman tree used to compress bitlen_ll and bitlen_d*/
+  unsigned* bitlen_cl = 0;
+  HuffmanTree tree_cl; /*the code tree for code length codes (the huffman tree for compressed huffman trees)*/
+
+  if((*bp) + 14 > (inlength << 3)) return 49; /*error: the bit pointer is or will go past the memory*/
+
+  /*number of literal/length codes + 257. Unlike the spec, the value 257 is added to it here already*/
+  HLIT =  readBitsFromStream(bp, in, 5) + 257;
+  /*number of distance codes. Unlike the spec, the value 1 is added to it here already*/
+  HDIST = readBitsFromStream(bp, in, 5) + 1;
+  /*number of code length codes. Unlike the spec, the value 4 is added to it here already*/
+  HCLEN = readBitsFromStream(bp, in, 4) + 4;
+
+  if((*bp) + HCLEN * 3 > (inlength << 3)) return 50; /*error: the bit pointer is or will go past the memory*/
+
+  HuffmanTree_init(&tree_cl);
+
+  while(!error) { /*runs at most once: it only exists so that errors can break out to the cleanup below*/
+    /*read the code length codes out of 3 * (amount of code length codes) bits*/
+
+    bitlen_cl = (unsigned*)lodepng_malloc(NUM_CODE_LENGTH_CODES * sizeof(unsigned));
+    if(!bitlen_cl) ERROR_BREAK(83 /*alloc fail*/);
+
+    for(i = 0; i != NUM_CODE_LENGTH_CODES; ++i) {
+      if(i < HCLEN) bitlen_cl[CLCL_ORDER[i]] = readBitsFromStream(bp, in, 3);
+      else bitlen_cl[CLCL_ORDER[i]] = 0; /*if not, it must stay 0*/
+    }
+
+    error = HuffmanTree_makeFromLengths(&tree_cl, bitlen_cl, NUM_CODE_LENGTH_CODES, 7);
+    if(error) break;
+
+    /*now we can use this tree to read the lengths for the tree that this function will return*/
+    bitlen_ll = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+    bitlen_d = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+    if(!bitlen_ll || !bitlen_d) ERROR_BREAK(83 /*alloc fail*/);
+    for(i = 0; i != NUM_DEFLATE_CODE_SYMBOLS; ++i) bitlen_ll[i] = 0;
+    for(i = 0; i != NUM_DISTANCE_SYMBOLS; ++i) bitlen_d[i] = 0;
+
+    /*i is the current symbol we're reading in the part that contains the code lengths of lit/len and dist codes.
+    Indices below HLIT go to bitlen_ll, the remaining HDIST ones go to bitlen_d*/
+    i = 0;
+    while(i < HLIT + HDIST) {
+      unsigned code = huffmanDecodeSymbol(in, bp, &tree_cl, inbitlength);
+      if(code <= 15) /*a length code*/ {
+        if(i < HLIT) bitlen_ll[i] = code;
+        else bitlen_d[i - HLIT] = code;
+        ++i;
+      } else if(code == 16) /*repeat previous*/ {
+        unsigned replength = 3; /*read in the 2 bits that indicate repeat length (3-6)*/
+        unsigned value; /*set value to the previous code*/
+
+        if(i == 0) ERROR_BREAK(54); /*can't repeat previous if i is 0*/
+
+        if((*bp + 2) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+        replength += readBitsFromStream(bp, in, 2);
+
+        if(i < HLIT + 1) value = bitlen_ll[i - 1]; /*the previous symbol (i - 1) is still a lit/len one*/
+        else value = bitlen_d[i - HLIT - 1];
+        /*repeat this value in the next lengths*/
+        for(n = 0; n < replength; ++n) {
+          if(i >= HLIT + HDIST) ERROR_BREAK(13); /*error: i is larger than the amount of codes*/
+          if(i < HLIT) bitlen_ll[i] = value;
+          else bitlen_d[i - HLIT] = value;
+          ++i;
+        }
+      } else if(code == 17) /*repeat "0" 3-10 times*/ {
+        unsigned replength = 3; /*read in the bits that indicate repeat length*/
+        if((*bp + 3) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+        replength += readBitsFromStream(bp, in, 3);
+
+        /*repeat this value in the next lengths*/
+        for(n = 0; n < replength; ++n) {
+          if(i >= HLIT + HDIST) ERROR_BREAK(14); /*error: i is larger than the amount of codes*/
+
+          if(i < HLIT) bitlen_ll[i] = 0;
+          else bitlen_d[i - HLIT] = 0;
+          ++i;
+        }
+      } else if(code == 18) /*repeat "0" 11-138 times*/ {
+        unsigned replength = 11; /*read in the bits that indicate repeat length*/
+        if((*bp + 7) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+        replength += readBitsFromStream(bp, in, 7);
+
+        /*repeat this value in the next lengths*/
+        for(n = 0; n < replength; ++n) {
+          if(i >= HLIT + HDIST) ERROR_BREAK(15); /*error: i is larger than the amount of codes*/
+
+          if(i < HLIT) bitlen_ll[i] = 0;
+          else bitlen_d[i - HLIT] = 0;
+          ++i;
+        }
+      } else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/ {
+        if(code == (unsigned)(-1)) {
+          /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+          (10=no endcode, 11=wrong jump outside of tree)
+          NOTE(review): huffmanDecodeSymbol stops as soon as *bp reaches inbitlength and the reads above are
+          bounds checked, so *bp does not seem able to exceed inbitlength here and 11 is what gets reported
+          in both situations - confirm whether ">=" was intended*/
+          error = (*bp) > inbitlength ? 10 : 11;
+        }
+        else error = 16; /*unexisting code, this can never happen*/
+        break;
+      }
+    }
+    if(error) break;
+
+    if(bitlen_ll[256] == 0) ERROR_BREAK(64); /*the length of the end code 256 must be larger than 0*/
+
+    /*now we've finally got HLIT and HDIST, so generate the code trees, and the function is done*/
+    error = HuffmanTree_makeFromLengths(tree_ll, bitlen_ll, NUM_DEFLATE_CODE_SYMBOLS, 15);
+    if(error) break;
+    error = HuffmanTree_makeFromLengths(tree_d, bitlen_d, NUM_DISTANCE_SYMBOLS, 15);
+
+    break; /*end of error-while*/
+  }
+
+  lodepng_free(bitlen_cl); /*any of these three may still be 0 when an early error occurred*/
+  lodepng_free(bitlen_ll);
+  lodepng_free(bitlen_d);
+  HuffmanTree_cleanup(&tree_cl);
+
+  return error;
+}
+
+/*inflate a block with dynamic or fixed Huffman tree.
+out: output vector, grown as needed; *pos is the index in out->data where the next byte is written.
+in / inlength: the compressed input and its size in bytes; *bp is the current bit position in it.
+btype: 1 selects the fixed trees, 2 reads dynamic trees from the input first.
+Decodes symbols until the end code 256 is found. Returns 0 on success, otherwise an error code
+(10/11: symbol could not be decoded, 18: invalid distance code, 51: extra bits run past the input,
+52: distance reaches before the start of the output, 83: alloc fail, or the error of
+getTreeInflateDynamic). Both trees are cleaned up on every path.*/
+static unsigned inflateHuffmanBlock(ucvector* out, const unsigned char* in, size_t* bp,
+                                    size_t* pos, size_t inlength, unsigned btype) {
+  unsigned error = 0;
+  HuffmanTree tree_ll; /*the huffman tree for literal and length codes*/
+  HuffmanTree tree_d; /*the huffman tree for distance codes*/
+  size_t inbitlength = inlength * 8;
+
+  HuffmanTree_init(&tree_ll);
+  HuffmanTree_init(&tree_d);
+
+  if(btype == 1) getTreeInflateFixed(&tree_ll, &tree_d); /*NOTE(review): returns void, so a failure to build the fixed trees goes unnoticed here*/
+  else if(btype == 2) error = getTreeInflateDynamic(&tree_ll, &tree_d, in, bp, inlength);
+
+  while(!error) /*decode all symbols until end reached, breaks at end code*/ {
+    /*code_ll is literal, length or end code*/
+    unsigned code_ll = huffmanDecodeSymbol(in, bp, &tree_ll, inbitlength);
+    if(code_ll <= 255) /*literal symbol*/ {
+      /*ucvector_push_back would do the same, but for some reason the two lines below run 10% faster*/
+      if(!ucvector_resize(out, (*pos) + 1)) ERROR_BREAK(83 /*alloc fail*/);
+      out->data[*pos] = (unsigned char)code_ll;
+      ++(*pos);
+    } else if(code_ll >= FIRST_LENGTH_CODE_INDEX && code_ll <= LAST_LENGTH_CODE_INDEX) /*length code*/ {
+      unsigned code_d, distance;
+      unsigned numextrabits_l, numextrabits_d; /*extra bits for length and distance*/
+      size_t start, forward, backward, length;
+
+      /*part 1: get length base*/
+      length = LENGTHBASE[code_ll - FIRST_LENGTH_CODE_INDEX];
+
+      /*part 2: get extra bits and add the value of that to length*/
+      numextrabits_l = LENGTHEXTRA[code_ll - FIRST_LENGTH_CODE_INDEX];
+      if((*bp + numextrabits_l) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+      length += readBitsFromStream(bp, in, numextrabits_l);
+
+      /*part 3: get distance code*/
+      code_d = huffmanDecodeSymbol(in, bp, &tree_d, inbitlength);
+      if(code_d > 29) {
+        if(code_d == (unsigned)(-1)) /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/ {
+          /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+          (10=no endcode, 11=wrong jump outside of tree)*/
+          error = (*bp) > inlength * 8 ? 10 : 11;
+        }
+        else error = 18; /*error: invalid distance code (30-31 are never used)*/
+        break;
+      }
+      distance = DISTANCEBASE[code_d];
+
+      /*part 4: get extra bits from distance*/
+      numextrabits_d = DISTANCEEXTRA[code_d];
+      if((*bp + numextrabits_d) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+      distance += readBitsFromStream(bp, in, numextrabits_d);
+
+      /*part 5: fill in all the out[n] values based on the length and dist*/
+      start = (*pos);
+      if(distance > start) ERROR_BREAK(52); /*too long backward distance*/
+      backward = start - distance;
+
+      if(!ucvector_resize(out, (*pos) + length)) ERROR_BREAK(83 /*alloc fail*/);
+      if (distance < length) { /*source and destination overlap: copy byte by byte so that just written bytes get repeated*/
+        for(forward = 0; forward < length; ++forward) {
+          out->data[(*pos)++] = out->data[backward++];
+        }
+      } else { /*no overlap, so one memcpy is enough*/
+        memcpy(out->data + *pos, out->data + backward, length);
+        *pos += length;
+      }
+    } else if(code_ll == 256) {
+      break; /*end code, break the loop*/
+    } else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/ {
+      /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+      (10=no endcode, 11=wrong jump outside of tree)*/
+      error = ((*bp) > inlength * 8) ? 10 : 11;
+      break;
+    }
+  }
+
+  HuffmanTree_cleanup(&tree_ll);
+  HuffmanTree_cleanup(&tree_d);
+
+  return error;
+}
+
+/*
+Inflates one stored (BTYPE 0) deflate block.
+out: output vector, grown by LEN bytes; *pos is the index in out->data where the next byte is written
+and is advanced by LEN.
+in / inlength: the compressed input and its size in bytes.
+*bp: bit position just after the 3 block header bits; it is rounded up to the next byte boundary first
+and on success points just behind the payload of this block.
+Returns 0 on success, 52 when the 4 byte LEN/NLEN header does not fit in the input, 21 when NLEN is not
+the one's complement of LEN, 23 when the payload does not fit in the input, 83 on allocation failure.
+*/
+static unsigned inflateNoCompression(ucvector* out, const unsigned char* in, size_t* bp, size_t* pos, size_t inlength) {
+  size_t p;
+  unsigned LEN, NLEN;
+
+  /*go to first boundary of byte*/
+  while(((*bp) & 0x7) != 0) ++(*bp);
+  p = (*bp) / 8; /*byte position*/
+
+  /*read LEN (2 bytes) and NLEN (2 bytes). The header occupies in[p] up to and including in[p + 3], so it
+  fits exactly when p + 4 <= inlength: a block with LEN 0 may end on the very last byte of the input*/
+  if(p > inlength || inlength - p < 4) return 52; /*error, bit pointer will jump past memory*/
+  LEN = in[p] + 256u * in[p + 1]; p += 2;
+  NLEN = in[p] + 256u * in[p + 1]; p += 2;
+
+  /*check if 16-bit NLEN is really the one's complement of LEN*/
+  if(LEN + NLEN != 65535) return 21; /*error: NLEN is not one's complement of LEN*/
+
+  /*validate the payload size before touching the output, so a corrupt block does not grow it*/
+  if(LEN > inlength - p) return 23; /*error: reading outside of in buffer*/
+
+  if(!ucvector_resize(out, (*pos) + LEN)) return 83; /*alloc fail*/
+
+  /*copy the literal data: LEN bytes are now stored in the out buffer*/
+  if(LEN != 0) memcpy(out->data + *pos, in + p, LEN);
+  *pos += LEN;
+  p += LEN;
+
+  (*bp) = p * 8;
+
+  return 0;
+}
+
+/*
+Inflates a complete deflate stream of insize bytes from "in" into out, one block after the other,
+until the block that has its BFINAL bit set was processed. settings is not used.
+Returns 0 on success, 52 when the input ends before a block header, 20 for the invalid block type 3,
+otherwise the error of the block decoder.
+*/
+static unsigned lodepng_inflatev(ucvector* out,
+                                 const unsigned char* in, size_t insize,
+                                 const LodePNGDecompressSettings* settings) {
+  /*bit position in "in": the current byte is bitpos >> 3, the current bit is bitpos & 0x7 (lsb first)*/
+  size_t bitpos = 0;
+  size_t outpos = 0; /*byte position in the out buffer*/
+  unsigned is_last = 0;
+  unsigned status = 0;
+
+  (void)settings;
+
+  do {
+    unsigned block_type;
+
+    /*a block header is 3 bits: BFINAL followed by the 2 bits of BTYPE, least significant first*/
+    if(bitpos + 2 >= insize * 8) return 52; /*error, bit pointer will jump past memory*/
+    is_last = readBitFromStream(&bitpos, in);
+    block_type = 1u * readBitFromStream(&bitpos, in);
+    block_type += 2u * readBitFromStream(&bitpos, in);
+
+    switch(block_type) {
+      case 3:
+        return 20; /*error: invalid BTYPE*/
+      case 0: /*no compression*/
+        status = inflateNoCompression(out, in, &bitpos, &outpos, insize);
+        break;
+      default: /*compression, BTYPE 01 or 10*/
+        status = inflateHuffmanBlock(out, in, &bitpos, &outpos, insize, block_type);
+        break;
+    }
+
+    if(status) return status;
+  } while(!is_last);
+
+  return status;
+}
+
+/*
+Inflates the deflate stream in[0..insize) into the buffer described by *out / *outsize.
+The buffer is wrapped in a ucvector for decoding; afterwards *out and *outsize are updated from that
+vector, also when an error is returned. Returns the error code of lodepng_inflatev.
+*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+                         const unsigned char* in, size_t insize,
+                         const LodePNGDecompressSettings* settings) {
+  ucvector buffer;
+  unsigned status;
+
+  ucvector_init_buffer(&buffer, *out, *outsize);
+  status = lodepng_inflatev(&buffer, in, insize, settings);
+
+  /*hand the vector's storage back to the caller*/
+  *outsize = buffer.size;
+  *out = buffer.data;
+  return status;
+}
+
+/*inflates with the custom_inflate callback of the settings when one is set, otherwise with lodepng_inflate*/
+static unsigned inflate(unsigned char** out, size_t* outsize,
+                        const unsigned char* in, size_t insize,
+                        const LodePNGDecompressSettings* settings) {
+  if(!settings->custom_inflate) {
+    return lodepng_inflate(out, outsize, in, insize, settings);
+  }
+  return settings->custom_inflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflator (Compressor)                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static const size_t MAX_SUPPORTED_DEFLATE_LENGTH = 258; /*upper limit for match lengths: caps nicematch, the match search and the zero run count in encodeLZ77*/
+
+/*bitlen is the size in bits of the code.
+Forwards the code to addBitsToStreamReversed, which appends it to compressed at bit position *bp.*/
+static void addHuffmanSymbol(size_t* bp, ucvector* compressed, unsigned code, unsigned bitlen) {
+  addBitsToStreamReversed(bp, compressed, code, bitlen);
+}
+
+/*
+Returns the index of the largest entry of the sorted array that is smaller than or equal to value.
+Index 0 is the lower bound of the result: it is also returned when every entry is larger than value.
+array_size must be at least 1.
+*/
+static size_t searchCodeIndex(const unsigned* array, size_t array_size, size_t value) {
+  /*binary search over the indices 1 .. array_size - 1 for the first entry that is not below value*/
+  size_t lo = 1;
+  size_t hi = array_size - 1;
+
+  while(lo <= hi) {
+    size_t probe = (lo + hi) / 2;
+    if(array[probe] < value) {
+      lo = probe + 1;
+    } else {
+      hi = probe - 1;
+    }
+  }
+
+  /*lo is an exact hit when it is in range and not above value, else the entry before it is the answer*/
+  if(lo < array_size && array[lo] <= value) return lo;
+  return lo - 1;
+}
+
+/*
+Appends one length/distance pair to values as four consecutive entries:
+the length symbol (257-285), the extra length bits, the distance code and the extra distance bits.
+Together with the other values used by deflate, the vector then contains:
+0-255: literal bytes
+256: end
+257-285: length/distance pair (length code, followed by extra length bits, distance code, extra distance bits)
+286-287: invalid
+The results of uivector_push_back are not checked.
+*/
+static void addLengthDistance(uivector* values, size_t length, size_t distance) {
+  const unsigned len_index = (unsigned)searchCodeIndex(LENGTHBASE, 29, length);
+  const unsigned dist_index = (unsigned)searchCodeIndex(DISTANCEBASE, 30, distance);
+  /*what remains above the base value of the chosen code is stored as extra bits*/
+  const unsigned len_rest = (unsigned)(length - LENGTHBASE[len_index]);
+  const unsigned dist_rest = (unsigned)(distance - DISTANCEBASE[dist_index]);
+
+  uivector_push_back(values, len_index + FIRST_LENGTH_CODE_INDEX);
+  uivector_push_back(values, len_rest);
+  uivector_push_back(values, dist_index);
+  uivector_push_back(values, dist_rest);
+}
+
+/*3 bytes of data get encoded into two bytes. The hash cannot use more than 3
+bytes as input because 3 is the minimum match length for deflate*/
+static const unsigned HASH_NUM_VALUES = 65536; /*amount of entries hash_init allocates for Hash.head*/
+static const unsigned HASH_BIT_MASK = 65535; /*HASH_NUM_VALUES - 1, but C90 does not like that as initializer. getHash masks its result with it*/
+
+typedef struct Hash { /*lookup tables of encodeLZ77; allocated by hash_init, released by hash_cleanup*/
+  int* head; /*hash value to head circular pos - can be outdated if went around window. HASH_NUM_VALUES entries, -1 means empty*/
+  /*circular pos to prev circular pos. windowsize entries, an entry equal to its own index is uninitialized*/
+  unsigned short* chain;
+  int* val; /*circular pos to hash value. windowsize entries, -1 means not set yet*/
+
+  /*TODO: do this not only for zeros but for any repeated byte. However for PNG
+  it's always going to be the zeros that dominate, so not important for PNG*/
+  int* headz; /*similar to head, but for chainz. MAX_SUPPORTED_DEFLATE_LENGTH + 1 entries indexed by zero count, -1 means empty*/
+  unsigned short* chainz; /*those with same amount of zeros. windowsize entries, initialized like chain*/
+  unsigned short* zeros; /*length of zeros streak, used as a second hash chain. windowsize entries, not initialized by hash_init*/
+} Hash;
+
+/*
+Allocates the six tables of hash for a sliding window of windowsize positions and marks every
+entry of head, val, chain, headz and chainz as unused (zeros is left uninitialized).
+Returns 0 on success and 83 when an allocation fails. In the failure case the tables that
+could be allocated are released again and all six pointers are set to 0, so nothing leaks when
+the caller just returns the error, and a later hash_cleanup only frees null pointers.
+*/
+static unsigned hash_init(Hash* hash, unsigned windowsize) {
+  unsigned i;
+  hash->head = (int*)lodepng_malloc(sizeof(int) * HASH_NUM_VALUES);
+  hash->val = (int*)lodepng_malloc(sizeof(int) * windowsize);
+  hash->chain = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+  hash->zeros = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+  hash->headz = (int*)lodepng_malloc(sizeof(int) * (MAX_SUPPORTED_DEFLATE_LENGTH + 1));
+  hash->chainz = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+  if(!hash->head || !hash->chain || !hash->val  || !hash->headz|| !hash->chainz || !hash->zeros) {
+    /*undo the partial allocation; freeing a null pointer is relied upon elsewhere in this file too*/
+    lodepng_free(hash->head);
+    lodepng_free(hash->val);
+    lodepng_free(hash->chain);
+    lodepng_free(hash->zeros);
+    lodepng_free(hash->headz);
+    lodepng_free(hash->chainz);
+    hash->head = 0;
+    hash->val = 0;
+    hash->chain = 0;
+    hash->zeros = 0;
+    hash->headz = 0;
+    hash->chainz = 0;
+    return 83; /*alloc fail*/
+  }
+
+  /*initialize hash table*/
+  for(i = 0; i != HASH_NUM_VALUES; ++i) hash->head[i] = -1;
+  for(i = 0; i != windowsize; ++i) hash->val[i] = -1;
+  for(i = 0; i != windowsize; ++i) hash->chain[i] = (unsigned short)i; /*same value as index indicates uninitialized*/
+
+  for(i = 0; i <= MAX_SUPPORTED_DEFLATE_LENGTH; ++i) hash->headz[i] = -1;
+  for(i = 0; i != windowsize; ++i) hash->chainz[i] = (unsigned short)i; /*same value as index indicates uninitialized*/
+
+  return 0;
+}
+
+/*releases the six tables of hash with lodepng_free; the Hash struct itself is left untouched*/
+static void hash_cleanup(Hash* hash) {
+  /*tables of the zero run chain*/
+  lodepng_free(hash->chainz);
+  lodepng_free(hash->headz);
+  lodepng_free(hash->zeros);
+
+  /*tables of the regular hash chain*/
+  lodepng_free(hash->chain);
+  lodepng_free(hash->val);
+  lodepng_free(hash->head);
+}
+
+
+
+/*
+Computes the hash of the (up to) 3 bytes of data that start at pos; size is the length of data.
+The result is masked with HASH_BIT_MASK. Positions at or behind the end of data hash to 0.
+*/
+static unsigned getHash(const unsigned char* data, size_t size, size_t pos) {
+  unsigned hash = 0;
+
+  if(pos + 2 >= size) {
+    /*fewer than 3 bytes are left: combine only the bytes that exist, one per 8 bits*/
+    size_t remaining, k;
+    if(pos >= size) return 0;
+    remaining = size - pos;
+    for(k = 0; k != remaining; ++k) hash ^= (unsigned)(data[pos + k] << (k * 8u));
+    return hash & HASH_BIT_MASK;
+  }
+
+  /*A simple shift and xor hash is used. Since the data of PNGs is dominated
+  by zeroes due to the filters, a better hash does not have a significant
+  effect on speed in traversing the chain, and causes more time spend on
+  calculating the hash.*/
+  hash = (unsigned)(data[pos + 0] << 0u)
+       ^ (unsigned)(data[pos + 1] << 4u)
+       ^ (unsigned)(data[pos + 2] << 8u);
+  return hash & HASH_BIT_MASK;
+}
+
+/*
+Counts the zero bytes at the start of data + pos. Counting stops at the first non-zero byte, at the
+end of data (size bytes) or after MAX_SUPPORTED_DEFLATE_LENGTH bytes, whichever comes first.
+*/
+static unsigned countZeros(const unsigned char* data, size_t size, size_t pos) {
+  const unsigned char* const first = data + pos;
+  const unsigned char* stop = first + MAX_SUPPORTED_DEFLATE_LENGTH;
+  const unsigned char* cursor = first;
+
+  if(stop > data + size) stop = data + size;
+
+  for(; cursor != stop; ++cursor) {
+    if(*cursor != 0) break;
+  }
+
+  /*the pointer difference is at most MAX_SUPPORTED_DEFLATE_LENGTH, so it fits in an unsigned*/
+  return (unsigned)(cursor - first);
+}
+
+/*
+Registers the window position wpos (wpos = pos & (windowsize - 1)) in both chains of the hash:
+it becomes the new head for hashval and for numzeros, and gets linked to the previous head of
+each of them when there was one.
+*/
+static void updateHashChain(Hash* hash, size_t wpos, unsigned hashval, unsigned short numzeros) {
+  const int old_head = hash->head[hashval];
+  const int old_headz = hash->headz[numzeros];
+
+  /*regular chain, keyed by hash value*/
+  hash->val[wpos] = (int)hashval;
+  if(old_head != -1) hash->chain[wpos] = (unsigned short)old_head;
+  hash->head[hashval] = (int)wpos;
+
+  /*zero run chain, keyed by the amount of zeros*/
+  hash->zeros[wpos] = numzeros;
+  if(old_headz != -1) hash->chainz[wpos] = (unsigned short)old_headz;
+  hash->headz[numzeros] = (int)wpos;
+}
+
+/*
+LZ77-encode the data. Return value is error code. The input are raw bytes, the output
+is in the form of unsigned integers with codes representing for example literal bytes, or
+length/distance pairs.
+It uses a hash table technique to let it encode faster. When doing LZ77 encoding, a
+sliding window (of windowsize) is used, and all past bytes in that window can be used as
+the "dictionary". A brute force search through all possible distances would be slow, and
+this hash technique is one out of several ways to speed this up.
+out: receives the literals and the four-value length/distance groups written by addLengthDistance.
+hash: the tables from hash_init, updated for every position that is processed.
+in, inpos, insize: the bytes in[inpos] up to but not including in[insize] are encoded.
+windowsize: must be a power of two in the range 1-32768 (else error 60 or 90).
+minmatch: matches shorter than this are written as literals.
+nicematch: the search stops once a match of at least this length is found; values above
+MAX_SUPPORTED_DEFLATE_LENGTH are clamped to it.
+lazymatching: when non-zero, a match found at one position is held back to see whether the next
+position gives a longer one.
+Returned errors besides 60 and 90: 81 (lazy match pending at pos 0), 83 (alloc fail), 86 (offset too big).
+*/
+static unsigned encodeLZ77(uivector* out, Hash* hash,
+                           const unsigned char* in, size_t inpos, size_t insize, unsigned windowsize,
+                           unsigned minmatch, unsigned nicematch, unsigned lazymatching) {
+  size_t pos;
+  unsigned i, error = 0;
+  /*for large window lengths, assume the user wants no compression loss. Otherwise, max hash chain length speedup.*/
+  unsigned maxchainlength = windowsize >= 8192 ? windowsize : windowsize / 8;
+  unsigned maxlazymatch = windowsize >= 8192 ? MAX_SUPPORTED_DEFLATE_LENGTH : 64;
+
+  unsigned usezeros = 1; /*not sure if setting it to false for windowsize < 8192 is better or worse*/
+  unsigned numzeros = 0;
+
+  unsigned offset; /*the offset represents the distance in LZ77 terminology*/
+  unsigned length;
+  unsigned lazy = 0;
+  unsigned lazylength = 0, lazyoffset = 0;
+  unsigned hashval;
+  unsigned current_offset, current_length;
+  unsigned prev_offset;
+  const unsigned char *lastptr, *foreptr, *backptr;
+  unsigned hashpos;
+
+  if(windowsize == 0 || windowsize > 32768) return 60; /*error: windowsize smaller/larger than allowed*/
+  if((windowsize & (windowsize - 1)) != 0) return 90; /*error: must be power of two*/
+
+  if(nicematch > MAX_SUPPORTED_DEFLATE_LENGTH) nicematch = MAX_SUPPORTED_DEFLATE_LENGTH;
+
+  for(pos = inpos; pos < insize; ++pos) {
+    size_t wpos = pos & (windowsize - 1); /*position for in 'circular' hash buffers*/
+    unsigned chainlength = 0;
+
+    hashval = getHash(in, insize, pos);
+
+    if(usezeros && hashval == 0) {
+      if(numzeros == 0) numzeros = countZeros(in, insize, pos);
+      else if(pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros; /*one position further the known zero run is one shorter, unless another zero follows it*/
+    } else {
+      numzeros = 0;
+    }
+
+    updateHashChain(hash, wpos, hashval, numzeros);
+
+    /*the length and offset found for the current position*/
+    length = 0;
+    offset = 0;
+
+    hashpos = hash->chain[wpos];
+
+    lastptr = &in[insize < pos + MAX_SUPPORTED_DEFLATE_LENGTH ? insize : pos + MAX_SUPPORTED_DEFLATE_LENGTH]; /*one past the last byte a match starting at pos may cover*/
+
+    /*search for the longest string*/
+    prev_offset = 0;
+    for(;;) {
+      if(chainlength++ >= maxchainlength) break;
+      current_offset = (unsigned)(hashpos <= wpos ? wpos - hashpos : wpos - hashpos + windowsize); /*distance from hashpos to wpos inside the circular buffer*/
+
+      if(current_offset < prev_offset) break; /*stop when went completely around the circular buffer*/
+      prev_offset = current_offset;
+      if(current_offset > 0) {
+        /*test the next characters*/
+        foreptr = &in[pos];
+        backptr = &in[pos - current_offset];
+
+        /*common case in PNGs is lots of zeros. Quickly skip over them as a speedup*/
+        if(numzeros >= 3) {
+          unsigned skip = hash->zeros[hashpos];
+          if(skip > numzeros) skip = numzeros;
+          backptr += skip;
+          foreptr += skip;
+        }
+
+        while(foreptr != lastptr && *backptr == *foreptr) /*maximum supported length by deflate is max length*/ {
+          ++backptr;
+          ++foreptr;
+        }
+        current_length = (unsigned)(foreptr - &in[pos]);
+
+        if(current_length > length) {
+          length = current_length; /*the longest length*/
+          offset = current_offset; /*the offset that is related to this longest length*/
+          /*jump out once a length of max length is found (speed gain). This also jumps
+          out if length is MAX_SUPPORTED_DEFLATE_LENGTH*/
+          if(current_length >= nicematch) break;
+        }
+      }
+
+      if(hashpos == hash->chain[hashpos]) break; /*an entry that points to itself is uninitialized (see hash_init): end of chain*/
+
+      if(numzeros >= 3 && length > numzeros) {
+        hashpos = hash->chainz[hashpos];
+        if(hash->zeros[hashpos] != numzeros) break;
+      } else {
+        hashpos = hash->chain[hashpos];
+        /*outdated hash value, happens if particular value was not encountered in whole last window*/
+        if(hash->val[hashpos] != (int)hashval) break;
+      }
+    }
+
+    if(lazymatching) {
+      if(!lazy && length >= 3 && length <= maxlazymatch && length < MAX_SUPPORTED_DEFLATE_LENGTH) {
+        lazy = 1;
+        lazylength = length;
+        lazyoffset = offset;
+        continue; /*try the next byte*/
+      }
+      if(lazy) {
+        lazy = 0;
+        if(pos == 0) ERROR_BREAK(81);
+        if(length > lazylength + 1) {
+          /*push the previous character as literal*/
+          if(!uivector_push_back(out, in[pos - 1])) ERROR_BREAK(83 /*alloc fail*/);
+        } else {
+          length = lazylength;
+          offset = lazyoffset;
+          hash->head[hashval] = -1; /*the same hashchain update will be done, this ensures no wrong alteration*/
+          hash->headz[numzeros] = -1; /*idem*/
+          --pos; /*go back to the position where the held back match starts, it is written from there*/
+        }
+      }
+    }
+    if(length >= 3 && offset > windowsize) ERROR_BREAK(86 /*too big (or overflown negative) offset*/);
+
+    /*encode it as length/distance pair or literal value*/
+    if(length < 3) /*only lengths of 3 or higher are supported as length/distance pair*/ {
+      if(!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+    } else if(length < minmatch || (length == 3 && offset > 4096)) {
+      /*compensate for the fact that longer offsets have more extra bits, a
+      length of only 3 may be not worth it then*/
+      if(!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+    } else {
+      addLengthDistance(out, length, offset);
+      for(i = 1; i < length; ++i) { /*also enter the positions covered by the match into the hash tables*/
+        ++pos;
+        wpos = pos & (windowsize - 1);
+        hashval = getHash(in, insize, pos);
+        if(usezeros && hashval == 0) {
+          if(numzeros == 0) numzeros = countZeros(in, insize, pos);
+          else if(pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+        } else {
+          numzeros = 0;
+        }
+        updateHashChain(hash, wpos, hashval, numzeros);
+      }
+    }
+  } /*end of the loop through each character of input*/
+
+  return error;
+}
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*
+Appends data to out as a series of stored (BTYPE 0) deflate blocks of at most 65535 bytes each.
+Layout of every block: 1 bit BFINAL, 2 bits BTYPE, 5 padding bits up to the next byte boundary,
+2 bytes LEN, 2 bytes NLEN, LEN bytes literal DATA. Only the last block has BFINAL set.
+data / datasize: the bytes to store; when datasize is 0 no block at all is written.
+Returns 0 on success or 83 when out cannot be grown.
+*/
+static unsigned deflateNoCompression(ucvector* out, const unsigned char* data, size_t datasize) {
+  size_t i, numdeflateblocks = (datasize + 65534) / 65535;
+  size_t datapos = 0; /*size_t like datasize, so positions beyond the range of unsigned are not truncated*/
+  for(i = 0; i != numdeflateblocks; ++i) {
+    unsigned BFINAL, BTYPE, LEN, NLEN;
+    size_t blockpos = out->size; /*where this block starts in the output*/
+
+    BFINAL = (i == numdeflateblocks - 1);
+    BTYPE = 0;
+
+    LEN = 65535;
+    if(datasize - datapos < 65535) LEN = (unsigned)(datasize - datapos);
+    NLEN = 65535 - LEN;
+
+    /*reserve header and payload in one go, so that an allocation failure is noticed and reported*/
+    if(!ucvector_resize(out, blockpos + 5 + LEN)) return 83; /*alloc fail*/
+
+    out->data[blockpos + 0] = (unsigned char)(BFINAL + ((BTYPE & 1) << 1) + ((BTYPE & 2) << 1));
+    out->data[blockpos + 1] = (unsigned char)(LEN & 255);
+    out->data[blockpos + 2] = (unsigned char)(LEN >> 8);
+    out->data[blockpos + 3] = (unsigned char)(NLEN & 255);
+    out->data[blockpos + 4] = (unsigned char)(NLEN >> 8);
+
+    /*Decompressed data. LEN is at least 1 here because a block only exists when data is left*/
+    memcpy(out->data + blockpos + 5, data + datapos, LEN);
+    datapos += LEN;
+  }
+
+  return 0;
+}
+
+/*
+Writes the LZ77 encoded values to the compressed stream out (bit position *bp) with the given trees.
+lz77_encoded: literals and end codes take one entry, a length symbol (value above 256) is followed by
+three more entries: its extra length bits, the distance code and the extra distance bits.
+tree_ll: the tree for lit and len codes.
+tree_d: the tree for distance codes.
+*/
+static void writeLZ77data(size_t* bp, ucvector* out, const uivector* lz77_encoded,
+                          const HuffmanTree* tree_ll, const HuffmanTree* tree_d) {
+  const unsigned* values = lz77_encoded->data;
+  size_t cursor = 0;
+
+  while(cursor != lz77_encoded->size) {
+    unsigned symbol = values[cursor++];
+    addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_ll, symbol), HuffmanTree_getLength(tree_ll, symbol));
+
+    if(symbol > 256) /*for a length code, 3 more things have to be added*/ {
+      unsigned len_extra = values[cursor++];
+      unsigned dist_code = values[cursor++];
+      unsigned dist_extra = values[cursor++];
+
+      addBitsToStream(bp, out, len_extra, LENGTHEXTRA[symbol - FIRST_LENGTH_CODE_INDEX]);
+      addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_d, dist_code),
+                       HuffmanTree_getLength(tree_d, dist_code));
+      addBitsToStream(bp, out, dist_extra, DISTANCEEXTRA[dist_code]);
+    }
+  }
+}
+
+/*Deflate for a block of type "dynamic", that is, with freely, optimally, created huffman trees*/
+static unsigned deflateDynamic(ucvector* out, size_t* bp, Hash* hash,
+                               const unsigned char* data, size_t datapos, size_t dataend,
+                               const LodePNGCompressSettings* settings, unsigned final) {
+  unsigned error = 0;
+
+  /*
+  A block is compressed as follows: The PNG data is lz77 encoded, resulting in
+  literal bytes and length/distance pairs. This is then huffman compressed with
+  two huffman trees. One huffman tree is used for the lit and len values ("ll"),
+  another huffman tree is used for the dist values ("d"). These two trees are
+  stored using their code lengths, and to compress even more these code lengths
+  are also run-length encoded and huffman compressed. This gives a huffman tree
+  of code lengths "cl". The code lenghts used to describe this third tree are
+  the code length code lengths ("clcl").
+  */
+
+  /*The lz77 encoded data, represented with integers since there will also be length and distance codes in it*/
+  uivector lz77_encoded;
+  HuffmanTree tree_ll; /*tree for lit,len values*/
+  HuffmanTree tree_d; /*tree for distance codes*/
+  HuffmanTree tree_cl; /*tree for encoding the code lengths representing tree_ll and tree_d*/
+  uivector frequencies_ll; /*frequency of lit,len codes*/
+  uivector frequencies_d; /*frequency of dist codes*/
+  uivector frequencies_cl; /*frequency of code length codes*/
+  uivector bitlen_lld; /*lit,len,dist code lenghts (int bits), literally (without repeat codes).*/
+  uivector bitlen_lld_e; /*bitlen_lld encoded with repeat codes (this is a rudemtary run length compression)*/
+  /*bitlen_cl is the code length code lengths ("clcl"). The bit lengths of codes to represent tree_cl
+  (these are written as is in the file, it would be crazy to compress these using yet another huffman
+  tree that needs to be represented by yet another set of code lengths)*/
+  uivector bitlen_cl;
+  size_t datasize = dataend - datapos;
+
+  /*
+  Due to the huffman compression of huffman tree representations ("two levels"), there are some anologies:
+  bitlen_lld is to tree_cl what data is to tree_ll and tree_d.
+  bitlen_lld_e is to bitlen_lld what lz77_encoded is to data.
+  bitlen_cl is to bitlen_lld_e what bitlen_lld is to lz77_encoded.
+  */
+
+  unsigned BFINAL = final;
+  size_t numcodes_ll, numcodes_d, i;
+  unsigned HLIT, HDIST, HCLEN;
+
+  uivector_init(&lz77_encoded);
+  HuffmanTree_init(&tree_ll);
+  HuffmanTree_init(&tree_d);
+  HuffmanTree_init(&tree_cl);
+  uivector_init(&frequencies_ll);
+  uivector_init(&frequencies_d);
+  uivector_init(&frequencies_cl);
+  uivector_init(&bitlen_lld);
+  uivector_init(&bitlen_lld_e);
+  uivector_init(&bitlen_cl);
+
+  /*This while loop never loops due to a break at the end, it is here to
+  allow breaking out of it to the cleanup phase on error conditions.*/
+  while(!error) {
+    if(settings->use_lz77) {
+      error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+                         settings->minmatch, settings->nicematch, settings->lazymatching);
+      if(error) break;
+    } else {
+      if(!uivector_resize(&lz77_encoded, datasize)) ERROR_BREAK(83 /*alloc fail*/);
+      for(i = datapos; i < dataend; ++i) lz77_encoded.data[i - datapos] = data[i]; /*no LZ77, but still will be Huffman compressed*/
+    }
+
+    if(!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83 /*alloc fail*/);
+    if(!uivector_resizev(&frequencies_d, 30, 0)) ERROR_BREAK(83 /*alloc fail*/);
+
+    /*Count the frequencies of lit, len and dist codes*/
+    for(i = 0; i != lz77_encoded.size; ++i) {
+      unsigned symbol = lz77_encoded.data[i];
+      ++frequencies_ll.data[symbol];
+      if(symbol > 256) {
+        unsigned dist = lz77_encoded.data[i + 2];
+        ++frequencies_d.data[dist];
+        i += 3;
+      }
+    }
+    frequencies_ll.data[256] = 1; /*there will be exactly 1 end code, at the end of the block*/
+
+    /*Make both huffman trees, one for the lit and len codes, one for the dist codes*/
+    error = HuffmanTree_makeFromFrequencies(&tree_ll, frequencies_ll.data, 257, frequencies_ll.size, 15);
+    if(error) break;
+    /*2, not 1, is chosen for mincodes: some buggy PNG decoders require at least 2 symbols in the dist tree*/
+    error = HuffmanTree_makeFromFrequencies(&tree_d, frequencies_d.data, 2, frequencies_d.size, 15);
+    if(error) break;
+
+    numcodes_ll = tree_ll.numcodes; if(numcodes_ll > 286) numcodes_ll = 286;
+    numcodes_d = tree_d.numcodes; if(numcodes_d > 30) numcodes_d = 30;
+    /*store the code lengths of both generated trees in bitlen_lld*/
+    for(i = 0; i != numcodes_ll; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_ll, (unsigned)i));
+    for(i = 0; i != numcodes_d; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_d, (unsigned)i));
+
+    /*run-length compress bitlen_lld into bitlen_lld_e by using repeat codes 16 (copy length 3-6 times),
+    17 (3-10 zeroes), 18 (11-138 zeroes)*/
+    for(i = 0; i != (unsigned)bitlen_lld.size; ++i) {
+      unsigned j = 0; /*amount of repititions*/
+      while(i + j + 1 < (unsigned)bitlen_lld.size && bitlen_lld.data[i + j + 1] == bitlen_lld.data[i]) ++j;
+
+      if(bitlen_lld.data[i] == 0 && j >= 2) /*repeat code for zeroes*/ {
+        ++j; /*include the first zero*/
+        if(j <= 10) /*repeat code 17 supports max 10 zeroes*/ {
+          uivector_push_back(&bitlen_lld_e, 17);
+          uivector_push_back(&bitlen_lld_e, j - 3);
+        } else /*repeat code 18 supports max 138 zeroes*/ {
+          if(j > 138) j = 138;
+          uivector_push_back(&bitlen_lld_e, 18);
+          uivector_push_back(&bitlen_lld_e, j - 11);
+        }
+        i += (j - 1);
+      } else if(j >= 3) /*repeat code for value other than zero*/ {
+        size_t k;
+        unsigned num = j / 6, rest = j % 6;
+        uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+        for(k = 0; k < num; ++k) {
+          uivector_push_back(&bitlen_lld_e, 16);
+          uivector_push_back(&bitlen_lld_e, 6 - 3);
+        }
+        if(rest >= 3) {
+          uivector_push_back(&bitlen_lld_e, 16);
+          uivector_push_back(&bitlen_lld_e, rest - 3);
+        }
+        else j -= rest;
+        i += j;
+      } else /*too short to benefit from repeat code*/ {
+        uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+      }
+    }
+
+    /*generate tree_cl, the huffmantree of huffmantrees*/
+
+    if(!uivector_resizev(&frequencies_cl, NUM_CODE_LENGTH_CODES, 0)) ERROR_BREAK(83 /*alloc fail*/);
+    for(i = 0; i != bitlen_lld_e.size; ++i) {
+      ++frequencies_cl.data[bitlen_lld_e.data[i]];
+      /*after a repeat code come the bits that specify the number of repetitions,
+      those don't need to be in the frequencies_cl calculation*/
+      if(bitlen_lld_e.data[i] >= 16) ++i;
+    }
+
+    error = HuffmanTree_makeFromFrequencies(&tree_cl, frequencies_cl.data,
+                                            frequencies_cl.size, frequencies_cl.size, 7);
+    if(error) break;
+
+    if(!uivector_resize(&bitlen_cl, tree_cl.numcodes)) ERROR_BREAK(83 /*alloc fail*/);
+    for(i = 0; i != tree_cl.numcodes; ++i) {
+      /*the lengths of the code length tree are stored in the order specified by deflate*/
+      bitlen_cl.data[i] = HuffmanTree_getLength(&tree_cl, CLCL_ORDER[i]);
+    }
+    while(bitlen_cl.data[bitlen_cl.size - 1] == 0 && bitlen_cl.size > 4) {
+      /*remove zeros at the end, but minimum size must be 4*/
+      if(!uivector_resize(&bitlen_cl, bitlen_cl.size - 1)) ERROR_BREAK(83 /*alloc fail*/);
+    }
+    if(error) break;
+
+    /*
+    Write everything into the output
+
+    After the BFINAL and BTYPE, the dynamic block consists out of the following:
+    - 5 bits HLIT, 5 bits HDIST, 4 bits HCLEN
+    - (HCLEN+4)*3 bits code lengths of code length alphabet
+    - HLIT + 257 code lengths of lit/length alphabet (encoded using the code length
+      alphabet, + possible repetition codes 16, 17, 18)
+    - HDIST + 1 code lengths of distance alphabet (encoded using the code length
+      alphabet, + possible repetition codes 16, 17, 18)
+    - compressed data
+    - 256 (end code)
+    */
+
+    /*Write block type*/
+    addBitToStream(bp, out, BFINAL);
+    addBitToStream(bp, out, 0); /*first bit of BTYPE "dynamic"*/
+    addBitToStream(bp, out, 1); /*second bit of BTYPE "dynamic"*/
+
+    /*write the HLIT, HDIST and HCLEN values*/
+    HLIT = (unsigned)(numcodes_ll - 257);
+    HDIST = (unsigned)(numcodes_d - 1);
+    HCLEN = (unsigned)bitlen_cl.size - 4;
+    /*trim zeroes for HCLEN. HLIT and HDIST were already trimmed at tree creation*/
+    while(!bitlen_cl.data[HCLEN + 4 - 1] && HCLEN > 0) --HCLEN;
+    addBitsToStream(bp, out, HLIT, 5);
+    addBitsToStream(bp, out, HDIST, 5);
+    addBitsToStream(bp, out, HCLEN, 4);
+
+    /*write the code lengths of the code length alphabet*/
+    for(i = 0; i != HCLEN + 4; ++i) addBitsToStream(bp, out, bitlen_cl.data[i], 3);
+
+    /*write the lengths of the lit/len AND the dist alphabet*/
+    for(i = 0; i != bitlen_lld_e.size; ++i) {
+      addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_cl, bitlen_lld_e.data[i]),
+                       HuffmanTree_getLength(&tree_cl, bitlen_lld_e.data[i]));
+      /*extra bits of repeat codes*/
+      if(bitlen_lld_e.data[i] == 16) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 2);
+      else if(bitlen_lld_e.data[i] == 17) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 3);
+      else if(bitlen_lld_e.data[i] == 18) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 7);
+    }
+
+    /*write the compressed data symbols*/
+    writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+    /*error: the length of the end code 256 must be larger than 0*/
+    if(HuffmanTree_getLength(&tree_ll, 256) == 0) ERROR_BREAK(64);
+
+    /*write the end code*/
+    addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+
+    break; /*end of error-while*/
+  }
+
+  /*cleanup*/
+  uivector_cleanup(&lz77_encoded);
+  HuffmanTree_cleanup(&tree_ll);
+  HuffmanTree_cleanup(&tree_d);
+  HuffmanTree_cleanup(&tree_cl);
+  uivector_cleanup(&frequencies_ll);
+  uivector_cleanup(&frequencies_d);
+  uivector_cleanup(&frequencies_cl);
+  uivector_cleanup(&bitlen_lld_e);
+  uivector_cleanup(&bitlen_lld);
+  uivector_cleanup(&bitlen_cl);
+
+  return error;
+}
+
+/*
+Writes data[datapos..dataend) to the output as one deflate block that is huffman
+coded with the fixed lit/len and distance trees.
+out: byte vector the bits are appended to
+bp: bit pointer into out, advanced as bits are written
+hash: hash structure handed on to encodeLZ77 (only used when settings->use_lz77 is set)
+settings: use_lz77 chooses between LZ77 + huffman coding and huffman coding of plain literals
+final: value written as the BFINAL bit of the block header
+Returns 0 on success, otherwise the error code reported by encodeLZ77.
+*/
+static unsigned deflateFixed(ucvector* out, size_t* bp, Hash* hash,
+                             const unsigned char* data,
+                             size_t datapos, size_t dataend,
+                             const LodePNGCompressSettings* settings, unsigned final) {
+  HuffmanTree litlen_tree; /*fixed tree for literal values and length codes*/
+  HuffmanTree dist_tree; /*fixed tree for distance codes*/
+  unsigned error = 0;
+
+  HuffmanTree_init(&litlen_tree);
+  HuffmanTree_init(&dist_tree);
+  generateFixedLitLenTree(&litlen_tree);
+  generateFixedDistanceTree(&dist_tree);
+
+  /*block header: BFINAL, then the two BTYPE bits*/
+  addBitToStream(bp, out, final);
+  addBitToStream(bp, out, 1);
+  addBitToStream(bp, out, 0);
+
+  if(!settings->use_lz77) {
+    /*no LZ77: every input byte is emitted as a huffman coded literal*/
+    size_t pos;
+    for(pos = datapos; pos < dataend; ++pos) {
+      addHuffmanSymbol(bp, out, HuffmanTree_getCode(&litlen_tree, data[pos]),
+                       HuffmanTree_getLength(&litlen_tree, data[pos]));
+    }
+  } else {
+    /*LZ77 encode first, then huffman code the resulting symbols*/
+    uivector symbols;
+    uivector_init(&symbols);
+    error = encodeLZ77(&symbols, hash, data, datapos, dataend, settings->windowsize,
+                       settings->minmatch, settings->nicematch, settings->lazymatching);
+    if(!error) {
+      writeLZ77data(bp, out, &symbols, &litlen_tree, &dist_tree);
+    }
+    uivector_cleanup(&symbols);
+  }
+
+  /*the block is terminated by the end code 256, unless encoding failed*/
+  if(!error) {
+    addHuffmanSymbol(bp, out, HuffmanTree_getCode(&litlen_tree, 256), HuffmanTree_getLength(&litlen_tree, 256));
+  }
+
+  HuffmanTree_cleanup(&litlen_tree);
+  HuffmanTree_cleanup(&dist_tree);
+
+  return error;
+}
+
+/*
+Deflates in[0..insize) into out, splitting the input into one or more blocks.
+settings->btype selects the block type: 0 is delegated to deflateNoCompression,
+1 uses deflateFixed with a single block, 2 uses deflateDynamic with blocks of
+65536..262144 bytes; any other value gives error 61.
+Returns 0 on success, otherwise a LodePNG error code.
+*/
+static unsigned lodepng_deflatev(ucvector* out, const unsigned char* in, size_t insize,
+                                 const LodePNGCompressSettings* settings) {
+  unsigned error = 0;
+  size_t i, blocksize, numdeflateblocks;
+  size_t bp = 0; /*the bit pointer*/
+  Hash hash;
+
+  if(settings->btype > 2) return 61;
+  else if(settings->btype == 0) return deflateNoCompression(out, in, insize);
+  else if(settings->btype == 1) {
+    /*everything goes in a single block. For empty input use a block size of 1
+    instead of 0, so that the block count computation below cannot divide by zero*/
+    blocksize = insize ? insize : 1;
+  } else /*if(settings->btype == 2)*/ {
+    /*on PNGs, deflate blocks of 65-262k seem to give most dense encoding*/
+    blocksize = insize / 8 + 8;
+    if(blocksize < 65536) blocksize = 65536;
+    if(blocksize > 262144) blocksize = 262144;
+  }
+
+  numdeflateblocks = (insize + blocksize - 1) / blocksize;
+  /*empty input still needs one (empty) block to form a valid stream*/
+  if(numdeflateblocks == 0) numdeflateblocks = 1;
+
+  error = hash_init(&hash, settings->windowsize);
+  if(error) return error;
+
+  for(i = 0; i != numdeflateblocks && !error; ++i) {
+    unsigned final = (i == numdeflateblocks - 1);
+    size_t start = i * blocksize;
+    size_t end = start + blocksize;
+    if(end > insize) end = insize;
+
+    if(settings->btype == 1) error = deflateFixed(out, &bp, &hash, in, start, end, settings, final);
+    else if(settings->btype == 2) error = deflateDynamic(out, &bp, &hash, in, start, end, settings, final);
+  }
+
+  hash_cleanup(&hash);
+
+  return error;
+}
+
+/*
+Deflates in[0..insize) and appends the result to the buffer described by
+*out / *outsize, which are updated to the (possibly reallocated) buffer and its
+new size. Returns the error code of lodepng_deflatev.
+*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+                         const unsigned char* in, size_t insize,
+                         const LodePNGCompressSettings* settings) {
+  ucvector buffer;
+  unsigned result;
+
+  ucvector_init_buffer(&buffer, *out, *outsize);
+  result = lodepng_deflatev(&buffer, in, insize, settings);
+
+  /*hand the buffer back to the caller, also when an error occurred*/
+  *outsize = buffer.size;
+  *out = buffer.data;
+  return result;
+}
+
+/*deflate using the custom function from the settings if one is set, the built-in lodepng_deflate otherwise*/
+static unsigned deflate(unsigned char** out, size_t* outsize,
+                        const unsigned char* in, size_t insize,
+                        const LodePNGCompressSettings* settings) {
+  if(!settings->custom_deflate) {
+    return lodepng_deflate(out, outsize, in, insize, settings);
+  }
+  return settings->custom_deflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Adler32                                                                  */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Continues an adler32 computation: adler is the checksum so far (1 for a fresh
+checksum), data[0..len-1] are the bytes to add. Returns the updated checksum,
+with the second sum in the upper 16 bits and the first sum in the lower 16 bits.
+*/
+static unsigned update_adler32(unsigned adler, const unsigned char* data, unsigned len) {
+  unsigned low = adler & 0xffff;
+  unsigned high = (adler >> 16) & 0xffff;
+  const unsigned char* cursor = data;
+  unsigned remaining = len;
+
+  while(remaining != 0) {
+    /*at least 5552 sums can be done before the sums overflow, saving a lot of modulo divisions*/
+    unsigned run = remaining;
+    unsigned k;
+    if(run > 5552) run = 5552;
+    remaining -= run;
+
+    for(k = 0; k != run; ++k) {
+      low += cursor[k];
+      high += low;
+    }
+    cursor += run;
+
+    low %= 65521;
+    high %= 65521;
+  }
+
+  return (high << 16) | low;
+}
+
+/*Return the adler32 of the bytes data[0..len-1], i.e. update_adler32 started from the initial value 1*/
+static unsigned adler32(const unsigned char* data, unsigned len) {
+  const unsigned initial = 1u;
+  return update_adler32(initial, data, len);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Zlib                                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Decompresses zlib data: a 2 byte header (CMF, FLG), the deflate stream, and a
+4 byte ADLER32 checksum of the decompressed data.
+out, outsize: passed on to inflate, which stores the decompressed data there
+in, insize: the zlib stream
+settings: ignore_adler32 skips the checksum verification
+Returns 0 on success. Error codes: 53 input too small, 24 bad FCHECK, 25
+unsupported compression method or window size, 26 preset dictionary requested,
+58 checksum mismatch, or the error code returned by inflate.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                                 size_t insize, const LodePNGDecompressSettings* settings) {
+  unsigned error = 0;
+  unsigned CM, CINFO, FDICT;
+
+  if(insize < 2) return 53; /*error, size of zlib data too small*/
+  /*read information from zlib header*/
+  if((in[0] * 256 + in[1]) % 31 != 0) {
+    /*error: 256 * in[0] + in[1] must be a multiple of 31, the FCHECK value is supposed to be made that way*/
+    return 24;
+  }
+
+  CM = in[0] & 15;
+  CINFO = (in[0] >> 4) & 15;
+  /*FCHECK = in[1] & 31;*/ /*FCHECK is already tested above*/
+  FDICT = (in[1] >> 5) & 1;
+  /*FLEVEL = (in[1] >> 6) & 3;*/ /*FLEVEL is not used here*/
+
+  if(CM != 8 || CINFO > 7) {
+    /*error: only compression method 8: inflate with sliding window of 32k is supported by the PNG spec*/
+    return 25;
+  }
+  if(FDICT != 0) {
+    /*error: the specification of PNG says about the zlib stream:
+      "The additional flags shall not specify a preset dictionary."*/
+    return 26;
+  }
+
+  error = inflate(out, outsize, in + 2, insize - 2, settings);
+  if(error) return error;
+
+  if(!settings->ignore_adler32) {
+    unsigned ADLER32, checksum;
+    /*the 4 checksum bytes must exist after the 2 header bytes: without this check
+    in[insize - 4] would index before the buffer (insize < 4) or re-read header bytes*/
+    if(insize < 6) return 53; /*error, size of zlib data too small*/
+    ADLER32 = lodepng_read32bitInt(&in[insize - 4]);
+    checksum = adler32(*out, (unsigned)(*outsize));
+    if(checksum != ADLER32) return 58; /*error, adler checksum not correct, data must be corrupted*/
+  }
+
+  return 0; /*no error*/
+}
+
+/*decompress using the custom zlib function from the settings if one is set, lodepng_zlib_decompress otherwise*/
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                                size_t insize, const LodePNGDecompressSettings* settings) {
+  if(!settings->custom_zlib) {
+    return lodepng_zlib_decompress(out, outsize, in, insize, settings);
+  }
+  return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*
+Compresses in[0..insize) to zlib format and appends it to the buffer given by
+*out / *outsize: 2 header bytes, the deflate data, and the 4 byte ADLER32
+checksum of the uncompressed input.
+Initially, *out must be NULL and *outsize 0 (or a buffer allocated by this
+library); a random *out pointing to a non allocated buffer will crash.
+*out and *outsize are updated to the resulting buffer, also on error.
+Returns 0 on success, otherwise the error code returned by deflate.
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                               size_t insize, const LodePNGCompressSettings* settings) {
+  ucvector outv;
+  size_t i;
+  unsigned error;
+  unsigned char* deflatedata = 0;
+  size_t deflatesize = 0;
+
+  /*zlib data: 1 byte CMF (CM+CINFO), 1 byte FLG, deflate data, 4 byte ADLER32 checksum of the Decompressed data*/
+  unsigned CMF = 120; /*0b01111000: CM 8, CINFO 7. With CINFO 7, any window size up to 32768 can be used.*/
+  unsigned FLEVEL = 0;
+  unsigned FDICT = 0;
+  unsigned CMFFLG = 256 * CMF + FDICT * 32 + FLEVEL * 64;
+  unsigned FCHECK = 31 - CMFFLG % 31; /*makes CMFFLG a multiple of 31*/
+  CMFFLG += FCHECK;
+
+  /*ucvector-controlled version of the output buffer, for dynamic array*/
+  ucvector_init_buffer(&outv, *out, *outsize);
+
+  ucvector_push_back(&outv, (unsigned char)(CMFFLG >> 8));
+  ucvector_push_back(&outv, (unsigned char)(CMFFLG & 255));
+
+  error = deflate(&deflatedata, &deflatesize, in, insize, settings);
+
+  if(!error) {
+    unsigned ADLER32 = adler32(in, (unsigned)insize);
+    for(i = 0; i != deflatesize; ++i) ucvector_push_back(&outv, deflatedata[i]);
+    lodepng_add32bitInt(&outv, ADLER32);
+  }
+
+  /*deflate may have allocated (part of) its output before failing, so the
+  temporary buffer is released on the error path too instead of leaking it*/
+  lodepng_free(deflatedata);
+
+  *out = outv.data;
+  *outsize = outv.size;
+
+  return error;
+}
+
+/* compress with the custom zlib function from the settings if one is set, lodepng_zlib_compress otherwise */
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                              size_t insize, const LodePNGCompressSettings* settings) {
+  if(!settings->custom_zlib) {
+    return lodepng_zlib_compress(out, outsize, in, insize, settings);
+  }
+  return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#else /*no LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*variant for builds without the built-in zlib code: only a custom zlib function can decompress*/
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                                size_t insize, const LodePNGDecompressSettings* settings) {
+  if(settings->custom_zlib) {
+    return settings->custom_zlib(out, outsize, in, insize, settings);
+  }
+  return 87; /*no custom zlib function provided */
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+/*variant for builds without the built-in zlib code: only a custom zlib function can compress*/
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                              size_t insize, const LodePNGCompressSettings* settings) {
+  if(settings->custom_zlib) {
+    return settings->custom_zlib(out, outsize, in, insize, settings);
+  }
+  return 87; /*no custom zlib function provided */
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*Default LZ77 window size, used by lodepng_compress_settings_init and
+lodepng_default_compress_settings.
+this is a good tradeoff between speed and compression ratio*/
+#define DEFAULT_WINDOWSIZE 2048
+
+/*
+Sets every field of settings to its default: dynamic huffman blocks, LZ77 with
+the default window size, and no custom compression callbacks.
+*/
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings) {
+  /*no user supplied compression functions or context*/
+  settings->custom_context = 0;
+  settings->custom_deflate = 0;
+  settings->custom_zlib = 0;
+
+  /*LZ77 parameters*/
+  settings->use_lz77 = 1;
+  settings->windowsize = DEFAULT_WINDOWSIZE;
+  settings->minmatch = 3;
+  settings->nicematch = 128;
+  settings->lazymatching = 1;
+
+  /*compress with dynamic huffman tree (not in the mathematical sense, just not the predefined one)*/
+  settings->btype = 2;
+}
+
+/*Constant holding the default compression settings. The values are the same ones, listed in the same
+order, that lodepng_compress_settings_init assigns (btype 2, use_lz77 1, DEFAULT_WINDOWSIZE, minmatch 3,
+nicematch 128, lazymatching 1, no custom functions or context).
+NOTE(review): positional initializer, so it relies on the field order of LodePNGCompressSettings,
+which is declared elsewhere - confirm when the struct changes.*/
+const LodePNGCompressSettings lodepng_default_compress_settings = {2, 1, DEFAULT_WINDOWSIZE, 3, 128, 1, 0, 0, 0};
+
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Sets every field of settings to its default: the adler32 checksum is verified
+and no custom decompression callbacks are installed.
+*/
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings) {
+  /*no user supplied decompression functions or context*/
+  settings->custom_context = 0;
+  settings->custom_inflate = 0;
+  settings->custom_zlib = 0;
+
+  /*0 = do check the adler32 checksum*/
+  settings->ignore_adler32 = 0;
+}
+
+/*Constant holding the default decompression settings: all zero, the same values that
+lodepng_decompress_settings_init assigns (ignore_adler32 and the custom function/context fields).
+NOTE(review): positional initializer, relies on LodePNGDecompressSettings having these four
+fields - the struct is declared elsewhere, confirm when it changes.*/
+const LodePNGDecompressSettings lodepng_default_decompress_settings = {0, 0, 0, 0};
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of Zlib related code. Begin of PNG related code.                 // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / CRC32                                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+
+#ifndef LODEPNG_NO_COMPILE_CRC
+/*
+Lookup table for the byte-wise CRC32 computation in lodepng_crc32.
+CRC polynomial: 0xedb88320
+Entry n is the value obtained by running n through 8 shift/xor steps with that
+polynomial. The table is only read, so it is declared const.
+*/
+static const unsigned lodepng_crc32_table[256] = {
+           0u, 1996959894u, 3993919788u, 2567524794u,  124634137u, 1886057615u, 3915621685u, 2657392035u,
+   249268274u, 2044508324u, 3772115230u, 2547177864u,  162941995u, 2125561021u, 3887607047u, 2428444049u,
+   498536548u, 1789927666u, 4089016648u, 2227061214u,  450548861u, 1843258603u, 4107580753u, 2211677639u,
+   325883990u, 1684777152u, 4251122042u, 2321926636u,  335633487u, 1661365465u, 4195302755u, 2366115317u,
+   997073096u, 1281953886u, 3579855332u, 2724688242u, 1006888145u, 1258607687u, 3524101629u, 2768942443u,
+   901097722u, 1119000684u, 3686517206u, 2898065728u,  853044451u, 1172266101u, 3705015759u, 2882616665u,
+   651767980u, 1373503546u, 3369554304u, 3218104598u,  565507253u, 1454621731u, 3485111705u, 3099436303u,
+   671266974u, 1594198024u, 3322730930u, 2970347812u,  795835527u, 1483230225u, 3244367275u, 3060149565u,
+  1994146192u,   31158534u, 2563907772u, 4023717930u, 1907459465u,  112637215u, 2680153253u, 3904427059u,
+  2013776290u,  251722036u, 2517215374u, 3775830040u, 2137656763u,  141376813u, 2439277719u, 3865271297u,
+  1802195444u,  476864866u, 2238001368u, 4066508878u, 1812370925u,  453092731u, 2181625025u, 4111451223u,
+  1706088902u,  314042704u, 2344532202u, 4240017532u, 1658658271u,  366619977u, 2362670323u, 4224994405u,
+  1303535960u,  984961486u, 2747007092u, 3569037538u, 1256170817u, 1037604311u, 2765210733u, 3554079995u,
+  1131014506u,  879679996u, 2909243462u, 3663771856u, 1141124467u,  855842277u, 2852801631u, 3708648649u,
+  1342533948u,  654459306u, 3188396048u, 3373015174u, 1466479909u,  544179635u, 3110523913u, 3462522015u,
+  1591671054u,  702138776u, 2966460450u, 3352799412u, 1504918807u,  783551873u, 3082640443u, 3233442989u,
+  3988292384u, 2596254646u,   62317068u, 1957810842u, 3939845945u, 2647816111u,   81470997u, 1943803523u,
+  3814918930u, 2489596804u,  225274430u, 2053790376u, 3826175755u, 2466906013u,  167816743u, 2097651377u,
+  4027552580u, 2265490386u,  503444072u, 1762050814u, 4150417245u, 2154129355u,  426522225u, 1852507879u,
+  4275313526u, 2312317920u,  282753626u, 1742555852u, 4189708143u, 2394877945u,  397917763u, 1622183637u,
+  3604390888u, 2714866558u,  953729732u, 1340076626u, 3518719985u, 2797360999u, 1068828381u, 1219638859u,
+  3624741850u, 2936675148u,  906185462u, 1090812512u, 3747672003u, 2825379669u,  829329135u, 1181335161u,
+  3412177804u, 3160834842u,  628085408u, 1382605366u, 3423369109u, 3138078467u,  570562233u, 1426400815u,
+  3317316542u, 2998733608u,  733239954u, 1555261956u, 3268935591u, 3050360625u,  752459403u, 1541320221u,
+  2607071920u, 3965973030u, 1969922972u,   40735498u, 2617837225u, 3943577151u, 1913087877u,   83908371u,
+  2512341634u, 3803740692u, 2075208622u,  213261112u, 2463272603u, 3855990285u, 2094854071u,  198958881u,
+  2262029012u, 4057260610u, 1759359992u,  534414190u, 2176718541u, 4139329115u, 1873836001u,  414664567u,
+  2282248934u, 4279200368u, 1711684554u,  285281116u, 2405801727u, 4167216745u, 1634467795u,  376229701u,
+  2685067896u, 3608007406u, 1308918612u,  956543938u, 2808555105u, 3495958263u, 1231636301u, 1047427035u,
+  2932959818u, 3654703836u, 1088359270u,  936918000u, 2847714899u, 3736837829u, 1202900863u,  817233897u,
+  3183342108u, 3401237130u, 1404277552u,  615818150u, 3134207493u, 3453421203u, 1423857449u,  601450431u,
+  3009837614u, 3294710456u, 1567103746u,  711928724u, 3020668471u, 3272380065u, 1510334235u,  755167117u
+};
+
+/*Return the CRC of the bytes data[0..length-1], computed one byte at a time with lodepng_crc32_table.*/
+unsigned lodepng_crc32(const unsigned char* data, size_t length) {
+  unsigned crc = 0xffffffffu;
+  size_t pos = 0;
+  while(pos < length) {
+    unsigned index = (crc ^ data[pos]) & 0xff;
+    crc = (crc >> 8) ^ lodepng_crc32_table[index];
+    ++pos;
+  }
+  /*the final CRC is the inverted register*/
+  return crc ^ 0xffffffffu;
+}
+#else /* !LODEPNG_NO_COMPILE_CRC */
+unsigned lodepng_crc32(const unsigned char* data, size_t length);
+#endif /* !LODEPNG_NO_COMPILE_CRC */
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Reading and writing single bits and bytes from/to stream for LodePNG   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Reads the bit at bit position *bitpointer from bitstream, where bit 0 of the
+stream is the most significant bit of byte 0, and advances *bitpointer by one.
+Returns the bit as 0 or 1.
+*/
+static unsigned char readBitFromReversedStream(size_t* bitpointer, const unsigned char* bitstream) {
+  size_t pos = *bitpointer;
+  unsigned char byte = bitstream[pos >> 3];
+  unsigned shift = 7 - (unsigned)(pos & 0x7);
+  *bitpointer = pos + 1;
+  return (unsigned char)((byte >> shift) & 1);
+}
+
+/*
+Reads nbits bits from bitstream starting at bit position *bitpointer and returns
+them as a number whose most significant bit is the first bit read.
+*bitpointer is advanced by nbits.
+*/
+static unsigned readBitsFromReversedStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits) {
+  unsigned value = 0;
+  size_t remaining;
+  for(remaining = nbits; remaining != 0; --remaining) {
+    value = (value << 1) | (unsigned)readBitFromReversedStream(bitpointer, bitstream);
+  }
+  return value;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Writes a bit at bit position *bitpointer (bit 0 is the most significant bit of
+byte 0) and advances *bitpointer by one. A set bit is only OR-ed in, so the
+current bit in bitstream must be 0 for this to work.
+*/
+static void setBitOfReversedStream0(size_t* bitpointer, unsigned char* bitstream, unsigned char bit) {
+  size_t pos = *bitpointer;
+  if(bit != 0) {
+    /*earlier bit of huffman code is in a lesser significant bit of an earlier byte*/
+    bitstream[pos >> 3] |= (bit << (7 - (pos & 0x7)));
+  }
+  *bitpointer = pos + 1;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/*
+Sets or clears the bit at bit position *bitpointer (bit 0 is the most
+significant bit of byte 0) and advances *bitpointer by one.
+The current bit in bitstream may be 0 or 1 for this to work.
+*/
+static void setBitOfReversedStream(size_t* bitpointer, unsigned char* bitstream, unsigned char bit) {
+  size_t pos = *bitpointer;
+  unsigned char mask = (unsigned char)(1 << (7 - (pos & 0x7)));
+  unsigned char* target = &bitstream[pos >> 3];
+  if(bit != 0) *target |= mask;
+  else *target &= (unsigned char)(~mask);
+  *bitpointer = pos + 1;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG chunks                                                             / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Returns the value of the 32-bit length field stored in the first 4 bytes of the chunk.*/
+unsigned lodepng_chunk_length(const unsigned char* chunk) {
+  unsigned length = lodepng_read32bitInt(chunk);
+  return length;
+}
+
+/*Copies the 4 type bytes of the chunk (chunk[4..7]) into type and null terminates it.*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk) {
+  const unsigned char* name = chunk + 4;
+  type[0] = (char)name[0];
+  type[1] = (char)name[1];
+  type[2] = (char)name[2];
+  type[3] = (char)name[3];
+  type[4] = 0; /*null termination char*/
+}
+
+/*
+Returns 1 if type is exactly 4 characters long and equal to the 4 type bytes of
+the chunk (chunk[4..7]), 0 otherwise.
+*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type) {
+  unsigned k;
+  if(strlen(type) != 4) return 0;
+  for(k = 0; k != 4; ++k) {
+    if(chunk[4 + k] != type[k]) return 0;
+  }
+  return 1;
+}
+
+/*Returns 1 if bit 5 (value 32) of the first type byte, chunk[4], is set, 0 otherwise.*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk) {
+  return (unsigned char)((chunk[4] >> 5) & 1);
+}
+
+/*Returns 1 if bit 5 (value 32) of the third type byte, chunk[6], is set, 0 otherwise.*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk) {
+  return (unsigned char)((chunk[6] >> 5) & 1);
+}
+
+/*Returns 1 if bit 5 (value 32) of the fourth type byte, chunk[7], is set, 0 otherwise.*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk) {
+  return (unsigned char)((chunk[7] >> 5) & 1);
+}
+
+/*Returns a pointer to the data of the chunk, which starts 8 bytes in (after the length and type fields).*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk) {
+  return chunk + 8;
+}
+
+/*Const variant: returns a pointer to the data of the chunk, which starts 8 bytes in (after the length and type fields).*/
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk) {
+  return chunk + 8;
+}
+
+/*
+Compares the CRC stored after the chunk data with the CRC computed over the
+chunk. Returns 0 if they match, 1 if they differ.
+*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk) {
+  unsigned length = lodepng_chunk_length(chunk);
+  unsigned stored = lodepng_read32bitInt(&chunk[length + 8]);
+  /*the CRC is taken of the data and the 4 chunk type letters, not the length*/
+  unsigned computed = lodepng_crc32(&chunk[4], length + 4);
+  return (stored != computed) ? 1 : 0;
+}
+
+/*
+Computes the CRC over the 4 type bytes and the data of the chunk, and stores it
+in the 4 bytes that follow the data. The length field must already be filled in.
+*/
+void lodepng_chunk_generate_crc(unsigned char* chunk) {
+  unsigned length = lodepng_chunk_length(chunk);
+  unsigned char* crc_field = chunk + 8 + length;
+  unsigned crc = lodepng_crc32(&chunk[4], length + 4);
+  lodepng_set32bitInt(crc_field, crc);
+}
+
+/*
+Returns a pointer to what follows the given position: if chunk points at the
+8 byte PNG signature, the first chunk right after it; otherwise the position
+after this chunk (its length field + 12 bytes for length, type and CRC).
+The result is not checked against the end of the buffer; the caller must do so.
+*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk) {
+  if(chunk[0] == 0x89 && chunk[1] == 0x50 && chunk[2] == 0x4e && chunk[3] == 0x47
+    && chunk[4] == 0x0d && chunk[5] == 0x0a && chunk[6] == 0x1a && chunk[7] == 0x0a) {
+    /* Is PNG magic header at start of PNG file. Jump to first actual chunk. */
+    return chunk + 8;
+  } else {
+    /*widen to size_t before adding 12: in unsigned arithmetic a corrupt length
+    close to the maximum wraps around to a tiny total, so that walking the chunks
+    would not advance. This helps where size_t is wider than unsigned.*/
+    size_t total_chunk_length = (size_t)lodepng_chunk_length(chunk) + 12;
+    return chunk + total_chunk_length;
+  }
+}
+
+/*
+Const variant. Returns a pointer to what follows the given position: if chunk
+points at the 8 byte PNG signature, the first chunk right after it; otherwise
+the position after this chunk (its length field + 12 bytes for length, type and CRC).
+The result is not checked against the end of the buffer; the caller must do so.
+*/
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk) {
+  if(chunk[0] == 0x89 && chunk[1] == 0x50 && chunk[2] == 0x4e && chunk[3] == 0x47
+    && chunk[4] == 0x0d && chunk[5] == 0x0a && chunk[6] == 0x1a && chunk[7] == 0x0a) {
+    /* Is PNG magic header at start of PNG file. Jump to first actual chunk. */
+    return chunk + 8;
+  } else {
+    /*widen to size_t before adding 12: in unsigned arithmetic a corrupt length
+    close to the maximum wraps around to a tiny total, so that walking the chunks
+    would not advance. This helps where size_t is wider than unsigned.*/
+    size_t total_chunk_length = (size_t)lodepng_chunk_length(chunk) + 12;
+    return chunk + total_chunk_length;
+  }
+}
+
+/*
+Walks the chunks starting at chunk and returns the first one whose type equals
+type (a 4 letter name), or 0 if none is found before end.
+NOTE(review): end is treated as pointing one past the last byte of the buffer - confirm against callers.
+*/
+unsigned char* lodepng_chunk_find(unsigned char* chunk, const unsigned char* end, const char type[5]) {
+  for(;;) {
+    /*a chunk needs at least 12 bytes (4 length + 4 type + 4 CRC). Comparing the
+    remaining size, instead of chunk + 12 >= end, accepts a last chunk of exactly
+    12 bytes (such as one without data) and does not form a pointer past the buffer*/
+    if(chunk >= end || end - chunk < 12) return 0;
+    if(lodepng_chunk_type_equals(chunk, type)) return chunk;
+    chunk = lodepng_chunk_next(chunk);
+  }
+}
+
+/*
+Const variant. Walks the chunks starting at chunk and returns the first one whose
+type equals type (a 4 letter name), or 0 if none is found before end.
+NOTE(review): end is treated as pointing one past the last byte of the buffer - confirm against callers.
+*/
+const unsigned char* lodepng_chunk_find_const(const unsigned char* chunk, const unsigned char* end, const char type[5]) {
+  for(;;) {
+    /*a chunk needs at least 12 bytes (4 length + 4 type + 4 CRC). Comparing the
+    remaining size, instead of chunk + 12 >= end, accepts a last chunk of exactly
+    12 bytes (such as one without data) and does not form a pointer past the buffer*/
+    if(chunk >= end || end - chunk < 12) return 0;
+    if(lodepng_chunk_type_equals(chunk, type)) return chunk;
+    chunk = lodepng_chunk_next_const(chunk);
+  }
+}
+
+/*
+Appends a copy of the complete chunk (length, type, data and CRC) to the buffer
+*out of size *outlength. The buffer is reallocated and *out and *outlength are
+updated. Returns 0 on success, 77 on integer overflow of the new size, 83 if
+the allocation fails (in which case *out and *outlength are left unchanged).
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk) {
+  unsigned total_chunk_length = lodepng_chunk_length(chunk) + 12;
+  size_t old_length = *outlength;
+  size_t new_length = old_length + total_chunk_length;
+  unsigned char* grown;
+  unsigned char* dest;
+  unsigned k;
+
+  if(new_length < total_chunk_length || new_length < old_length) return 77; /*integer overflow happened*/
+
+  grown = (unsigned char*)lodepng_realloc(*out, new_length);
+  if(!grown) return 83; /*alloc fail*/
+  *out = grown;
+  *outlength = new_length;
+
+  /*the new chunk goes where the old buffer ended*/
+  dest = grown + old_length;
+  for(k = 0; k != total_chunk_length; ++k) dest[k] = chunk[k];
+
+  return 0;
+}
+
+/*
+Appends a newly built chunk to the buffer *out of size *outlength: the length
+field, the 4 letter type, length bytes copied from data, and the generated CRC.
+The buffer is reallocated and *out and *outlength are updated.
+Returns 0 on success, 77 on integer overflow of the new size, 83 if the
+allocation fails (in which case *out and *outlength are left unchanged).
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+                              const char* type, const unsigned char* data) {
+  size_t old_length = *outlength;
+  size_t new_length = old_length + length + 12;
+  unsigned char* grown;
+  unsigned char* chunk;
+  unsigned k;
+
+  if(new_length < length + 12 || new_length < old_length) return 77; /*integer overflow happened*/
+
+  grown = (unsigned char*)lodepng_realloc(*out, new_length);
+  if(!grown) return 83; /*alloc fail*/
+  *out = grown;
+  *outlength = new_length;
+
+  /*the new chunk starts where the old buffer ended*/
+  chunk = grown + old_length;
+
+  /*1: length*/
+  lodepng_set32bitInt(chunk, (unsigned)length);
+
+  /*2: chunk name (4 letters)*/
+  for(k = 0; k != 4; ++k) chunk[4 + k] = (unsigned char)type[k];
+
+  /*3: the data*/
+  for(k = 0; k != length; ++k) chunk[8 + k] = data[k];
+
+  /*4: CRC (of the chunkname characters and the data)*/
+  lodepng_chunk_generate_crc(chunk);
+
+  return 0;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Color types and such                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Checks whether the combination of color type and bit depth (bd) is allowed.
+The return type is a LodePNG error code: 0 if allowed, 37 if the bit depth is
+not allowed for this color type, 31 if the color type itself is not one of 0, 2, 3, 4, 6.
+*/
+static unsigned checkColorValidity(LodePNGColorType colortype, unsigned bd) /*bd = bitdepth*/ {
+  unsigned sub_byte = (bd == 1 || bd == 2 || bd == 4);
+  unsigned is8 = (bd == 8);
+  unsigned is16 = (bd == 16);
+
+  switch(colortype) {
+    case 0: /*gray: 1, 2, 4, 8 or 16*/
+      return (sub_byte || is8 || is16) ? 0 : 37;
+    case 3: /*palette: 1, 2, 4 or 8*/
+      return (sub_byte || is8) ? 0 : 37;
+    case 2: /*RGB*/
+    case 4: /*gray + alpha*/
+    case 6: /*RGBA: 8 or 16*/
+      return (is8 || is16) ? 0 : 37;
+    default:
+      return 31;
+  }
+}
+
+/*Returns the amount of channels per pixel of the color type, or 0 if the color type does not exist.*/
+static unsigned getNumColorChannels(LodePNGColorType colortype) {
+  unsigned channels;
+  switch(colortype) {
+    case 0: /*gray*/
+    case 3: /*palette*/
+      channels = 1;
+      break;
+    case 4: /*gray + alpha*/
+      channels = 2;
+      break;
+    case 2: /*RGB*/
+      channels = 3;
+      break;
+    case 6: /*RGBA*/
+      channels = 4;
+      break;
+    default: /*unexisting color type*/
+      channels = 0;
+      break;
+  }
+  return channels;
+}
+
+/*Bits per pixel for a color type / bit depth pair: channel count times bits per channel.
+Yields 0 when getNumColorChannels does not know the color type.*/
+static unsigned lodepng_get_bpp_lct(LodePNGColorType colortype, unsigned bitdepth) {
+  unsigned channels = getNumColorChannels(colortype);
+  return channels * bitdepth;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Sets a color mode to its defaults: 8-bit RGBA, no color key, no palette.
+Does not free anything, so it must not be used on a mode that owns a palette.*/
+void lodepng_color_mode_init(LodePNGColorMode* info) {
+  /*pixel format*/
+  info->colortype = LCT_RGBA;
+  info->bitdepth = 8;
+  /*empty palette*/
+  info->palette = 0;
+  info->palettesize = 0;
+  /*no color key*/
+  info->key_defined = 0;
+  info->key_r = 0;
+  info->key_g = 0;
+  info->key_b = 0;
+}
+
+/*Releases what a color mode owns. The only owned resource handled here is the
+palette, which is freed and reset through lodepng_palette_clear.*/
+void lodepng_color_mode_cleanup(LodePNGColorMode* info) {
+  lodepng_palette_clear(info);
+}
+
+/*Deep-copies source into dest. dest is cleaned up first, so it must be initialized.
+All plain fields are copied by struct assignment; when source has a palette, dest gets
+its own 1024-byte buffer (256 RGBA entries) holding a copy of the used entries.
+Returns 0 on success or 83 when the palette buffer could not be allocated. In the
+failure case dest ends up with an empty palette (NULL pointer and palettesize 0), so
+it never advertises entries it does not have.*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source) {
+  size_t i;
+  lodepng_color_mode_cleanup(dest);
+  *dest = *source;
+  if(source->palette) {
+    dest->palette = (unsigned char*)lodepng_malloc(1024);
+    if(!dest->palette) {
+      /*the struct assignment copied source's palettesize; keep dest consistent with its NULL palette*/
+      dest->palettesize = 0;
+      if(source->palettesize) return 83; /*alloc fail*/
+      return 0; /*nothing to copy anyway*/
+    }
+    for(i = 0; i != source->palettesize * 4; ++i) dest->palette[i] = source->palette[i];
+  }
+  return 0;
+}
+
+/*Builds a color mode by value: defaults from lodepng_color_mode_init with the
+given color type and bit depth filled in (no palette, no color key).*/
+LodePNGColorMode lodepng_color_mode_make(LodePNGColorType colortype, unsigned bitdepth) {
+  LodePNGColorMode mode;
+  lodepng_color_mode_init(&mode);
+  mode.bitdepth = bitdepth;
+  mode.colortype = colortype;
+  return mode;
+}
+
+/*Returns 1 when both color modes describe the same pixel format, 0 otherwise.
+Compared: color type, bit depth, color key (the key values only when a key is defined)
+and the palette, byte by byte over palettesize * 4 bytes.*/
+static int lodepng_color_mode_equal(const LodePNGColorMode* a, const LodePNGColorMode* b) {
+  size_t k;
+  size_t palette_bytes;
+
+  if(a->colortype != b->colortype || a->bitdepth != b->bitdepth) return 0;
+
+  if(a->key_defined != b->key_defined) return 0;
+  if(a->key_defined && (a->key_r != b->key_r || a->key_g != b->key_g || a->key_b != b->key_b)) return 0;
+
+  if(a->palettesize != b->palettesize) return 0;
+  palette_bytes = a->palettesize * 4;
+  for(k = 0; k < palette_bytes; ++k) {
+    if(a->palette[k] != b->palette[k]) return 0;
+  }
+
+  return 1;
+}
+
+/*Frees the palette buffer if there is one and resets the mode to an empty
+palette (NULL pointer, size 0), so the call can be repeated safely.*/
+void lodepng_palette_clear(LodePNGColorMode* info) {
+  if(info->palette) lodepng_free(info->palette);
+  info->palette = 0;
+  info->palettesize = 0;
+}
+
+/*Appends one RGBA entry to the palette of info.
+The palette buffer is allocated on first use with a fixed size of 1024 bytes, which is
+exactly enough for the maximum of 256 entries of 4 bytes each; it never grows.
+Returns 0 on success, 83 if the buffer could not be allocated, and 108 if the palette
+already holds 256 entries (adding one more would write past the end of the buffer).*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+                             unsigned char r, unsigned char g, unsigned char b, unsigned char a) {
+  unsigned char* data;
+  if(!info->palette) /*allocate palette if empty*/ {
+    /*room for 256 colors with 4 bytes each*/
+    data = (unsigned char*)lodepng_realloc(info->palette, 1024);
+    if(!data) return 83; /*alloc fail*/
+    else info->palette = data;
+  }
+  if(info->palettesize >= 256) return 108; /*too many palette values*/
+  info->palette[4 * info->palettesize + 0] = r;
+  info->palette[4 * info->palettesize + 1] = g;
+  info->palette[4 * info->palettesize + 2] = b;
+  info->palette[4 * info->palettesize + 3] = a;
+  ++info->palettesize;
+  return 0;
+}
+
+/*Calculates bits per pixel out of the colortype and bitdepth stored in info
+(number of channels times bits per channel, see lodepng_get_bpp_lct).*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info) {
+  return lodepng_get_bpp_lct(info->colortype, info->bitdepth);
+}
+
+/*Returns the number of channels per pixel of the color type in info
+(0 if getNumColorChannels does not know the color type).*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info) {
+  return getNumColorChannels(info->colortype);
+}
+
+/*Returns 1 if the color type is LCT_GREY or LCT_GREY_ALPHA, 0 otherwise.*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info) {
+  return info->colortype == LCT_GREY || info->colortype == LCT_GREY_ALPHA;
+}
+
+/*Returns 1 if the color type value has bit 4 set (color types 4 and 6), 0 otherwise.
+Only the color type is looked at; palette alpha and color keys are not considered here.*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info) {
+  return (info->colortype & 4) != 0; /*4 or 6*/
+}
+
+/*Returns 1 if the color type is LCT_PALETTE, 0 otherwise.*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info) {
+  return info->colortype == LCT_PALETTE;
+}
+
+/*Returns 1 if at least one palette entry has an alpha byte below 255, 0 otherwise
+(also 0 for an empty palette). The color type is not checked.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info) {
+  size_t entry = 0;
+  while(entry != info->palettesize) {
+    const unsigned char* rgba = &info->palette[entry * 4];
+    if(rgba[3] < 255) return 1;
+    ++entry;
+  }
+  return 0;
+}
+
+/*Returns 1 if pixels in this mode can be non-opaque: a color key is defined, the
+color type has an alpha channel, or the palette has an entry with alpha below 255.
+Returns 0 otherwise. The checks run in that order and stop at the first hit.*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info) {
+  if(info->key_defined) return 1;
+  if(lodepng_is_alpha_type(info)) return 1;
+  if(lodepng_has_palette_alpha(info)) return 1;
+  return 0;
+}
+
+/*Size in bytes of a raw image buffer of w * h pixels in the given format, with the
+total bit count rounded up to a whole byte. The pixel count is split into groups
+of 8 pixels plus a remainder so the bit count itself is never formed.*/
+size_t lodepng_get_raw_size_lct(unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth) {
+  size_t bits_per_pixel = lodepng_get_bpp_lct(colortype, bitdepth);
+  size_t pixels = (size_t)w * (size_t)h;
+  /*every complete group of 8 pixels takes exactly bits_per_pixel bytes*/
+  size_t group_bytes = (pixels / 8) * bits_per_pixel;
+  /*the remaining 0..7 pixels, rounded up to whole bytes*/
+  size_t rest_bytes = ((pixels & 7) * bits_per_pixel + 7) / 8;
+  return group_bytes + rest_bytes;
+}
+
+/*Size in bytes of a raw image buffer of w * h pixels in the color mode given by
+color; forwards its colortype and bitdepth to lodepng_get_raw_size_lct.*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color) {
+  return lodepng_get_raw_size_lct(w, h, color->colortype, color->bitdepth);
+}
+
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*Size in bytes of the image data as laid out inside IDAT: unlike the lodepng output
+buffer, every scanline is padded to a whole number of bytes and is preceded by one
+filter byte. So this gives a larger result than lodepng_get_raw_size. */
+static size_t lodepng_get_raw_size_idat(unsigned w, unsigned h, const LodePNGColorMode* color) {
+  size_t bpp = lodepng_get_bpp(color);
+  size_t line = (size_t)(w / 8) * bpp; /*complete groups of 8 pixels*/
+  line += 1; /*the filter byte*/
+  line += ((w & 7) * bpp + 7) / 8; /*remaining pixels, including padding bits*/
+  return (size_t)h * line;
+}
+
+/* Multiplies a and b into *result and reports whether the product wrapped around.
+Returns 1 on overflow, 0 otherwise. Unsigned multiplication wraps with defined behavior,
+so the check is done after the fact by dividing the product back. */
+static int lodepng_mulofl(size_t a, size_t b, size_t* result) {
+  size_t product = a * b;
+  *result = product;
+  if(a == 0) return 0; /* 0 * b cannot overflow, and avoids dividing by zero below */
+  return product / a != b;
+}
+
+/* Adds a and b into *result and reports whether the sum wrapped around.
+Returns 1 on overflow, 0 otherwise. Unsigned addition wraps with defined behavior,
+and a wrapped sum is always smaller than either operand. */
+static int lodepng_addofl(size_t a, size_t b, size_t* result) {
+  size_t sum = a + b;
+  *result = sum;
+  return sum < a ? 1 : 0;
+}
+
+/*Safely checks whether size_t overflow can be caused due to amount of pixels.
+This check is overcautious rather than precise. If this check indicates no overflow,
+you can safely compute in a size_t (but not an unsigned):
+-(size_t)w * (size_t)h * 8
+-amount of bytes in IDAT (including filter, padding and Adam7 bytes)
+-amount of bytes in raw color model
+The larger bits-per-pixel value of pngcolor and rawcolor is used for the per-line computation.
+The values written to numpixels, total and line are only intermediate results for the
+overflow tests; none of them is returned.
+Returns 1 if overflow possible, 0 if not.
+*/
+static int lodepng_pixel_overflow(unsigned w, unsigned h,
+                                  const LodePNGColorMode* pngcolor, const LodePNGColorMode* rawcolor) {
+  size_t bpp = LODEPNG_MAX(lodepng_get_bpp(pngcolor), lodepng_get_bpp(rawcolor));
+  size_t numpixels, total;
+  size_t line; /* bytes per line in worst case */
+
+  if(lodepng_mulofl((size_t)w, (size_t)h, &numpixels)) return 1;
+  if(lodepng_mulofl(numpixels, 8, &total)) return 1; /* bit pointer with 8-bit color, or 8 bytes per channel color */
+
+  /* Bytes per scanline with the expression "(w / 8) * bpp) + ((w & 7) * bpp + 7) / 8" */
+  if(lodepng_mulofl((size_t)(w / 8), bpp, &line)) return 1;
+  if(lodepng_addofl(line, ((w & 7) * bpp + 7) / 8, &line)) return 1;
+
+  if(lodepng_addofl(line, 5, &line)) return 1; /* 5 bytes overhead per line: 1 filterbyte, 4 for Adam7 worst case */
+  if(lodepng_mulofl(line, h, &total)) return 1; /* Total bytes in worst case */
+
+  return 0; /* no overflow */
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*Marks all three unknown-chunk slots of info as empty (NULL data, size 0).
+Nothing is freed.*/
+static void LodePNGUnknownChunks_init(LodePNGInfo* info) {
+  unsigned slot;
+  for(slot = 0; slot < 3; ++slot) {
+    info->unknown_chunks_data[slot] = 0;
+    info->unknown_chunks_size[slot] = 0;
+  }
+}
+
+/*Frees the data buffers of all three unknown-chunk slots.
+The pointers and sizes are left as they were, so the slots must be re-initialized
+(LodePNGUnknownChunks_init) or overwritten before they are used or cleaned up again.*/
+static void LodePNGUnknownChunks_cleanup(LodePNGInfo* info) {
+  unsigned i;
+  for(i = 0; i != 3; ++i) lodepng_free(info->unknown_chunks_data[i]);
+}
+
+/*Replaces the three unknown-chunk buffers of dest with copies of those of src.
+The current buffers of dest are freed first. Returns 0 on success, or 83 as soon as
+a non-empty buffer cannot be allocated (slots after the failing one are not touched).*/
+static unsigned LodePNGUnknownChunks_copy(LodePNGInfo* dest, const LodePNGInfo* src) {
+  unsigned slot;
+
+  LodePNGUnknownChunks_cleanup(dest);
+
+  for(slot = 0; slot < 3; ++slot) {
+    size_t k;
+    dest->unknown_chunks_size[slot] = src->unknown_chunks_size[slot];
+    dest->unknown_chunks_data[slot] = (unsigned char*)lodepng_malloc(src->unknown_chunks_size[slot]);
+    /*a NULL result is only an error when there are bytes to store*/
+    if(dest->unknown_chunks_size[slot] && !dest->unknown_chunks_data[slot]) return 83; /*alloc fail*/
+    k = 0;
+    while(k < src->unknown_chunks_size[slot]) {
+      dest->unknown_chunks_data[slot][k] = src->unknown_chunks_data[slot][k];
+      ++k;
+    }
+  }
+
+  return 0;
+}
+
+/******************************************************************************/
+
+/*Marks the text chunk list of info as empty: no key array, no string array,
+zero entries. Nothing is freed.*/
+static void LodePNGText_init(LodePNGInfo* info) {
+  info->text_keys = NULL;
+  info->text_strings = NULL;
+  info->text_num = 0;
+}
+
+/*Frees every key/string pair of the text chunk list and then the two arrays.
+text_num and the array pointers themselves are not reset here.*/
+static void LodePNGText_cleanup(LodePNGInfo* info) {
+  size_t entry = 0;
+  while(entry != info->text_num) {
+    string_cleanup(&info->text_keys[entry]);
+    string_cleanup(&info->text_strings[entry]);
+    ++entry;
+  }
+  lodepng_free(info->text_keys);
+  lodepng_free(info->text_strings);
+}
+
+/*Copies all text entries of source into dest.
+The text fields of dest are overwritten with an empty list WITHOUT being freed first,
+so the caller must make sure dest does not own text data at this point. Each entry is
+then appended with lodepng_add_text; the error code of the first failing append is
+passed on through CERROR_TRY_RETURN (presumably a return-on-nonzero macro, defined
+outside this section). Returns 0 on success.*/
+static unsigned LodePNGText_copy(LodePNGInfo* dest, const LodePNGInfo* source) {
+  size_t i = 0;
+  dest->text_keys = 0;
+  dest->text_strings = 0;
+  dest->text_num = 0;
+  for(i = 0; i != source->text_num; ++i) {
+    CERROR_TRY_RETURN(lodepng_add_text(dest, source->text_keys[i], source->text_strings[i]));
+  }
+  return 0;
+}
+
+/*Removes all text entries from info and frees their memory.
+After the memory is released the list is reset to the empty state, so info can be
+reused (for example with lodepng_add_text) or cleaned up again without touching
+already freed pointers.*/
+void lodepng_clear_text(LodePNGInfo* info) {
+  LodePNGText_cleanup(info);
+  LodePNGText_init(info); /*cleanup leaves the pointers and count dangling*/
+}
+
+/*Appends a copy of the key/str pair to the text chunk list of info.
+Returns 0 on success and 83 on allocation failure. On failure the list still holds
+exactly the entries it had before the call and stays valid for cleanup: an array that
+was successfully reallocated is kept (realloc may have moved it, so the old pointer
+must not be used or freed any more), and text_num is only incremented once all
+allocations have succeeded.*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str) {
+  char* key_copy;
+  char* str_copy;
+  char** new_keys = (char**)(lodepng_realloc(info->text_keys, sizeof(char*) * (info->text_num + 1)));
+  char** new_strings = (char**)(lodepng_realloc(info->text_strings, sizeof(char*) * (info->text_num + 1)));
+
+  /*adopt whichever reallocation succeeded; a failed one leaves the old array untouched*/
+  if(new_keys) info->text_keys = new_keys;
+  if(new_strings) info->text_strings = new_strings;
+  if(!new_keys || !new_strings) return 83; /*alloc fail*/
+
+  key_copy = alloc_string(key);
+  str_copy = alloc_string(str);
+  if(!key_copy || !str_copy) {
+    lodepng_free(key_copy);
+    lodepng_free(str_copy);
+    return 83; /*alloc fail*/
+  }
+
+  info->text_keys[info->text_num] = key_copy;
+  info->text_strings[info->text_num] = str_copy;
+  ++info->text_num;
+
+  return 0;
+}
+
+/******************************************************************************/
+
+/*Marks the international text chunk list of info as empty: all four arrays
+NULL and zero entries. Nothing is freed.*/
+static void LodePNGIText_init(LodePNGInfo* info) {
+  info->itext_keys = NULL;
+  info->itext_langtags = NULL;
+  info->itext_transkeys = NULL;
+  info->itext_strings = NULL;
+  info->itext_num = 0;
+}
+
+/*Frees every key/langtag/transkey/string of the international text chunk list and
+then the four arrays. itext_num and the array pointers themselves are not reset here.*/
+static void LodePNGIText_cleanup(LodePNGInfo* info) {
+  size_t entry = 0;
+  while(entry != info->itext_num) {
+    string_cleanup(&info->itext_keys[entry]);
+    string_cleanup(&info->itext_langtags[entry]);
+    string_cleanup(&info->itext_transkeys[entry]);
+    string_cleanup(&info->itext_strings[entry]);
+    ++entry;
+  }
+  lodepng_free(info->itext_keys);
+  lodepng_free(info->itext_langtags);
+  lodepng_free(info->itext_transkeys);
+  lodepng_free(info->itext_strings);
+}
+
+/*Copies all international text entries of source into dest.
+The itext fields of dest are overwritten with an empty list WITHOUT being freed first,
+so the caller must make sure dest does not own itext data at this point. Each entry is
+then appended with lodepng_add_itext; the error code of the first failing append is
+passed on through CERROR_TRY_RETURN (presumably a return-on-nonzero macro, defined
+outside this section). Returns 0 on success.*/
+static unsigned LodePNGIText_copy(LodePNGInfo* dest, const LodePNGInfo* source) {
+  size_t i = 0;
+  dest->itext_keys = 0;
+  dest->itext_langtags = 0;
+  dest->itext_transkeys = 0;
+  dest->itext_strings = 0;
+  dest->itext_num = 0;
+  for(i = 0; i != source->itext_num; ++i) {
+    CERROR_TRY_RETURN(lodepng_add_itext(dest, source->itext_keys[i], source->itext_langtags[i],
+                                        source->itext_transkeys[i], source->itext_strings[i]));
+  }
+  return 0;
+}
+
+/*Removes all international text entries from info and frees their memory.
+After the memory is released the list is reset to the empty state, so info can be
+reused (for example with lodepng_add_itext) or cleaned up again without touching
+already freed pointers.*/
+void lodepng_clear_itext(LodePNGInfo* info) {
+  LodePNGIText_cleanup(info);
+  LodePNGIText_init(info); /*cleanup leaves the pointers and count dangling*/
+}
+
+/*Appends a copy of the key/langtag/transkey/str tuple to the international text
+chunk list of info.
+Returns 0 on success and 83 on allocation failure. On failure the list still holds
+exactly the entries it had before the call and stays valid for cleanup: an array that
+was successfully reallocated is kept (realloc may have moved it, so the old pointer
+must not be used or freed any more), and itext_num is only incremented once all
+allocations have succeeded.*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+                           const char* transkey, const char* str) {
+  char* key_copy;
+  char* langtag_copy;
+  char* transkey_copy;
+  char* str_copy;
+  char** new_keys = (char**)(lodepng_realloc(info->itext_keys, sizeof(char*) * (info->itext_num + 1)));
+  char** new_langtags = (char**)(lodepng_realloc(info->itext_langtags, sizeof(char*) * (info->itext_num + 1)));
+  char** new_transkeys = (char**)(lodepng_realloc(info->itext_transkeys, sizeof(char*) * (info->itext_num + 1)));
+  char** new_strings = (char**)(lodepng_realloc(info->itext_strings, sizeof(char*) * (info->itext_num + 1)));
+
+  /*adopt whichever reallocations succeeded; a failed one leaves the old array untouched*/
+  if(new_keys) info->itext_keys = new_keys;
+  if(new_langtags) info->itext_langtags = new_langtags;
+  if(new_transkeys) info->itext_transkeys = new_transkeys;
+  if(new_strings) info->itext_strings = new_strings;
+  if(!new_keys || !new_langtags || !new_transkeys || !new_strings) return 83; /*alloc fail*/
+
+  key_copy = alloc_string(key);
+  langtag_copy = alloc_string(langtag);
+  transkey_copy = alloc_string(transkey);
+  str_copy = alloc_string(str);
+  if(!key_copy || !langtag_copy || !transkey_copy || !str_copy) {
+    lodepng_free(key_copy);
+    lodepng_free(langtag_copy);
+    lodepng_free(transkey_copy);
+    lodepng_free(str_copy);
+    return 83; /*alloc fail*/
+  }
+
+  info->itext_keys[info->itext_num] = key_copy;
+  info->itext_langtags[info->itext_num] = langtag_copy;
+  info->itext_transkeys[info->itext_num] = transkey_copy;
+  info->itext_strings[info->itext_num] = str_copy;
+  ++info->itext_num;
+
+  return 0;
+}
+
+/* Stores copies of the ICC profile name and data in info. Same as lodepng_set_icc,
+but does not delete what info held before and does not touch iccp_defined.
+Returns 0 on success or 83 if either copy could not be allocated; in the error case
+whatever was allocated stays assigned to info and iccp_profile_size is not updated. */
+static unsigned lodepng_assign_icc(LodePNGInfo* info, const char* name, const unsigned char* profile, unsigned profile_size) {
+  info->iccp_name = alloc_string(name);
+  info->iccp_profile = (unsigned char*)lodepng_malloc(profile_size);
+
+  if(info->iccp_name && info->iccp_profile) {
+    memcpy(info->iccp_profile, profile, profile_size);
+    info->iccp_profile_size = profile_size;
+    return 0; /*ok*/
+  }
+
+  return 83; /*alloc fail*/
+}
+
+/*Sets the ICC profile of info to copies of name and profile (profile_size bytes).
+A previously stored profile is released first, which is detected by a non-NULL
+iccp_name. iccp_defined is set to 1 before the copies are made, so it stays 1 even
+when lodepng_assign_icc fails. Returns the result of lodepng_assign_icc
+(0 on success, 83 on allocation failure).*/
+unsigned lodepng_set_icc(LodePNGInfo* info, const char* name, const unsigned char* profile, unsigned profile_size) {
+  if(info->iccp_name) lodepng_clear_icc(info);
+  info->iccp_defined = 1;
+
+  return lodepng_assign_icc(info, name, profile, profile_size);
+}
+
+/*Releases the ICC profile name and data of info and marks the profile as
+not defined, with a NULL profile pointer and size 0.*/
+void lodepng_clear_icc(LodePNGInfo* info) {
+  /*release the owned memory*/
+  lodepng_free(info->iccp_profile);
+  string_cleanup(&info->iccp_name);
+  /*back to the "no profile" state*/
+  info->iccp_defined = 0;
+  info->iccp_profile_size = 0;
+  info->iccp_profile = NULL;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Initializes a LodePNGInfo: default color mode, interlace/compression/filter method 0
+and, when ancillary chunk support is compiled in, no background color, empty text and
+itext lists, all optional chunks marked as not defined, no ICC profile and empty
+unknown-chunk slots. Nothing is freed, so this is for uninitialized structs only.
+NOTE(review): iccp_profile_size and the value fields of the time/phys/gama/chrm/srgb
+chunks are not assigned here.*/
+void lodepng_info_init(LodePNGInfo* info) {
+  lodepng_color_mode_init(&info->color);
+  info->interlace_method = 0;
+  info->compression_method = 0;
+  info->filter_method = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  info->background_defined = 0;
+  info->background_r = info->background_g = info->background_b = 0;
+
+  LodePNGText_init(info);
+  LodePNGIText_init(info);
+
+  info->time_defined = 0;
+  info->phys_defined = 0;
+
+  info->gama_defined = 0;
+  info->chrm_defined = 0;
+  info->srgb_defined = 0;
+  info->iccp_defined = 0;
+  info->iccp_name = NULL;
+  info->iccp_profile = NULL;
+
+  LodePNGUnknownChunks_init(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+/*Frees everything a LodePNGInfo owns: the palette of its color mode and, when
+ancillary chunk support is compiled in, the text and itext lists, the ICC profile and
+the unknown-chunk buffers. The text, itext and unknown-chunk cleanups called here do
+not reset their pointers, so info must be initialized again before it is reused.*/
+void lodepng_info_cleanup(LodePNGInfo* info) {
+  lodepng_color_mode_cleanup(&info->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  LodePNGText_cleanup(info);
+  LodePNGIText_cleanup(info);
+
+  lodepng_clear_icc(info);
+
+  LodePNGUnknownChunks_cleanup(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+/*Deep-copies source into dest. dest is cleaned up first, so it must be initialized.
+Plain fields are copied by struct assignment. That copy is shallow, so every pointer
+dest would share with source is reset immediately afterwards, before any step that
+can fail; an early error return therefore never leaves dest pointing at memory owned
+by source (which would be freed twice when both are cleaned up). The owned data is
+then duplicated piece by piece.
+Returns 0 on success or the error code of the first copy step that failed; dest
+remains safe to pass to lodepng_info_cleanup in either case.*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source) {
+  lodepng_info_cleanup(dest);
+  *dest = *source;
+
+  /*drop all pointers that the struct assignment aliased with source*/
+  lodepng_color_mode_init(&dest->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  LodePNGText_init(dest);
+  LodePNGIText_init(dest);
+  dest->iccp_name = NULL;
+  dest->iccp_profile = NULL;
+  LodePNGUnknownChunks_init(dest);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+  CERROR_TRY_RETURN(lodepng_color_mode_copy(&dest->color, &source->color));
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  CERROR_TRY_RETURN(LodePNGText_copy(dest, source));
+  CERROR_TRY_RETURN(LodePNGIText_copy(dest, source));
+  if(source->iccp_defined) {
+    CERROR_TRY_RETURN(lodepng_assign_icc(dest, source->iccp_name, source->iccp_profile, source->iccp_profile_size));
+  }
+
+  CERROR_TRY_RETURN(LodePNGUnknownChunks_copy(dest, source));
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+  return 0;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Writes one bitgroup into a packed byte array, most significant bits first.
+index: bitgroup index, bits: bitgroup size (1, 2 or 4), in: bitgroup value (only its
+lowest `bits` bits are used), out: octet array to add bits to.
+The first bitgroup of a byte overwrites the whole byte, the following ones are OR-ed
+in, so bitgroups have to be written in increasing index order.*/
+static void addColorBits(unsigned char* out, size_t index, unsigned bits, unsigned in) {
+  unsigned last_slot = bits == 1 ? 7 : bits == 2 ? 3 : 1; /*8 / bits - 1*/
+  /*position of this bitgroup inside its byte, e.g. with 4 bits it is 0 for the first half or 1 for the second half*/
+  unsigned slot = index & last_slot;
+  size_t byte_pos = index * bits / 8;
+  unsigned value = in & ((1u << bits) - 1u); /*filter out any other bits of the input value*/
+  value = value << (bits * (last_slot - slot));
+  if(slot != 0) out[byte_pos] |= value;
+  else out[byte_pos] = value;
+}
+
+typedef struct ColorTree ColorTree;
+
+/*
+One node of a color tree
+This is the data structure used to count the number of unique colors and to get a palette
+index for a color. It's like an octree, but because the alpha channel is used too, each
+node has 16 instead of 8 children.
+A color is looked up in 8 steps, one per bit position: at each level the child index is
+formed from that bit of r, g, b and a (see color_tree_get / color_tree_add).
+*/
+struct ColorTree {
+  ColorTree* children[16]; /*up to 16 pointers to ColorTree of next level; 0 where there is no child*/
+  int index; /*the payload. Only has a meaningful value if this is in the last level; -1 after color_tree_init*/
+};
+
+/*Turns tree into an empty node: no payload (index -1) and no children.
+Nothing is freed.*/
+static void color_tree_init(ColorTree* tree) {
+  int child;
+  tree->index = -1;
+  for(child = 0; child < 16; ++child) {
+    tree->children[child] = 0;
+  }
+}
+
+/*Recursively frees all descendants of tree. The node passed in is not freed itself,
+and its child pointers are not reset.*/
+static void color_tree_cleanup(ColorTree* tree) {
+  int slot;
+  for(slot = 0; slot < 16; ++slot) {
+    ColorTree* child = tree->children[slot];
+    if(!child) continue;
+    color_tree_cleanup(child);
+    lodepng_free(child);
+  }
+}
+
+/*Looks up an RGBA color by walking down 8 levels, one per bit position starting at
+the least significant bit. Returns -1 if color not present, its index otherwise.*/
+static int color_tree_get(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a) {
+  int bit;
+  for(bit = 0; bit < 8; ++bit) {
+    int rbit = (r >> bit) & 1;
+    int gbit = (g >> bit) & 1;
+    int bbit = (b >> bit) & 1;
+    int abit = (a >> bit) & 1;
+    int child = 8 * rbit + 4 * gbit + 2 * bbit + 1 * abit;
+    tree = tree->children[child];
+    if(!tree) return -1; /*path ends here: the color was never added*/
+  }
+  return tree->index;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Returns 1 if color_tree_get finds a non-negative index for the color, 0 otherwise.*/
+static int color_tree_has(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a) {
+  return color_tree_get(tree, r, g, b, a) >= 0;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*Stores index as the payload for the given RGBA color, creating the missing nodes on
+the 8-level path to it.
+color is not allowed to already exist.
+Index should be >= 0 (it's signed to be compatible with using -1 for "doesn't exist")
+If a node cannot be allocated the function returns without storing the index; nodes
+created so far stay in the tree with index -1, so color_tree_get reports the color as
+not present and color_tree_cleanup still frees everything.*/
+static void color_tree_add(ColorTree* tree,
+                           unsigned char r, unsigned char g, unsigned char b, unsigned char a, unsigned index) {
+  int bit;
+  for(bit = 0; bit < 8; ++bit) {
+    int i = 8 * ((r >> bit) & 1) + 4 * ((g >> bit) & 1) + 2 * ((b >> bit) & 1) + 1 * ((a >> bit) & 1);
+    if(!tree->children[i]) {
+      tree->children[i] = (ColorTree*)lodepng_malloc(sizeof(ColorTree));
+      if(!tree->children[i]) return; /*alloc fail: do not initialize or descend into a NULL node*/
+      color_tree_init(tree->children[i]);
+    }
+    tree = tree->children[i];
+  }
+  tree->index = (int)index;
+}
+
+/*put a pixel, given its RGBA color, into image of any color type
+out: raw output buffer, i: pixel index, mode: color mode of out,
+tree: color tree used to find the palette index (only read when mode is LCT_PALETTE).
+For gray output only the r value is used as gray level. For 16-bit output the 8-bit
+value is written to both bytes of the channel.
+Returns 0 on success, or 82 if mode is LCT_PALETTE and the color is not in the tree.
+Combinations not handled by any branch (e.g. LCT_GREY_ALPHA with a bit depth other
+than 8 or 16) write nothing and return 0.*/
+static unsigned rgba8ToPixel(unsigned char* out, size_t i,
+                             const LodePNGColorMode* mode, ColorTree* tree /*for palette*/,
+                             unsigned char r, unsigned char g, unsigned char b, unsigned char a) {
+  if(mode->colortype == LCT_GREY) {
+    unsigned char gray = r; /*((unsigned short)r + g + b) / 3;*/
+    if(mode->bitdepth == 8) out[i] = gray;
+    else if(mode->bitdepth == 16) out[i * 2 + 0] = out[i * 2 + 1] = gray;
+    else {
+      /*take the most significant bits of gray*/
+      gray = (gray >> (8 - mode->bitdepth)) & ((1 << mode->bitdepth) - 1);
+      addColorBits(out, i, mode->bitdepth, gray);
+    }
+  } else if(mode->colortype == LCT_RGB) {
+    if(mode->bitdepth == 8) {
+      out[i * 3 + 0] = r;
+      out[i * 3 + 1] = g;
+      out[i * 3 + 2] = b;
+    } else {
+      /*any other bit depth is written as 16 bits per channel*/
+      out[i * 6 + 0] = out[i * 6 + 1] = r;
+      out[i * 6 + 2] = out[i * 6 + 3] = g;
+      out[i * 6 + 4] = out[i * 6 + 5] = b;
+    }
+  } else if(mode->colortype == LCT_PALETTE) {
+    int index = color_tree_get(tree, r, g, b, a);
+    if(index < 0) return 82; /*color not in palette*/
+    if(mode->bitdepth == 8) out[i] = index;
+    else addColorBits(out, i, mode->bitdepth, (unsigned)index);
+  } else if(mode->colortype == LCT_GREY_ALPHA) {
+    unsigned char gray = r; /*((unsigned short)r + g + b) / 3;*/
+    if(mode->bitdepth == 8) {
+      out[i * 2 + 0] = gray;
+      out[i * 2 + 1] = a;
+    } else if(mode->bitdepth == 16) {
+      out[i * 4 + 0] = out[i * 4 + 1] = gray;
+      out[i * 4 + 2] = out[i * 4 + 3] = a;
+    }
+  } else if(mode->colortype == LCT_RGBA) {
+    if(mode->bitdepth == 8) {
+      out[i * 4 + 0] = r;
+      out[i * 4 + 1] = g;
+      out[i * 4 + 2] = b;
+      out[i * 4 + 3] = a;
+    } else {
+      /*any other bit depth is written as 16 bits per channel*/
+      out[i * 8 + 0] = out[i * 8 + 1] = r;
+      out[i * 8 + 2] = out[i * 8 + 3] = g;
+      out[i * 8 + 4] = out[i * 8 + 5] = b;
+      out[i * 8 + 6] = out[i * 8 + 7] = a;
+    }
+  }
+
+  return 0; /*no error*/
+}
+
+/*put a pixel, given its RGBA16 color, into image of any color 16-bitdepth type
+out: raw output buffer, i: pixel index, mode: color mode of out. The bit depth of
+mode is not checked; every channel is written as two bytes, high byte first.
+For gray output only the r value is used as gray level. Nothing is written when the
+color type is LCT_PALETTE or unknown.*/
+static void rgba16ToPixel(unsigned char* out, size_t i,
+                         const LodePNGColorMode* mode,
+                         unsigned short r, unsigned short g, unsigned short b, unsigned short a) {
+  if(mode->colortype == LCT_GREY) {
+    unsigned short gray = r; /*((unsigned)r + g + b) / 3;*/
+    out[i * 2 + 0] = (gray >> 8) & 255;
+    out[i * 2 + 1] = gray & 255;
+  } else if(mode->colortype == LCT_RGB) {
+    out[i * 6 + 0] = (r >> 8) & 255;
+    out[i * 6 + 1] = r & 255;
+    out[i * 6 + 2] = (g >> 8) & 255;
+    out[i * 6 + 3] = g & 255;
+    out[i * 6 + 4] = (b >> 8) & 255;
+    out[i * 6 + 5] = b & 255;
+  } else if(mode->colortype == LCT_GREY_ALPHA) {
+    unsigned short gray = r; /*((unsigned)r + g + b) / 3;*/
+    out[i * 4 + 0] = (gray >> 8) & 255;
+    out[i * 4 + 1] = gray & 255;
+    out[i * 4 + 2] = (a >> 8) & 255;
+    out[i * 4 + 3] = a & 255;
+  } else if(mode->colortype == LCT_RGBA) {
+    out[i * 8 + 0] = (r >> 8) & 255;
+    out[i * 8 + 1] = r & 255;
+    out[i * 8 + 2] = (g >> 8) & 255;
+    out[i * 8 + 3] = g & 255;
+    out[i * 8 + 4] = (b >> 8) & 255;
+    out[i * 8 + 5] = b & 255;
+    out[i * 8 + 6] = (a >> 8) & 255;
+    out[i * 8 + 7] = a & 255;
+  }
+}
+
+/*Get RGBA8 color of pixel with index i (y * width + x) from the raw image with given color type.
+r, g, b, a: outputs. in: raw image buffer. mode: color mode of in.
+16-bit channels are reduced to 8 bits by taking their high byte. Gray values of less
+than 8 bits are scaled up to the 0..255 range. For color types without an alpha
+channel, alpha is 255 unless a color key is defined and the pixel matches it, in which
+case alpha is 0; the key is compared against the full-precision value of the pixel.
+The outputs are left untouched when the color type is not handled by any branch.*/
+static void getPixelColorRGBA8(unsigned char* r, unsigned char* g,
+                               unsigned char* b, unsigned char* a,
+                               const unsigned char* in, size_t i,
+                               const LodePNGColorMode* mode) {
+  if(mode->colortype == LCT_GREY) {
+    if(mode->bitdepth == 8) {
+      *r = *g = *b = in[i];
+      if(mode->key_defined && *r == mode->key_r) *a = 0;
+      else *a = 255;
+    } else if(mode->bitdepth == 16) {
+      *r = *g = *b = in[i * 2 + 0];
+      if(mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r) *a = 0;
+      else *a = 255;
+    } else {
+      unsigned highest = ((1U << mode->bitdepth) - 1U); /*highest possible value for this bit depth*/
+      size_t j = i * mode->bitdepth; /*bit position of the pixel in the buffer*/
+      unsigned value = readBitsFromReversedStream(&j, in, mode->bitdepth);
+      *r = *g = *b = (value * 255) / highest;
+      if(mode->key_defined && value == mode->key_r) *a = 0;
+      else *a = 255;
+    }
+  } else if(mode->colortype == LCT_RGB) {
+    if(mode->bitdepth == 8) {
+      *r = in[i * 3 + 0]; *g = in[i * 3 + 1]; *b = in[i * 3 + 2];
+      if(mode->key_defined && *r == mode->key_r && *g == mode->key_g && *b == mode->key_b) *a = 0;
+      else *a = 255;
+    } else {
+      *r = in[i * 6 + 0];
+      *g = in[i * 6 + 2];
+      *b = in[i * 6 + 4];
+      if(mode->key_defined && 256U * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+         && 256U * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+         && 256U * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b) *a = 0;
+      else *a = 255;
+    }
+  } else if(mode->colortype == LCT_PALETTE) {
+    unsigned index;
+    if(mode->bitdepth == 8) index = in[i];
+    else {
+      size_t j = i * mode->bitdepth; /*bit position of the pixel in the buffer*/
+      index = readBitsFromReversedStream(&j, in, mode->bitdepth);
+    }
+
+    if(index >= mode->palettesize) {
+      /*This is an error according to the PNG spec, but common PNG decoders make it black instead.
+      Done here too, slightly faster due to no error handling needed.*/
+      *r = *g = *b = 0;
+      *a = 255;
+    } else {
+      *r = mode->palette[index * 4 + 0];
+      *g = mode->palette[index * 4 + 1];
+      *b = mode->palette[index * 4 + 2];
+      *a = mode->palette[index * 4 + 3];
+    }
+  } else if(mode->colortype == LCT_GREY_ALPHA) {
+    if(mode->bitdepth == 8) {
+      *r = *g = *b = in[i * 2 + 0];
+      *a = in[i * 2 + 1];
+    } else {
+      *r = *g = *b = in[i * 4 + 0];
+      *a = in[i * 4 + 2];
+    }
+  } else if(mode->colortype == LCT_RGBA) {
+    if(mode->bitdepth == 8) {
+      *r = in[i * 4 + 0];
+      *g = in[i * 4 + 1];
+      *b = in[i * 4 + 2];
+      *a = in[i * 4 + 3];
+    } else {
+      *r = in[i * 8 + 0];
+      *g = in[i * 8 + 2];
+      *b = in[i * 8 + 4];
+      *a = in[i * 8 + 6];
+    }
+  }
+}
+
+/*Similar to getPixelColorRGBA8, but with all the for loops inside of the color
+mode test cases, optimized to convert the colors much faster, when converting
+to RGBA or RGB with 8 bits per channel. buffer must be RGBA or RGB output with
+enough memory, if has_alpha is true the output is RGBA. mode has the color mode
+of the input buffer.
+buffer is advanced by 4 (RGBA) or 3 (RGB) bytes per pixel. When has_alpha is false
+the alpha value is not computed or written. Sub-byte gray values and palette indices
+are read with one running bit position (j) over the whole input.*/
+static void getPixelColorsRGBA8(unsigned char* buffer, size_t numpixels,
+                                unsigned has_alpha, const unsigned char* in,
+                                const LodePNGColorMode* mode) {
+  unsigned num_channels = has_alpha ? 4 : 3;
+  size_t i;
+  if(mode->colortype == LCT_GREY) {
+    if(mode->bitdepth == 8) {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        buffer[0] = buffer[1] = buffer[2] = in[i];
+        if(has_alpha) buffer[3] = mode->key_defined && in[i] == mode->key_r ? 0 : 255;
+      }
+    } else if(mode->bitdepth == 16) {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        buffer[0] = buffer[1] = buffer[2] = in[i * 2];
+        if(has_alpha) buffer[3] = mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r ? 0 : 255;
+      }
+    } else {
+      unsigned highest = ((1U << mode->bitdepth) - 1U); /*highest possible value for this bit depth*/
+      size_t j = 0;
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        unsigned value = readBitsFromReversedStream(&j, in, mode->bitdepth);
+        buffer[0] = buffer[1] = buffer[2] = (value * 255) / highest;
+        if(has_alpha) buffer[3] = mode->key_defined && value == mode->key_r ? 0 : 255;
+      }
+    }
+  } else if(mode->colortype == LCT_RGB) {
+    if(mode->bitdepth == 8) {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        buffer[0] = in[i * 3 + 0];
+        buffer[1] = in[i * 3 + 1];
+        buffer[2] = in[i * 3 + 2];
+        if(has_alpha) buffer[3] = mode->key_defined && buffer[0] == mode->key_r
+           && buffer[1]== mode->key_g && buffer[2] == mode->key_b ? 0 : 255;
+      }
+    } else {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        /*16-bit channels: keep the high byte of each*/
+        buffer[0] = in[i * 6 + 0];
+        buffer[1] = in[i * 6 + 2];
+        buffer[2] = in[i * 6 + 4];
+        if(has_alpha) buffer[3] = mode->key_defined
+           && 256U * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+           && 256U * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+           && 256U * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b ? 0 : 255;
+      }
+    }
+  } else if(mode->colortype == LCT_PALETTE) {
+    unsigned index;
+    size_t j = 0;
+    for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+      if(mode->bitdepth == 8) index = in[i];
+      else index = readBitsFromReversedStream(&j, in, mode->bitdepth);
+
+      if(index >= mode->palettesize) {
+        /*This is an error according to the PNG spec, but most PNG decoders make it black instead.
+        Done here too, slightly faster due to no error handling needed.*/
+        buffer[0] = buffer[1] = buffer[2] = 0;
+        if(has_alpha) buffer[3] = 255;
+      } else {
+        buffer[0] = mode->palette[index * 4 + 0];
+        buffer[1] = mode->palette[index * 4 + 1];
+        buffer[2] = mode->palette[index * 4 + 2];
+        if(has_alpha) buffer[3] = mode->palette[index * 4 + 3];
+      }
+    }
+  } else if(mode->colortype == LCT_GREY_ALPHA) {
+    if(mode->bitdepth == 8) {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        buffer[0] = buffer[1] = buffer[2] = in[i * 2 + 0];
+        if(has_alpha) buffer[3] = in[i * 2 + 1];
+      }
+    } else {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        /*16-bit channels: keep the high byte of each*/
+        buffer[0] = buffer[1] = buffer[2] = in[i * 4 + 0];
+        if(has_alpha) buffer[3] = in[i * 4 + 2];
+      }
+    }
+  } else if(mode->colortype == LCT_RGBA) {
+    if(mode->bitdepth == 8) {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        buffer[0] = in[i * 4 + 0];
+        buffer[1] = in[i * 4 + 1];
+        buffer[2] = in[i * 4 + 2];
+        if(has_alpha) buffer[3] = in[i * 4 + 3];
+      }
+    } else {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels) {
+        /*16-bit channels: keep the high byte of each*/
+        buffer[0] = in[i * 8 + 0];
+        buffer[1] = in[i * 8 + 2];
+        buffer[2] = in[i * 8 + 4];
+        if(has_alpha) buffer[3] = in[i * 8 + 6];
+      }
+    }
+  }
+}
+
+/*Get RGBA16 color of pixel with index i (y * width + x) from the raw image with
+given color type, but the given color type must be 16-bit itself.
+Each channel is read as two bytes, high byte first. For color types without an alpha
+channel, alpha is 65535 unless a color key is defined and the pixel matches it, in
+which case alpha is 0. The bit depth of mode is not checked, and the outputs are left
+untouched when the color type is LCT_PALETTE or unknown.*/
+static void getPixelColorRGBA16(unsigned short* r, unsigned short* g, unsigned short* b, unsigned short* a,
+                                const unsigned char* in, size_t i, const LodePNGColorMode* mode) {
+  if(mode->colortype == LCT_GREY) {
+    *r = *g = *b = 256 * in[i * 2 + 0] + in[i * 2 + 1];
+    if(mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r) *a = 0;
+    else *a = 65535;
+  } else if(mode->colortype == LCT_RGB) {
+    *r = 256u * in[i * 6 + 0] + in[i * 6 + 1];
+    *g = 256u * in[i * 6 + 2] + in[i * 6 + 3];
+    *b = 256u * in[i * 6 + 4] + in[i * 6 + 5];
+    if(mode->key_defined
+       && 256u * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+       && 256u * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+       && 256u * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b) *a = 0;
+    else *a = 65535;
+  } else if(mode->colortype == LCT_GREY_ALPHA) {
+    *r = *g = *b = 256u * in[i * 4 + 0] + in[i * 4 + 1];
+    *a = 256u * in[i * 4 + 2] + in[i * 4 + 3];
+  } else if(mode->colortype == LCT_RGBA) {
+    *r = 256u * in[i * 8 + 0] + in[i * 8 + 1];
+    *g = 256u * in[i * 8 + 2] + in[i * 8 + 3];
+    *b = 256u * in[i * 8 + 4] + in[i * 8 + 5];
+    *a = 256u * in[i * 8 + 6] + in[i * 8 + 7];
+  }
+}
+
+/*Converts a raw image of w * h pixels from color mode mode_in (buffer in) to color mode
+mode_out (buffer out). out must be large enough for the converted image; it is not
+allocated or checked here.
+- If both modes are equal the bytes are copied unchanged.
+- If mode_out is a palette type, a color tree is built from its palette (or from the
+  palette of mode_in when mode_out has none) to map colors to indices; at most
+  2^bitdepth entries are used.
+- 16-bit to 16-bit conversions keep full precision; everything else goes through
+  8-bit RGBA values.
+Returns 0 on success, or the error of rgba8ToPixel (82: a color is not in the output
+palette), in which case conversion stops at the failing pixel.*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+                         const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+                         unsigned w, unsigned h) {
+  size_t i;
+  ColorTree tree; /*only initialized and used when mode_out is a palette type*/
+  size_t numpixels = (size_t)w * (size_t)h;
+  unsigned error = 0;
+
+  if(lodepng_color_mode_equal(mode_out, mode_in)) {
+    size_t numbytes = lodepng_get_raw_size(w, h, mode_in);
+    for(i = 0; i != numbytes; ++i) out[i] = in[i];
+    return 0;
+  }
+
+  if(mode_out->colortype == LCT_PALETTE) {
+    size_t palettesize = mode_out->palettesize;
+    const unsigned char* palette = mode_out->palette;
+    size_t palsize = (size_t)1u << mode_out->bitdepth; /*number of indices the output bit depth can express*/
+    /*if the user specified output palette but did not give the values, assume
+    they want the values of the input color type (assuming that one is palette).
+    Note that we never create a new palette ourselves.*/
+    if(palettesize == 0) {
+      palettesize = mode_in->palettesize;
+      palette = mode_in->palette;
+      /*if the input was also palette with same bitdepth, then the color types are also
+      equal, so copy literally. This to preserve the exact indices that were in the PNG
+      even in case there are duplicate colors in the palette.*/
+      if (mode_in->colortype == LCT_PALETTE && mode_in->bitdepth == mode_out->bitdepth) {
+        size_t numbytes = lodepng_get_raw_size(w, h, mode_in);
+        for(i = 0; i != numbytes; ++i) out[i] = in[i];
+        return 0;
+      }
+    }
+    if(palettesize < palsize) palsize = palettesize;
+    color_tree_init(&tree);
+    for(i = 0; i != palsize; ++i) {
+      const unsigned char* p = &palette[i * 4];
+      color_tree_add(&tree, p[0], p[1], p[2], p[3], (unsigned)i);
+    }
+  }
+
+  if(mode_in->bitdepth == 16 && mode_out->bitdepth == 16) {
+    for(i = 0; i != numpixels; ++i) {
+      unsigned short r = 0, g = 0, b = 0, a = 0;
+      getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+      rgba16ToPixel(out, i, mode_out, r, g, b, a);
+    }
+  } else if(mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGBA) {
+    /*fast path: whole-image loop straight into 8-bit RGBA*/
+    getPixelColorsRGBA8(out, numpixels, 1, in, mode_in);
+  } else if(mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGB) {
+    /*fast path: whole-image loop straight into 8-bit RGB*/
+    getPixelColorsRGBA8(out, numpixels, 0, in, mode_in);
+  } else {
+    unsigned char r = 0, g = 0, b = 0, a = 0;
+    for(i = 0; i != numpixels; ++i) {
+      getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode_in);
+      error = rgba8ToPixel(out, i, mode_out, &tree, r, g, b, a);
+      if (error) break;
+    }
+  }
+
+  if(mode_out->colortype == LCT_PALETTE) {
+    color_tree_cleanup(&tree);
+  }
+
+  return error;
+}
+
+
+/* Converts one RGB color (no alpha) from mode_in to mode_out, truncating the
+channels to the output bitdepth. For single channel modes (gray or palette) only
+the r channel is read and written. Slow: meant for a few values such as bKGD,
+not for every pixel of an image. Alpha and the color key are ignored on purpose:
+bKGD has no alpha channel and may refer to any palette index.
+Returns 0 on success, 82 when a palette index is out of range or the color is
+not present in the output palette, 31 for an unhandled color type. */
+unsigned lodepng_convert_rgb(
+    unsigned* r_out, unsigned* g_out, unsigned* b_out,
+    unsigned r_in, unsigned g_in, unsigned b_in,
+    const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in) {
+  unsigned r16 = 0, g16 = 0, b16 = 0; /*the color expanded to 16 bits per channel*/
+  /*factor scaling an input sample up to 16 bits: 65535, 21845, 4369, 257 or 1*/
+  const unsigned scale = 65535 / ((1u << mode_in->bitdepth) - 1u);
+  /*amount of low bits to drop to get back to the output bitdepth*/
+  const unsigned drop = 16 - mode_out->bitdepth;
+
+  /*step 1: bring the input to 16-bit RGB*/
+  switch(mode_in->colortype) {
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+      r16 = g16 = b16 = r_in * scale;
+      break;
+    case LCT_RGB:
+    case LCT_RGBA:
+      r16 = r_in * scale;
+      g16 = g_in * scale;
+      b16 = b_in * scale;
+      break;
+    case LCT_PALETTE: {
+      const unsigned char* entry;
+      if(r_in >= mode_in->palettesize) return 82;
+      entry = &mode_in->palette[r_in * 4];
+      r16 = entry[0] * 257u;
+      g16 = entry[1] * 257u;
+      b16 = entry[2] * 257u;
+      break;
+    }
+    default:
+      return 31;
+  }
+
+  /*step 2: express the 16-bit color in the output format*/
+  switch(mode_out->colortype) {
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+      *r_out = r16 >> drop;
+      break;
+    case LCT_RGB:
+    case LCT_RGBA:
+      *r_out = r16 >> drop;
+      *g_out = g16 >> drop;
+      *b_out = b16 >> drop;
+      break;
+    case LCT_PALETTE: {
+      unsigned index;
+      /*palette entries are 8-bit, so a color whose high and low bytes differ cannot match*/
+      if((r16 >> 8) != (r16 & 255) || (g16 >> 8) != (g16 & 255) || (b16 >> 8) != (b16 & 255)) return 82;
+      for(index = 0; index < mode_out->palettesize; index++) {
+        const unsigned char* entry = &mode_out->palette[index * 4];
+        if((r16 >> 8) == entry[0] && (g16 >> 8) == entry[1] && (b16 >> 8) == entry[2]) {
+          *r_out = index;
+          return 0;
+        }
+      }
+      return 82;
+    }
+    default:
+      return 31;
+  }
+
+  return 0;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*Resets a color profile to the cheapest possible description: no pixels seen,
+gray, no alpha, no color key, no colors counted, 1 bit per channel.
+The palette array of the profile is not touched.*/
+void lodepng_color_profile_init(LodePNGColorProfile* profile) {
+  profile->numpixels = 0;
+  profile->numcolors = 0;
+  profile->bits = 1;
+  profile->colored = 0;
+  profile->alpha = 0;
+  profile->key = 0;
+  profile->key_r = 0;
+  profile->key_g = 0;
+  profile->key_b = 0;
+}
+
+/*function used for debug purposes with C++*/
+/*void printColorProfile(LodePNGColorProfile* p) {
+  std::cout << "colored: " << (int)p->colored << ", ";
+  std::cout << "key: " << (int)p->key << ", ";
+  std::cout << "key_r: " << (int)p->key_r << ", ";
+  std::cout << "key_g: " << (int)p->key_g << ", ";
+  std::cout << "key_b: " << (int)p->key_b << ", ";
+  std::cout << "alpha: " << (int)p->alpha << ", ";
+  std::cout << "numcolors: " << (int)p->numcolors << ", ";
+  std::cout << "bits: " << (int)p->bits << std::endl;
+}*/
+
+/*Returns the smallest bit depth (1, 2, 4 or 8) able to represent the given
+8-bit sample exactly. 2-bit samples scale to multiples of 85 and 4-bit samples
+to multiples of 17; 0 and 255 already fit in a single bit.*/
+static unsigned getValueRequiredBits(unsigned char value) {
+  unsigned needed = 8;
+  if(value == 0 || value == 255) needed = 1;
+  else if(value % 85 == 0) needed = 2;
+  else if(value % 17 == 0) needed = 4;
+  return needed;
+}
+
+/*
+Scans the w x h pixels of `in` (interpreted with mode_in) and accumulates what
+it finds into `profile`:
+- colored: set when some pixel has r != g or r != b
+- alpha / key / key_r,g,b: whether real alpha is needed, or a single color key suffices
+- numcolors and palette: distinct RGBA colors seen (only the first 256 are stored)
+- bits: bits per channel needed (1, 2, 4, 8 or 16)
+- numpixels: increased by w * h
+profile must already have been inited.
+It's ok to set some parameters of profile to done already: a profile filled in
+by an earlier call is extended rather than reset.
+Returns `error`, which is never assigned anything but 0 in this function.
+*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+                                   const unsigned char* in, unsigned w, unsigned h,
+                                   const LodePNGColorMode* mode_in) {
+  unsigned error = 0;
+  size_t i;
+  ColorTree tree;
+  size_t numpixels = (size_t)w * (size_t)h;
+
+  /* mark things as done already if it would be impossible to have a more expensive case */
+  unsigned colored_done = lodepng_is_greyscale_type(mode_in) ? 1 : 0;
+  unsigned alpha_done = lodepng_can_have_alpha(mode_in) ? 0 : 1;
+  unsigned numcolors_done = 0;
+  unsigned bpp = lodepng_get_bpp(mode_in);
+  unsigned bits_done = (profile->bits == 1 && bpp == 1) ? 1 : 0;
+  unsigned sixteen = 0; /* whether the input image is 16 bit */
+  /*counting stops at 257 colors: one more than the 256 entries a palette can hold*/
+  unsigned maxnumcolors = 257;
+  if(bpp <= 8) maxnumcolors = LODEPNG_MIN(257, profile->numcolors + (1u << bpp));
+
+  profile->numpixels += numpixels;
+
+  color_tree_init(&tree);
+
+  /*If the profile was already filled in from previous data, fill its palette in tree
+  and mark things as done already if we know they are the most expensive case already*/
+  if(profile->alpha) alpha_done = 1;
+  if(profile->colored) colored_done = 1;
+  if(profile->bits == 16) numcolors_done = 1;
+  if(profile->bits >= bpp) bits_done = 1;
+  if(profile->numcolors >= maxnumcolors) numcolors_done = 1;
+
+  if(!numcolors_done) {
+    /*seed the tree with the colors collected by earlier calls so they are not counted twice*/
+    for(i = 0; i < profile->numcolors; i++) {
+      const unsigned char* color = &profile->palette[i * 4];
+      color_tree_add(&tree, color[0], color[1], color[2], color[3], (unsigned int)i);
+    }
+  }
+
+  /*Check if the 16-bit input is truly 16-bit*/
+  /*NOTE(review): sixteen is always 0 when this test is reached, so only the bitdepth check matters*/
+  if(mode_in->bitdepth == 16 && !sixteen) {
+    unsigned short r, g, b, a;
+    for(i = 0; i != numpixels; ++i) {
+      getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+      if((r & 255) != ((r >> 8) & 255) || (g & 255) != ((g >> 8) & 255) ||
+         (b & 255) != ((b >> 8) & 255) || (a & 255) != ((a >> 8) & 255)) /*first and second byte differ*/ {
+        profile->bits = 16;
+        sixteen = 1;
+        bits_done = 1;
+        numcolors_done = 1; /*counting colors no longer useful, palette doesn't support 16-bit*/
+        break;
+      }
+    }
+  }
+
+  if(sixteen) {
+    /*true 16-bit data: only colored-ness and alpha / color key remain to be determined*/
+    unsigned short r = 0, g = 0, b = 0, a = 0;
+
+    for(i = 0; i != numpixels; ++i) {
+      getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+
+      if(!colored_done && (r != g || r != b)) {
+        profile->colored = 1;
+        colored_done = 1;
+      }
+
+      if(!alpha_done) {
+        unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+        /*partially transparent, or fully transparent with a color other than the current key*/
+        if(a != 65535 && (a != 0 || (profile->key && !matchkey))) {
+          profile->alpha = 1;
+          profile->key = 0;
+          alpha_done = 1;
+        } else if(a == 0 && !profile->alpha && !profile->key) {
+          /*first fully transparent pixel: remember its color as the candidate color key*/
+          profile->key = 1;
+          profile->key_r = r;
+          profile->key_g = g;
+          profile->key_b = b;
+        } else if(a == 65535 && profile->key && matchkey) {
+          /* Color key cannot be used if an opaque pixel also has that RGB color. */
+          profile->alpha = 1;
+          profile->key = 0;
+          alpha_done = 1;
+        }
+      }
+      if(alpha_done && numcolors_done && colored_done && bits_done) break;
+    }
+
+    /*second pass: pixels seen before the key was chosen may be non-transparent
+    and have the key color, which rules the color key out*/
+    if(profile->key && !profile->alpha) {
+      for(i = 0; i != numpixels; ++i) {
+        getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+        if(a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b) {
+          /* Color key cannot be used if an opaque pixel also has that RGB color. */
+          profile->alpha = 1;
+          profile->key = 0;
+          alpha_done = 1;
+        }
+      }
+    }
+  } else /* < 16-bit */ {
+    /*also taken for 16-bit input whose high and low bytes are equal in every channel*/
+    unsigned char r = 0, g = 0, b = 0, a = 0;
+    for(i = 0; i != numpixels; ++i) {
+      getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode_in);
+
+      if(!bits_done && profile->bits < 8) {
+        /*only r is checked, < 8 bits is only relevant for grayscale*/
+        unsigned bits = getValueRequiredBits(r);
+        if(bits > profile->bits) profile->bits = bits;
+      }
+      bits_done = (profile->bits >= bpp);
+
+      if(!colored_done && (r != g || r != b)) {
+        profile->colored = 1;
+        colored_done = 1;
+        if(profile->bits < 8) profile->bits = 8; /*PNG has no colored modes with less than 8-bit per channel*/
+      }
+
+      if(!alpha_done) {
+        unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+        /*partially transparent, or fully transparent with a color other than the current key*/
+        if(a != 255 && (a != 0 || (profile->key && !matchkey))) {
+          profile->alpha = 1;
+          profile->key = 0;
+          alpha_done = 1;
+          if(profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+        } else if(a == 0 && !profile->alpha && !profile->key) {
+          /*first fully transparent pixel: remember its color as the candidate color key*/
+          profile->key = 1;
+          profile->key_r = r;
+          profile->key_g = g;
+          profile->key_b = b;
+        } else if(a == 255 && profile->key && matchkey) {
+          /* Color key cannot be used if an opaque pixel also has that RGB color. */
+          profile->alpha = 1;
+          profile->key = 0;
+          alpha_done = 1;
+          if(profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+        }
+      }
+
+      if(!numcolors_done) {
+        if(!color_tree_has(&tree, r, g, b, a)) {
+          /*new color: always counted, but only stored while the 256 entry palette has room*/
+          color_tree_add(&tree, r, g, b, a, profile->numcolors);
+          if(profile->numcolors < 256) {
+            unsigned char* p = profile->palette;
+            unsigned n = profile->numcolors;
+            p[n * 4 + 0] = r;
+            p[n * 4 + 1] = g;
+            p[n * 4 + 2] = b;
+            p[n * 4 + 3] = a;
+          }
+          ++profile->numcolors;
+          numcolors_done = profile->numcolors >= maxnumcolors;
+        }
+      }
+
+      if(alpha_done && numcolors_done && colored_done && bits_done) break;
+    }
+
+    /*second pass: pixels seen before the key was chosen may be non-transparent
+    and have the key color, which rules the color key out*/
+    if(profile->key && !profile->alpha) {
+      for(i = 0; i != numpixels; ++i) {
+        getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode_in);
+        if(a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b) {
+          /* Color key cannot be used if an opaque pixel also has that RGB color. */
+          profile->alpha = 1;
+          profile->key = 0;
+          alpha_done = 1;
+          if(profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+        }
+      }
+    }
+
+    /*make the profile's key always 16-bit for consistency - repeat each byte twice*/
+    /*NOTE(review): this runs on every call that takes this branch, whether or not a key is set
+    and whether or not an earlier call already expanded the key - confirm this is intended*/
+    profile->key_r += (profile->key_r << 8);
+    profile->key_g += (profile->key_g << 8);
+    profile->key_b += (profile->key_b << 8);
+  }
+
+  color_tree_cleanup(&tree);
+  return error;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*Feeds one extra color into an already inited color profile by profiling a
+1x1 16-bit RGBA image holding it. The channels must be given as 16-bit values
+(both bytes equal for an 8-bit color, 65535 for opaque alpha). Expensive: use
+it for a few additional values only, never for every pixel of an image.
+Returns the result of lodepng_get_color_profile.*/
+static unsigned lodepng_color_profile_add(LodePNGColorProfile* profile,
+                                          unsigned r, unsigned g, unsigned b, unsigned a) {
+  unsigned error;
+  unsigned c;
+  unsigned channel[4];
+  unsigned char pixel[8]; /*one RGBA pixel, 2 bytes per channel, most significant byte first*/
+  LodePNGColorMode mode;
+
+  channel[0] = r;
+  channel[1] = g;
+  channel[2] = b;
+  channel[3] = a;
+  for(c = 0; c != 4; ++c) {
+    pixel[2 * c + 0] = (unsigned char)(channel[c] >> 8);
+    pixel[2 * c + 1] = (unsigned char)channel[c];
+  }
+
+  lodepng_color_mode_init(&mode);
+  mode.colortype = LCT_RGBA;
+  mode.bitdepth = 16;
+  error = lodepng_get_color_profile(profile, pixel, 1, 1, &mode);
+  lodepng_color_mode_cleanup(&mode);
+
+  return error;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Chooses the output color mode from an already computed profile and stores it
+in mode_out. mode_in is only consulted to keep the original palette (and its
+order) when the input was a palette that can be reused as is.
+Returns 0, or the error of lodepng_palette_add if filling the palette failed.*/
+static unsigned auto_choose_color_from_profile(LodePNGColorMode* mode_out,
+                                               const LodePNGColorMode* mode_in,
+                                               const LodePNGColorProfile* prof) {
+  unsigned error = 0;
+  const size_t numpixels = prof->numpixels;
+  const size_t numcolors = prof->numcolors;
+  unsigned use_alpha = prof->alpha;
+  unsigned use_key = prof->key;
+  unsigned bits = prof->bits;
+  unsigned palettebits;
+  unsigned palette_ok;
+  size_t i;
+
+  mode_out->key_defined = 0;
+
+  /*too few pixels to justify tRNS chunk overhead: use a real alpha channel instead*/
+  if(use_key && numpixels <= 16) {
+    use_alpha = 1;
+    use_key = 0;
+    if(bits < 8) bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+  }
+
+  /*smallest palette bitdepth able to index all colors*/
+  if(numcolors <= 2) palettebits = 1;
+  else if(numcolors <= 4) palettebits = 2;
+  else if(numcolors <= 16) palettebits = 4;
+  else palettebits = 8;
+
+  palette_ok = (numcolors <= 256 && bits <= 8);
+  /*don't add palette overhead if image has only a few pixels*/
+  if(numpixels < numcolors * 2) palette_ok = 0;
+  /*gray is less overhead*/
+  if(!prof->colored && bits <= palettebits) palette_ok = 0;
+
+  if(!palette_ok) {
+    /*8-bit or 16-bit per channel*/
+    mode_out->bitdepth = bits;
+    if(use_alpha) mode_out->colortype = prof->colored ? LCT_RGBA : LCT_GREY_ALPHA;
+    else mode_out->colortype = prof->colored ? LCT_RGB : LCT_GREY;
+
+    if(use_key) {
+      /*profile always uses 16-bit, mask converts it*/
+      const unsigned mask = (1u << bits) - 1u;
+      mode_out->key_r = prof->key_r & mask;
+      mode_out->key_g = prof->key_g & mask;
+      mode_out->key_b = prof->key_b & mask;
+      mode_out->key_defined = 1;
+    }
+    return error;
+  }
+
+  /*palette output: replace any earlier palette by the colors of the profile*/
+  lodepng_palette_clear(mode_out);
+  for(i = 0; i < numcolors; ++i) {
+    const unsigned char* rgba = &prof->palette[i * 4];
+    error = lodepng_palette_add(mode_out, rgba[0], rgba[1], rgba[2], rgba[3]);
+    if(error) break;
+  }
+
+  mode_out->colortype = LCT_PALETTE;
+  mode_out->bitdepth = palettebits;
+
+  /*If input should have same palette colors, keep original to preserve its order and prevent conversion*/
+  if(mode_in->colortype == LCT_PALETTE && mode_in->palettesize >= mode_out->palettesize
+      && mode_in->bitdepth == mode_out->bitdepth) {
+    lodepng_color_mode_cleanup(mode_out);
+    lodepng_color_mode_copy(mode_out, mode_in);
+  }
+
+  return error;
+}
+
+/*Picks the color type that needs the fewest bits for the given image: gray
+when all pixels are gray, palette when there are few enough colors, a color
+key when a single color is transparent, and so on.
+mode_out holds the color model chosen by the user on entry and is overwritten
+with the (potentially smaller) model chosen here.
+Returns 0 on success, otherwise the error of the profiling or choosing step.*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+                                   const unsigned char* image, unsigned w, unsigned h,
+                                   const LodePNGColorMode* mode_in) {
+  LodePNGColorProfile prof;
+  unsigned error;
+
+  lodepng_color_profile_init(&prof);
+  error = lodepng_get_color_profile(&prof, image, w, h, mode_in);
+  if(!error) error = auto_choose_color_from_profile(mode_out, mode_in, &prof);
+
+  return error;
+}
+
+#endif /* #ifdef LODEPNG_COMPILE_ENCODER */
+
+/*
+Paeth predictor, used by PNG filter type 4.
+a is the byte to the left, b the byte above and c the byte above-left. The one
+closest to a + b - c is returned; ties are resolved in the order a, b, c.
+The parameters are shorts so the differences can be computed without wrapping,
+but the values passed in should come from unsigned chars.
+*/
+static unsigned char paethPredictor(short a, short b, short c) {
+  const short dist_a = abs(b - c);         /*distance of the estimate to a*/
+  const short dist_b = abs(a - c);         /*distance of the estimate to b*/
+  const short dist_c = abs(a + b - c - c); /*distance of the estimate to c*/
+
+  if(dist_c < dist_a && dist_c < dist_b) return (unsigned char)c;
+  return (unsigned char)(dist_b < dist_a ? b : a);
+}
+
+/*shared values used by multiple Adam7 related functions*/
+
+static const unsigned ADAM7_IX[7] = { 0, 4, 0, 2, 0, 1, 0 }; /*x start values*/
+static const unsigned ADAM7_IY[7] = { 0, 0, 4, 0, 2, 0, 1 }; /*y start values*/
+static const unsigned ADAM7_DX[7] = { 8, 8, 4, 4, 2, 2, 1 }; /*x delta values*/
+static const unsigned ADAM7_DY[7] = { 8, 8, 8, 4, 4, 2, 2 }; /*y delta values*/
+
+/*
+Outputs various dimensions and positions in the image related to the Adam7 reduced images.
+passw: output containing the width of the 7 passes
+passh: output containing the height of the 7 passes
+filter_passstart: output containing the index of the start and end of each
+ reduced image with filter bytes
+padded_passstart output containing the index of the start and end of each
+ reduced image when without filter bytes but with padded scanlines
+passstart: output containing the index of the start and end of each reduced
+ image without padding between scanlines, but still padding between the images
+w, h: width and height of non-interlaced image
+bpp: bits per pixel
+"padded" is only relevant if bpp is less than 8 and a scanline or image does not
+ end at a full byte
+All byte offsets are computed in size_t: products such as passh * passw * bpp do
+ not fit in an unsigned for large images and would silently wrap around.
+*/
+static void Adam7_getpassvalues(unsigned passw[7], unsigned passh[7], size_t filter_passstart[8],
+                                size_t padded_passstart[8], size_t passstart[8], unsigned w, unsigned h, unsigned bpp) {
+  /*the passstart values have 8 values: the 8th one indicates the byte after the end of the 7th (= last) pass*/
+  unsigned i;
+
+  /*calculate width and height in pixels of each pass*/
+  for(i = 0; i != 7; ++i) {
+    /*the sum is formed in size_t so it cannot wrap for w or h close to the maximum of unsigned;
+    the quotient never exceeds w resp. h, so it fits in an unsigned again*/
+    passw[i] = (unsigned)(((size_t)w + ADAM7_DX[i] - ADAM7_IX[i] - 1) / ADAM7_DX[i]);
+    passh[i] = (unsigned)(((size_t)h + ADAM7_DY[i] - ADAM7_IY[i] - 1) / ADAM7_DY[i]);
+    /*a pass without columns has no rows either, and the other way around*/
+    if(passw[i] == 0) passh[i] = 0;
+    if(passh[i] == 0) passw[i] = 0;
+  }
+
+  filter_passstart[0] = padded_passstart[0] = passstart[0] = 0;
+  for(i = 0; i != 7; ++i) {
+    /*bytes per scanline of this pass, padded up to a full byte*/
+    const size_t linebytes = ((size_t)passw[i] * bpp + 7) / 8;
+    /*if passw[i] is 0, it's 0 bytes, not 1 (no filtertype-byte)*/
+    filter_passstart[i + 1] = filter_passstart[i]
+                            + ((passw[i] && passh[i]) ? (size_t)passh[i] * (1 + linebytes) : 0);
+    /*bits padded if needed to fill full byte at end of each scanline*/
+    padded_passstart[i + 1] = padded_passstart[i] + (size_t)passh[i] * linebytes;
+    /*only padded at end of reduced image*/
+    passstart[i + 1] = passstart[i] + ((size_t)passh[i] * passw[i] * bpp + 7) / 8;
+  }
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Decoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Reads the PNG signature and the IHDR chunk from the first 33 bytes of `in`
+and stores width, height, bit depth, color type and the compression, filter
+and interlace methods in state->info_png. w and h may be null; when given they
+receive the image dimensions. The return value is the error code, which is
+also stored in state->error.*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h, LodePNGState* state,
+                         const unsigned char* in, size_t insize) {
+  static const unsigned char png_signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+  LodePNGInfo* const info = &state->info_png;
+  unsigned width, height;
+  unsigned i;
+
+  if(in == 0 || insize == 0) {
+    CERROR_RETURN_ERROR(state->error, 48); /*error: the given data is empty*/
+  }
+  if(insize < 33) {
+    CERROR_RETURN_ERROR(state->error, 27); /*error: the data length is smaller than the length of a PNG header*/
+  }
+
+  /*when decoding a new PNG image, make sure all parameters created after previous decoding are reset*/
+  /* TODO: remove this. One should use a new LodePNGState for new sessions */
+  lodepng_info_cleanup(info);
+  lodepng_info_init(info);
+
+  for(i = 0; i != 8; ++i) {
+    if(in[i] != png_signature[i]) {
+      CERROR_RETURN_ERROR(state->error, 28); /*error: the first 8 bytes are not the correct PNG signature*/
+    }
+  }
+  if(lodepng_chunk_length(&in[8]) != 13) {
+    CERROR_RETURN_ERROR(state->error, 94); /*error: header size must be 13 bytes*/
+  }
+  if(!lodepng_chunk_type_equals(&in[8], "IHDR")) {
+    CERROR_RETURN_ERROR(state->error, 29); /*error: it doesn't start with a IHDR chunk!*/
+  }
+
+  /*IHDR payload: it starts at byte 16, after the signature, chunk length and chunk type*/
+  width = lodepng_read32bitInt(in + 16);
+  height = lodepng_read32bitInt(in + 20);
+  info->color.bitdepth = in[24];
+  info->color.colortype = (LodePNGColorType)in[25];
+  info->compression_method = in[26];
+  info->filter_method = in[27];
+  info->interlace_method = in[28];
+
+  if(width == 0 || height == 0) {
+    CERROR_RETURN_ERROR(state->error, 93); /*error: an image dimension is 0*/
+  }
+
+  if(w) *w = width;
+  if(h) *h = height;
+
+  if(!state->decoder.ignore_crc) {
+    /*the CRC stored after the IHDR data covers the chunk type and the 13 data bytes*/
+    const unsigned stored_crc = lodepng_read32bitInt(in + 29);
+    const unsigned computed_crc = lodepng_crc32(in + 12, 17);
+    if(stored_crc != computed_crc) {
+      CERROR_RETURN_ERROR(state->error, 57); /*invalid CRC*/
+    }
+  }
+
+  if(info->compression_method != 0) {
+    CERROR_RETURN_ERROR(state->error, 32); /*error: only compression method 0 is allowed in the specification*/
+  }
+  if(info->filter_method != 0) {
+    CERROR_RETURN_ERROR(state->error, 33); /*error: only filter method 0 is allowed in the specification*/
+  }
+  if(info->interlace_method > 1) {
+    CERROR_RETURN_ERROR(state->error, 34); /*error: only interlace methods 0 and 1 exist in the specification*/
+  }
+
+  state->error = checkColorValidity(info->color.colortype, info->color.bitdepth);
+  return state->error;
+}
+
+/*
+Unfilters one scanline for PNG filter method 0.
+recon: receives the unfiltered scanline
+scanline: the filtered scanline, WITHOUT its leading filter type byte
+precon: the previous unfiltered scanline, or null for the first scanline
+bytewidth: bytes per pixel, 1 when pixels are smaller than a byte (the filter
+ then works byte per byte)
+filterType: the filter type byte that preceded the scanline
+length: amount of bytes in the scanline
+recon and scanline MAY be the same memory address! precon must be disjoint.
+Returns 0 on success, 36 when filterType is not one of 0..4.
+*/
+static unsigned unfilterScanline(unsigned char* recon, const unsigned char* scanline, const unsigned char* precon,
+                                 size_t bytewidth, unsigned char filterType, size_t length) {
+  size_t pos;
+
+  switch(filterType) {
+    case 0: /*None*/
+      for(pos = 0; pos != length; ++pos) recon[pos] = scanline[pos];
+      break;
+
+    case 1: /*Sub: add the byte of the pixel to the left*/
+      for(pos = 0; pos != bytewidth; ++pos) recon[pos] = scanline[pos];
+      for(; pos < length; ++pos) recon[pos] = scanline[pos] + recon[pos - bytewidth];
+      break;
+
+    case 2: /*Up: add the byte above, which counts as 0 on the first scanline*/
+      for(pos = 0; pos != length; ++pos) {
+        const unsigned char above = precon ? precon[pos] : 0;
+        recon[pos] = scanline[pos] + above;
+      }
+      break;
+
+    case 3: /*Average: add the floored mean of the left and above bytes (missing ones count as 0)*/
+      for(pos = 0; pos != bytewidth; ++pos) {
+        const unsigned char above = precon ? precon[pos] : 0;
+        recon[pos] = scanline[pos] + (above >> 1);
+      }
+      for(; pos < length; ++pos) {
+        const unsigned char above = precon ? precon[pos] : 0;
+        recon[pos] = scanline[pos] + ((recon[pos - bytewidth] + above) >> 1);
+      }
+      break;
+
+    case 4: /*Paeth*/
+      if(precon) {
+        /*first pixel: paethPredictor(0, precon[pos], 0) is always precon[pos]*/
+        for(pos = 0; pos != bytewidth; ++pos) recon[pos] = (scanline[pos] + precon[pos]);
+        for(; pos < length; ++pos) {
+          recon[pos] = (scanline[pos]
+                        + paethPredictor(recon[pos - bytewidth], precon[pos], precon[pos - bytewidth]));
+        }
+      } else {
+        for(pos = 0; pos != bytewidth; ++pos) recon[pos] = scanline[pos];
+        /*paethPredictor(recon[pos - bytewidth], 0, 0) is always recon[pos - bytewidth]*/
+        for(; pos < length; ++pos) recon[pos] = (scanline[pos] + recon[pos - bytewidth]);
+      }
+      break;
+
+    default:
+      return 36; /*error: unexisting filter type given*/
+  }
+
+  return 0;
+}
+
+/*
+Unfilters a single image for PNG filter method 0 (without interlacing this is
+called once, with Adam7 seven times).
+out: receives h scanlines of linebytes bytes each; must be allocated already
+in: the filtered data, every scanline preceded by 1 filter type byte
+w, h: dimensions of the image or of the reduced image
+bpp: bits per pixel
+in and out are allowed to be the same memory address (but aren't the same size
+since in has the extra filter bytes).
+Returns 0 on success, otherwise the error of unfilterScanline, passed on
+through CERROR_TRY_RETURN.
+*/
+static unsigned unfilter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp) {
+  unsigned y;
+  unsigned char* prevline = 0; /*previous unfiltered scanline, null for the first one*/
+
+  /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+  size_t bytewidth = (bpp + 7) / 8;
+  /*computed in size_t: w * bpp does not fit in an unsigned for very wide images
+  and would wrap around before being widened*/
+  size_t linebytes = ((size_t)w * bpp + 7) / 8;
+
+  for(y = 0; y < h; ++y) {
+    size_t outindex = linebytes * y;
+    size_t inindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+    unsigned char filterType = in[inindex];
+
+    CERROR_TRY_RETURN(unfilterScanline(&out[outindex], &in[inindex + 1], prevline, bytewidth, filterType, linebytes));
+
+    prevline = &out[outindex];
+  }
+
+  return 0;
+}
+
+/*
+in: Adam7 interlaced image, with no padding bits between scanlines, but between
+ reduced images so that each reduced image starts at a byte.
+out: the same pixels, but re-ordered so that they're now a non-interlaced image with size w*h
+bpp: bits per pixel
+out has the following size in bits: w * h * bpp.
+in is possibly bigger due to padding bits between reduced images.
+out must be big enough AND must be 0 everywhere if bpp < 8 in the current implementation
+(because that's likely a little bit faster)
+NOTE: comments about padding bits are only relevant if bpp < 8
+All byte and bit positions are computed in size_t: the products of coordinates,
+widths and bpp do not fit in an unsigned for large images.
+*/
+static void Adam7_deinterlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp) {
+  unsigned passw[7], passh[7];
+  size_t filter_passstart[8], padded_passstart[8], passstart[8];
+  unsigned i;
+
+  Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+  if(bpp >= 8) {
+    /*whole bytes per pixel: copy each pixel of each pass to its final position*/
+    for(i = 0; i != 7; ++i) {
+      unsigned x, y, b;
+      size_t bytewidth = bpp / 8;
+      for(y = 0; y < passh[i]; ++y)
+      for(x = 0; x < passw[i]; ++x) {
+        size_t pixelinstart = passstart[i] + ((size_t)y * passw[i] + x) * bytewidth;
+        size_t pixeloutstart = ((ADAM7_IY[i] + (size_t)y * ADAM7_DY[i]) * w
+                                + ADAM7_IX[i] + (size_t)x * ADAM7_DX[i]) * bytewidth;
+        for(b = 0; b < bytewidth; ++b) {
+          out[pixeloutstart + b] = in[pixelinstart + b];
+        }
+      }
+    }
+  } else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/ {
+    for(i = 0; i != 7; ++i) {
+      unsigned x, y, b;
+      size_t ilinebits = (size_t)bpp * passw[i];
+      size_t olinebits = (size_t)bpp * w;
+      size_t obp, ibp; /*bit pointers (for out and in buffer)*/
+      for(y = 0; y < passh[i]; ++y)
+      for(x = 0; x < passw[i]; ++x) {
+        ibp = (8 * passstart[i]) + ((size_t)y * ilinebits + (size_t)x * bpp);
+        obp = (ADAM7_IY[i] + (size_t)y * ADAM7_DY[i]) * olinebits
+            + (ADAM7_IX[i] + (size_t)x * ADAM7_DX[i]) * bpp;
+        for(b = 0; b < bpp; ++b) {
+          unsigned char bit = readBitFromReversedStream(&ibp, in);
+          /*note that this function assumes the out buffer is completely 0, use setBitOfReversedStream otherwise*/
+          setBitOfReversedStream0(&obp, out, bit);
+        }
+      }
+    }
+  }
+}
+
+/*
+Copies h scanlines of olinebits bits each from `in` to `out`, skipping the
+(ilinebits - olinebits) padding bits that follow every input scanline.
+After filtering there are still padding bits if scanlines have non multiple of
+8 bit amounts. They need to be removed (except at last scanline of
+(Adam7-reduced) image) before working with pure image buffers for the Adam7
+code, the color convert code and the output to the user.
+in and out are allowed to be the same buffer, in may also be higher but still
+overlapping; in must have >= ilinebits*h bits, out must have >= olinebits*h
+bits, olinebits must be <= ilinebits.
+Also used to move bits after earlier such operations happened, e.g. in a
+sequence of reduced images from Adam7.
+Only useful if (ilinebits - olinebits) is a value in the range 1..7.
+*/
+static void removePaddingBits(unsigned char* out, const unsigned char* in,
+                              size_t olinebits, size_t ilinebits, unsigned h) {
+  const size_t padding = ilinebits - olinebits;
+  size_t inbit = 0, outbit = 0; /*input and output bit pointers*/
+  unsigned row;
+
+  for(row = 0; row != h; ++row) {
+    size_t remaining;
+    for(remaining = olinebits; remaining != 0; --remaining) {
+      const unsigned char bit = readBitFromReversedStream(&inbit, in);
+      setBitOfReversedStream(&outbit, out, bit);
+    }
+    inbit += padding; /*skip the padding at the end of the input scanline*/
+  }
+}
+
+/*
+Converts the filtered-padded-interlaced data into a pure 2D image buffer with
+the PNG's colortype.
+out must be buffer big enough to contain full image, and in must contain the
+full decompressed data from the IDAT chunks (with filter index bytes and
+possible padding bits).
+Steps:
+*) if no Adam7: 1) unfilter 2) remove padding bits (= possible extra bits per scanline if bpp < 8)
+*) if adam7: 1) 7x unfilter 2) 7x remove padding bits 3) Adam7_deinterlace
+NOTE: the in buffer will be overwritten with intermediate data!
+Scanline sizes in bits are computed in size_t, because w * bpp does not fit in
+an unsigned for very wide images.
+return value is error: 31 when the color type yields 0 bits per pixel, or the
+error passed on from unfilter through CERROR_TRY_RETURN.
+*/
+static unsigned postProcessScanlines(unsigned char* out, unsigned char* in,
+                                     unsigned w, unsigned h, const LodePNGInfo* info_png) {
+  unsigned bpp = lodepng_get_bpp(&info_png->color);
+  if(bpp == 0) return 31; /*error: invalid colortype*/
+
+  if(info_png->interlace_method == 0) {
+    size_t linebits = (size_t)w * bpp;                 /*bits of pixel data per scanline*/
+    size_t padded_linebits = ((linebits + 7) / 8) * 8; /*the same, rounded up to full bytes*/
+    if(bpp < 8 && linebits != padded_linebits) {
+      /*unfilter in place, then squeeze the padding bits out while copying to out*/
+      CERROR_TRY_RETURN(unfilter(in, in, w, h, bpp));
+      removePaddingBits(out, in, linebits, padded_linebits, h);
+    } else {
+      /*we can immediately filter into the out buffer, no other steps needed*/
+      CERROR_TRY_RETURN(unfilter(out, in, w, h, bpp));
+    }
+  } else /*interlace_method is 1 (Adam7)*/ {
+    unsigned passw[7], passh[7]; size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned i;
+
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    for(i = 0; i != 7; ++i) {
+      CERROR_TRY_RETURN(unfilter(&in[padded_passstart[i]], &in[filter_passstart[i]], passw[i], passh[i], bpp));
+      /*TODO: possible efficiency improvement: if in this reduced image the bits fit nicely in 1 scanline,
+      move bytes instead of bits or move not at all*/
+      if(bpp < 8) {
+        /*remove padding bits in scanlines; after this there still may be padding
+        bits between the different reduced images: each reduced image still starts nicely at a byte*/
+        size_t passlinebits = (size_t)passw[i] * bpp;
+        removePaddingBits(&in[passstart[i]], &in[padded_passstart[i]], passlinebits,
+                          ((passlinebits + 7) / 8) * 8, passh[i]);
+      }
+    }
+
+    Adam7_deinterlace(out, in, w, h, bpp);
+  }
+
+  return 0;
+}
+
+/*
+Reads the data of a PLTE chunk into the palette of `color`.
+data: the chunk data, 3 bytes (R, G, B) per palette entry
+chunkLength: length of the chunk data in bytes; chunkLength / 3 entries are read
+Every entry gets alpha 255.
+The entry count is validated BEFORE the existing palette is released and before
+anything is allocated, so an oversized chunk from an untrusted file neither
+triggers an allocation sized by that chunk nor leaves `color` with a
+palettesize above 256.
+Returns 0 on success, 38 if there are more than 256 entries (color is left
+untouched in that case), 83 if the allocation fails (the palette is then empty).
+*/
+static unsigned readChunk_PLTE(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength) {
+  size_t pos = 0, i;
+  size_t numentries = chunkLength / 3;
+
+  if(numentries > 256) return 38; /*error: palette too big*/
+
+  if(color->palette) lodepng_free(color->palette);
+  color->palettesize = numentries;
+  color->palette = (unsigned char*)lodepng_malloc(4 * numentries);
+  if(!color->palette && color->palettesize) {
+    color->palettesize = 0;
+    return 83; /*alloc fail*/
+  }
+
+  for(i = 0; i != numentries; ++i) {
+    color->palette[4 * i + 0] = data[pos++]; /*R*/
+    color->palette[4 * i + 1] = data[pos++]; /*G*/
+    color->palette[4 * i + 2] = data[pos++]; /*B*/
+    color->palette[4 * i + 3] = 255; /*alpha*/
+  }
+
+  return 0; /* OK */
+}
+
+/*
+Reads the data of a tRNS chunk into `color`.
+Palette images: one alpha byte per palette entry, starting at entry 0.
+Grayscale images: a single 16-bit value, used for all three key channels.
+RGB images: three 16-bit values, one per key channel.
+Returns 0 on success, 39 if there are more alpha values than palette entries,
+30 or 41 if the chunk length is wrong for a gray resp. RGB image, 42 if the
+color type cannot have a tRNS chunk.
+*/
+static unsigned readChunk_tRNS(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength) {
+  unsigned i;
+
+  switch(color->colortype) {
+    case LCT_PALETTE:
+      /*error: more alpha values given than there are palette entries*/
+      if(chunkLength > color->palettesize) return 39;
+      for(i = 0; i != chunkLength; ++i) color->palette[4 * i + 3] = data[i];
+      break;
+
+    case LCT_GREY:
+      /*error: this chunk must be 2 bytes for grayscale image*/
+      if(chunkLength != 2) return 30;
+      color->key_defined = 1;
+      color->key_r = color->key_g = color->key_b = 256u * data[0] + data[1];
+      break;
+
+    case LCT_RGB:
+      /*error: this chunk must be 6 bytes for RGB image*/
+      if(chunkLength != 6) return 41;
+      color->key_defined = 1;
+      color->key_r = 256u * data[0] + data[1];
+      color->key_g = 256u * data[2] + data[3];
+      color->key_b = 256u * data[4] + data[5];
+      break;
+
+    default:
+      return 42; /*error: tRNS chunk not allowed for other color models*/
+  }
+
+  return 0; /* OK */
+}
+
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*background color chunk (bKGD)
+Stores the suggested background in info->background_r/g/b and sets background_defined.
+For palette images the value is a palette index, for the other color types it is a
+16-bit big-endian sample per channel. Returns 0 if OK, 43/44/45 when the chunk size does
+not fit the color type, 103 when the palette index is out of range. Color types not
+listed below are silently accepted without storing anything.*/
+static unsigned readChunk_bKGD(LodePNGInfo* info, const unsigned char* data, size_t chunkLength) {
+  switch(info->color.colortype) {
+    case LCT_PALETTE:
+      /*error: this chunk must be 1 byte for indexed color image*/
+      if(chunkLength != 1) return 43;
+      /*error: invalid palette index, or maybe this chunk appeared before PLTE*/
+      if(data[0] >= info->color.palettesize) return 103;
+
+      info->background_defined = 1;
+      info->background_r = info->background_g = info->background_b = data[0];
+      break;
+
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+      /*error: this chunk must be 2 bytes for grayscale image*/
+      if(chunkLength != 2) return 44;
+
+      /*the values are truncated to bitdepth in the PNG file*/
+      info->background_defined = 1;
+      info->background_r = info->background_g = info->background_b = 256u * data[0] + data[1];
+      break;
+
+    case LCT_RGB:
+    case LCT_RGBA:
+      /*error: this chunk must be 6 bytes for RGB image*/
+      if(chunkLength != 6) return 45;
+
+      /*the values are truncated to bitdepth in the PNG file*/
+      info->background_defined = 1;
+      info->background_r = 256u * data[0] + data[1];
+      info->background_g = 256u * data[2] + data[3];
+      info->background_b = 256u * data[4] + data[5];
+      break;
+
+    default:
+      break;
+  }
+
+  return 0; /* OK */
+}
+
+/*text chunk (tEXt)
+Splits the payload into a keyword (1 to 79 bytes, up to the first 0 byte) and the text
+that follows it, and hands copies of both to lodepng_add_text. The temporary copies are
+freed here whatever the outcome. Returns 0 if OK, 89 for a bad keyword length, 83 on
+allocation failure, or whatever lodepng_add_text returns.*/
+static unsigned readChunk_tEXt(LodePNGInfo* info, const unsigned char* data, size_t chunkLength) {
+  unsigned error = 0;
+  char* keyword = 0;
+  char* text = 0;
+  unsigned keylen = 0, textpos, textlen, k;
+
+  do /*single pass, only used to break on error*/ {
+    while(keylen < chunkLength && data[keylen] != 0) ++keylen;
+    /*even though it's not allowed by the standard, no error is thrown if
+    there's no null termination char, if the text is empty*/
+    if(keylen < 1 || keylen > 79) { error = 89; break; } /*keyword too short or long*/
+
+    keyword = (char*)lodepng_malloc(keylen + 1);
+    if(!keyword) { error = 83; break; } /*alloc fail*/
+    for(k = 0; k < keylen; ++k) keyword[k] = (char)data[k];
+    keyword[keylen] = 0;
+
+    textpos = keylen + 1; /*skip keyword null terminator*/
+    if(chunkLength < textpos) textlen = 0;
+    else textlen = (unsigned)(chunkLength - textpos);
+
+    text = (char*)lodepng_malloc(textlen + 1);
+    if(!text) { error = 83; break; } /*alloc fail*/
+    for(k = 0; k < textlen; ++k) text[k] = (char)data[textpos + k];
+    text[textlen] = 0;
+
+    error = lodepng_add_text(info, keyword, text);
+  } while(0);
+
+  lodepng_free(keyword);
+  lodepng_free(text);
+
+  return error;
+}
+
+/*compressed text chunk (zTXt)
+Payload layout: keyword (1 to 79 bytes), 0 terminator, compression method byte (must be 0),
+then the zlib-compressed text. The text is decompressed, 0-terminated and passed together
+with the keyword to lodepng_add_text. Returns 0 if OK, 75 if the chunk is too short to
+hold the terminator and method byte, 89 for a bad keyword length, 72 for an unknown
+compression method, 83 on allocation failure, or the zlib / lodepng_add_text error.*/
+static unsigned readChunk_zTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+                               const unsigned char* data, size_t chunkLength) {
+  unsigned error = 0;
+  unsigned i;
+
+  unsigned length, string2_begin;
+  char *key = 0;
+  ucvector decoded;
+
+  ucvector_init(&decoded);
+
+  while(!error) /*not really a while loop, only used to break on error*/ {
+    for(length = 0; length < chunkLength && data[length] != 0; ++length) ;
+    if(length + 2 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+    if(length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+    key = (char*)lodepng_malloc(length + 1);
+    if(!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    key[length] = 0;
+    for(i = 0; i != length; ++i) key[i] = (char)data[i];
+
+    if(data[length + 1] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+    string2_begin = length + 2;
+    if(string2_begin > chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+
+    length = (unsigned)chunkLength - string2_begin;
+    /*will fail if zlib error, e.g. if length is too small*/
+    error = zlib_decompress(&decoded.data, &decoded.size,
+                            (unsigned char*)(&data[string2_begin]),
+                            length, zlibsettings);
+    if(error) break;
+    /*the decoded bytes are used as a C string below, so failing to append the
+    terminator must be reported instead of passing on an unterminated buffer*/
+    if(!ucvector_push_back(&decoded, 0)) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    error = lodepng_add_text(info, key, (char*)decoded.data);
+
+    break;
+  }
+
+  lodepng_free(key);
+  ucvector_cleanup(&decoded);
+
+  return error;
+}
+
+/*international text chunk (iTXt)
+Payload layout: keyword (1 to 79 bytes), 0, compression flag, compression method (must
+be 0), language tag, 0, translated keyword, 0, text. The text is zlib-decompressed when the
+compression flag is non-zero. Copies of the four strings are passed to lodepng_add_itext
+and all temporaries are freed here. Returns 0 if OK, 30 if the chunk is shorter than
+5 bytes, 75 if there is no room after the keyword, 89 for a bad keyword length, 72 for an
+unknown compression method, 83 on allocation failure, or the zlib / lodepng_add_itext error.*/
+static unsigned readChunk_iTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+                               const unsigned char* data, size_t chunkLength) {
+  unsigned error = 0;
+  unsigned i;
+
+  unsigned length, begin, compressed;
+  char *key = 0, *langtag = 0, *transkey = 0;
+  ucvector decoded;
+  ucvector_init(&decoded); /* TODO: only use in case of compressed text */
+
+  while(!error) /*not really a while loop, only used to break on error*/ {
+    /*Quick check if the chunk length isn't too small. Even without check
+    it'd still fail with other error checks below if it's too short. This just gives a different error code.*/
+    if(chunkLength < 5) CERROR_BREAK(error, 30); /*iTXt chunk too short*/
+
+    /*read the key*/
+    for(length = 0; length < chunkLength && data[length] != 0; ++length) ;
+    if(length + 3 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination char, corrupt?*/
+    if(length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+    key = (char*)lodepng_malloc(length + 1);
+    if(!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    key[length] = 0;
+    for(i = 0; i != length; ++i) key[i] = (char)data[i];
+
+    /*read the compression method*/
+    compressed = data[length + 1];
+    if(data[length + 2] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+    /*even though it's not allowed by the standard, no error is thrown if
+    there's no null termination char, if the text is empty for the next 3 texts*/
+
+    /*read the langtag*/
+    begin = length + 3;
+    length = 0;
+    for(i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+    langtag = (char*)lodepng_malloc(length + 1);
+    if(!langtag) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    langtag[length] = 0;
+    for(i = 0; i != length; ++i) langtag[i] = (char)data[begin + i];
+
+    /*read the transkey*/
+    begin += length + 1;
+    length = 0;
+    for(i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+    transkey = (char*)lodepng_malloc(length + 1);
+    if(!transkey) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    transkey[length] = 0;
+    for(i = 0; i != length; ++i) transkey[i] = (char)data[begin + i];
+
+    /*read the actual text*/
+    begin += length + 1;
+
+    /*begin may lie past the end when terminators are missing; the text is then empty*/
+    length = (unsigned)chunkLength < begin ? 0 : (unsigned)chunkLength - begin;
+
+    if(compressed) {
+      /*will fail if zlib error, e.g. if length is too small*/
+      error = zlib_decompress(&decoded.data, &decoded.size,
+                              (unsigned char*)(&data[begin]),
+                              length, zlibsettings);
+      if(error) break;
+      if(decoded.allocsize < decoded.size) decoded.allocsize = decoded.size;
+      /*the decoded bytes are used as a C string below, so failing to append the
+      terminator must be reported instead of passing on an unterminated buffer*/
+      if(!ucvector_push_back(&decoded, 0)) CERROR_BREAK(error, 83 /*alloc fail*/);
+    } else {
+      if(!ucvector_resize(&decoded, length + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+
+      decoded.data[length] = 0;
+      for(i = 0; i != length; ++i) decoded.data[i] = data[begin + i];
+    }
+
+    error = lodepng_add_itext(info, key, langtag, transkey, (char*)decoded.data);
+
+    break;
+  }
+
+  lodepng_free(key);
+  lodepng_free(langtag);
+  lodepng_free(transkey);
+  ucvector_cleanup(&decoded);
+
+  return error;
+}
+
+/*Reads a tIME chunk (7 bytes: 16-bit big-endian year, then month, day, hour, minute,
+second as one byte each) into info->time and sets info->time_defined.
+Returns 0 if OK, 73 if the chunk is not exactly 7 bytes.*/
+static unsigned readChunk_tIME(LodePNGInfo* info, const unsigned char* data, size_t chunkLength) {
+  if(chunkLength == 7) {
+    info->time_defined = 1;
+    info->time.year = 256u * data[0] + data[1];
+    info->time.month = data[2];
+    info->time.day = data[3];
+    info->time.hour = data[4];
+    info->time.minute = data[5];
+    info->time.second = data[6];
+    return 0; /* OK */
+  }
+
+  return 73; /*invalid tIME chunk size*/
+}
+
+/*Reads a pHYs chunk (9 bytes: two 32-bit big-endian values followed by a unit byte) into
+info->phys_x, info->phys_y and info->phys_unit, and sets info->phys_defined.
+Returns 0 if OK, 74 if the chunk is not exactly 9 bytes.*/
+static unsigned readChunk_pHYs(LodePNGInfo* info, const unsigned char* data, size_t chunkLength) {
+  const unsigned char* x = data;
+  const unsigned char* y = data + 4;
+
+  if(chunkLength != 9) return 74; /*invalid pHYs chunk size*/
+
+  info->phys_defined = 1;
+  info->phys_x = 16777216u * x[0] + 65536u * x[1] + 256u * x[2] + x[3];
+  info->phys_y = 16777216u * y[0] + 65536u * y[1] + 256u * y[2] + y[3];
+  info->phys_unit = data[8];
+
+  return 0; /* OK */
+}
+
+/*Reads a gAMA chunk (one 32-bit big-endian value) into info->gama_gamma and sets
+info->gama_defined. Returns 0 if OK, 96 if the chunk is not exactly 4 bytes.*/
+static unsigned readChunk_gAMA(LodePNGInfo* info, const unsigned char* data, size_t chunkLength) {
+  if(chunkLength == 4) {
+    info->gama_defined = 1;
+    info->gama_gamma = 16777216u * data[0] + 65536u * data[1] + 256u * data[2] + data[3];
+    return 0; /* OK */
+  }
+
+  return 96; /*invalid gAMA chunk size*/
+}
+
+/*Reads a cHRM chunk: eight consecutive 32-bit big-endian values, stored in file order
+as white x/y, red x/y, green x/y, blue x/y. Sets info->chrm_defined.
+Returns 0 if OK, 97 if the chunk is not exactly 32 bytes.*/
+static unsigned readChunk_cHRM(LodePNGInfo* info, const unsigned char* data, size_t chunkLength) {
+  unsigned value[8];
+  unsigned k;
+
+  if(chunkLength != 32) return 97; /*invalid cHRM chunk size*/
+
+  /*decode the eight big-endian fields first, then assign them by name*/
+  for(k = 0; k != 8; ++k) {
+    const unsigned char* p = data + 4 * k;
+    value[k] = 16777216u * p[0] + 65536u * p[1] + 256u * p[2] + p[3];
+  }
+
+  info->chrm_defined = 1;
+  info->chrm_white_x = value[0];
+  info->chrm_white_y = value[1];
+  info->chrm_red_x   = value[2];
+  info->chrm_red_y   = value[3];
+  info->chrm_green_x = value[4];
+  info->chrm_green_y = value[5];
+  info->chrm_blue_x  = value[6];
+  info->chrm_blue_y  = value[7];
+
+  return 0; /* OK */
+}
+
+/*Reads an sRGB chunk (a single byte) into info->srgb_intent and sets info->srgb_defined.
+Returns 0 if OK, 98 if the chunk is not exactly 1 byte.*/
+static unsigned readChunk_sRGB(LodePNGInfo* info, const unsigned char* data, size_t chunkLength) {
+  if(chunkLength == 1) {
+    info->srgb_defined = 1;
+    info->srgb_intent = data[0];
+    return 0; /* OK */
+  }
+
+  return 98; /*invalid sRGB chunk size (this one is never ignored)*/
+}
+
+/*Reads an iCCP chunk: profile name (1 to 79 bytes), 0 terminator, compression method
+byte (must be 0), then the zlib-compressed ICC profile. The name is stored in
+info->iccp_name and the decompressed profile in info->iccp_profile / iccp_profile_size
+(both allocated with lodepng_malloc and left in info, also when an error is returned
+after they were allocated). Returns 0 if OK, 75 if the chunk is too short, 89 for a bad
+name length, 72 for an unknown compression method, 83 on allocation failure, or the
+zlib error.*/
+static unsigned readChunk_iCCP(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+                               const unsigned char* data, size_t chunkLength) {
+  unsigned error = 0;
+  unsigned i;
+
+  unsigned length, string2_begin;
+  ucvector decoded;
+
+  /*Drop a profile left over from an earlier iCCP chunk first, and only then mark the
+  profile as defined: presumably lodepng_clear_icc also resets iccp_defined (its body is
+  not visible here, TODO confirm), so setting the flag before the clear could lose it.*/
+  if(info->iccp_name) lodepng_clear_icc(info);
+  info->iccp_defined = 1;
+
+  for(length = 0; length < chunkLength && data[length] != 0; ++length) ;
+  if(length + 2 >= chunkLength) return 75; /*no null termination, corrupt?*/
+  if(length < 1 || length > 79) return 89; /*keyword too short or long*/
+
+  info->iccp_name = (char*)lodepng_malloc(length + 1);
+  if(!info->iccp_name) return 83; /*alloc fail*/
+
+  info->iccp_name[length] = 0;
+  for(i = 0; i != length; ++i) info->iccp_name[i] = (char)data[i];
+
+  if(data[length + 1] != 0) return 72; /*the 0 byte indicating compression must be 0*/
+
+  string2_begin = length + 2;
+  if(string2_begin > chunkLength) return 75; /*no null termination, corrupt?*/
+
+  length = (unsigned)chunkLength - string2_begin;
+  ucvector_init(&decoded);
+  error = zlib_decompress(&decoded.data, &decoded.size,
+                          (unsigned char*)(&data[string2_begin]),
+                          length, zlibsettings);
+  if(!error) {
+    /*the profile is copied into its own exactly-sized buffer owned by info*/
+    info->iccp_profile_size = (unsigned int)decoded.size;
+    info->iccp_profile = (unsigned char*)lodepng_malloc(decoded.size);
+    if(info->iccp_profile) {
+      memcpy(info->iccp_profile, decoded.data, decoded.size);
+    } else {
+      error = 83; /* alloc fail */
+    }
+  }
+  ucvector_cleanup(&decoded);
+  return error;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Parses the single chunk starting at byte offset pos of the PNG data in[0..insize) and
+stores what it finds in state->info_png. Chunk types without a reader here are skipped
+without error. The CRC is verified only for chunks that were parsed successfully and only
+when state->decoder.ignore_crc is 0. Returns 0 if OK, 30 if the chunk does not fit in
+the buffer, 63 if its length field exceeds 2147483647, 57 on a CRC mismatch, or the error
+of the chunk reader.*/
+unsigned lodepng_inspect_chunk(LodePNGState* state, size_t pos,
+                               const unsigned char* in, size_t insize) {
+  const unsigned char* chunk = in + pos;
+  const unsigned char* payload;
+  unsigned payloadsize;
+  unsigned recognized = 1;
+  unsigned error = 0;
+  LodePNGInfo* info = &state->info_png;
+
+  if(pos + 4 > insize) return 30;
+  payloadsize = lodepng_chunk_length(chunk);
+  if(payloadsize > 2147483647) return 63;
+  payload = lodepng_chunk_data_const(chunk);
+  if(payload + payloadsize + 4 > in + insize) return 30;
+
+  if(lodepng_chunk_type_equals(chunk, "PLTE")) {
+    error = readChunk_PLTE(&info->color, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "tRNS")) {
+    error = readChunk_tRNS(&info->color, payload, payloadsize);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  } else if(lodepng_chunk_type_equals(chunk, "bKGD")) {
+    error = readChunk_bKGD(info, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "tEXt")) {
+    error = readChunk_tEXt(info, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "zTXt")) {
+    error = readChunk_zTXt(info, &state->decoder.zlibsettings, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "iTXt")) {
+    error = readChunk_iTXt(info, &state->decoder.zlibsettings, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "tIME")) {
+    error = readChunk_tIME(info, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "pHYs")) {
+    error = readChunk_pHYs(info, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "gAMA")) {
+    error = readChunk_gAMA(info, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "cHRM")) {
+    error = readChunk_cHRM(info, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "sRGB")) {
+    error = readChunk_sRGB(info, payload, payloadsize);
+  } else if(lodepng_chunk_type_equals(chunk, "iCCP")) {
+    error = readChunk_iCCP(info, &state->decoder.zlibsettings, payload, payloadsize);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+  } else {
+    /* unhandled chunk is ok (is not an error) */
+    recognized = 0;
+  }
+
+  /*nothing more to do on a parse error, for a skipped chunk, or when CRCs are ignored*/
+  if(error || !recognized || state->decoder.ignore_crc) return error;
+
+  return lodepng_chunk_check_crc(chunk) ? 57 /*invalid CRC*/ : 0;
+}
+
+/*read a PNG, the result will be in the same color type as the PNG (hence "generic")
+The outcome is reported through state->error (0 means success), there is no return value.
+*out, *w and *h are reset to 0 first; w and h are then handed to lodepng_inspect together
+with the header. *out is allocated with lodepng_malloc only after the IDAT data was
+decompressed successfully and is not freed here if postProcessScanlines fails afterwards.
+No color conversion to state->info_raw is done in this function.*/
+static void decodeGeneric(unsigned char** out, unsigned* w, unsigned* h,
+                          LodePNGState* state,
+                          const unsigned char* in, size_t insize) {
+  unsigned char IEND = 0; /*becomes 1 when the IEND chunk is reached, which ends the chunk loop*/
+  const unsigned char* chunk;
+  size_t i;
+  ucvector idat; /*the data from idat chunks*/
+  ucvector scanlines;
+  size_t predict;
+  size_t outsize = 0;
+
+  /*for unknown chunk order*/
+  unsigned unknown = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  unsigned critical_pos = 1; /*1 = after IHDR, 2 = after PLTE, 3 = after IDAT*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+
+  /* safe output values in case error happens */
+  *out = 0;
+  *w = *h = 0;
+
+  state->error = lodepng_inspect(w, h, state, in, insize); /*reads header and resets other parameters in state->info_png*/
+  if(state->error) return;
+
+  if(lodepng_pixel_overflow(*w, *h, &state->info_png.color, &state->info_raw)) {
+    CERROR_RETURN(state->error, 92); /*overflow possible due to amount of pixels*/
+  }
+
+  ucvector_init(&idat);
+  chunk = &in[33]; /*first byte of the first chunk after the header*/
+
+  /*loop through the chunks, ignoring unknown chunks and stopping at IEND chunk.
+  The payload of every IDAT chunk is appended to the idat vector*/
+  while(!IEND && !state->error) {
+    unsigned chunkLength;
+    const unsigned char* data; /*the data in the chunk*/
+
+    /*error: size of the in buffer too small to contain next chunk*/
+    if((size_t)((chunk - in) + 12) > insize || chunk < in) {
+      if(state->decoder.ignore_end) break; /*other errors may still happen though*/
+      CERROR_BREAK(state->error, 30);
+    }
+
+    /*length of the data of the chunk, excluding the length bytes, chunk type and CRC bytes*/
+    chunkLength = lodepng_chunk_length(chunk);
+    /*error: chunk length larger than the max PNG chunk size*/
+    if(chunkLength > 2147483647) {
+      if(state->decoder.ignore_end) break; /*other errors may still happen though*/
+      CERROR_BREAK(state->error, 63);
+    }
+
+    if((size_t)((chunk - in) + chunkLength + 12) > insize || (chunk + chunkLength + 12) < in) {
+      CERROR_BREAK(state->error, 64); /*error: size of the in buffer too small to contain next chunk*/
+    }
+
+    data = lodepng_chunk_data_const(chunk);
+
+    unknown = 0;
+
+    /*IDAT chunk, containing compressed image data*/
+    if(lodepng_chunk_type_equals(chunk, "IDAT")) {
+      size_t oldsize = idat.size;
+      size_t newsize;
+      if(lodepng_addofl(oldsize, chunkLength, &newsize)) CERROR_BREAK(state->error, 95);
+      if(!ucvector_resize(&idat, newsize)) CERROR_BREAK(state->error, 83 /*alloc fail*/);
+      for(i = 0; i != chunkLength; ++i) idat.data[oldsize + i] = data[i];
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+      critical_pos = 3;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    } else if(lodepng_chunk_type_equals(chunk, "IEND")) {
+      /*IEND chunk*/
+      IEND = 1;
+    } else if(lodepng_chunk_type_equals(chunk, "PLTE")) {
+      /*palette chunk (PLTE)*/
+      state->error = readChunk_PLTE(&state->info_png.color, data, chunkLength);
+      if(state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+      critical_pos = 2;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    } else if(lodepng_chunk_type_equals(chunk, "tRNS")) {
+      /*palette transparency chunk (tRNS). Even though this one is an ancillary chunk , it is still compiled
+      in without 'LODEPNG_COMPILE_ANCILLARY_CHUNKS' because it contains essential color information that
+      affects the alpha channel of pixels. */
+      state->error = readChunk_tRNS(&state->info_png.color, data, chunkLength);
+      if(state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+      /*background color chunk (bKGD)*/
+    } else if(lodepng_chunk_type_equals(chunk, "bKGD")) {
+      state->error = readChunk_bKGD(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    } else if(lodepng_chunk_type_equals(chunk, "tEXt")) {
+      /*text chunk (tEXt)*/
+      if(state->decoder.read_text_chunks) {
+        state->error = readChunk_tEXt(&state->info_png, data, chunkLength);
+        if(state->error) break;
+      }
+    } else if(lodepng_chunk_type_equals(chunk, "zTXt")) {
+      /*compressed text chunk (zTXt)*/
+      if(state->decoder.read_text_chunks) {
+        state->error = readChunk_zTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+        if(state->error) break;
+      }
+    } else if(lodepng_chunk_type_equals(chunk, "iTXt")) {
+      /*international text chunk (iTXt)*/
+      if(state->decoder.read_text_chunks) {
+        state->error = readChunk_iTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+        if(state->error) break;
+      }
+    } else if(lodepng_chunk_type_equals(chunk, "tIME")) {
+      state->error = readChunk_tIME(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    } else if(lodepng_chunk_type_equals(chunk, "pHYs")) {
+      state->error = readChunk_pHYs(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    } else if(lodepng_chunk_type_equals(chunk, "gAMA")) {
+      state->error = readChunk_gAMA(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    } else if(lodepng_chunk_type_equals(chunk, "cHRM")) {
+      state->error = readChunk_cHRM(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    } else if(lodepng_chunk_type_equals(chunk, "sRGB")) {
+      state->error = readChunk_sRGB(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    } else if(lodepng_chunk_type_equals(chunk, "iCCP")) {
+      state->error = readChunk_iCCP(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+      if(state->error) break;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    } else /*it's not an implemented chunk type, so ignore it: skip over the data*/ {
+      /*error: unknown critical chunk (5th bit of first byte of chunk type is 0)*/
+      if(!state->decoder.ignore_critical && !lodepng_chunk_ancillary(chunk)) {
+        CERROR_BREAK(state->error, 69);
+      }
+
+      unknown = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+      if(state->decoder.remember_unknown_chunks) {
+        state->error = lodepng_chunk_append(&state->info_png.unknown_chunks_data[critical_pos - 1],
+                                            &state->info_png.unknown_chunks_size[critical_pos - 1], chunk);
+        if(state->error) break;
+      }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    }
+
+    if(!state->decoder.ignore_crc && !unknown) /*check CRC if wanted, only on known chunk types*/ {
+      if(lodepng_chunk_check_crc(chunk)) CERROR_BREAK(state->error, 57); /*invalid CRC*/
+    }
+
+    if(!IEND) chunk = lodepng_chunk_next_const(chunk);
+  }
+
+  ucvector_init(&scanlines);
+  /*predict output size, to allocate exact size for output buffer to avoid more dynamic allocation.
+  If the decompressed size does not match the prediction, the image must be corrupt.
+  Note that this point is also reached when the chunk loop stopped on an error; the steps
+  below are each guarded by !state->error, and both vectors are still cleaned up.*/
+  if(state->info_png.interlace_method == 0) {
+    predict = lodepng_get_raw_size_idat(*w, *h, &state->info_png.color);
+  } else {
+    /*Adam-7 interlaced: predicted size is the sum of the 7 sub-images sizes*/
+    const LodePNGColorMode* color = &state->info_png.color;
+    predict = 0;
+    predict += lodepng_get_raw_size_idat((*w + 7) >> 3, (*h + 7) >> 3, color);
+    if(*w > 4) predict += lodepng_get_raw_size_idat((*w + 3) >> 3, (*h + 7) >> 3, color);
+    predict += lodepng_get_raw_size_idat((*w + 3) >> 2, (*h + 3) >> 3, color);
+    if(*w > 2) predict += lodepng_get_raw_size_idat((*w + 1) >> 2, (*h + 3) >> 2, color);
+    predict += lodepng_get_raw_size_idat((*w + 1) >> 1, (*h + 1) >> 2, color);
+    if(*w > 1) predict += lodepng_get_raw_size_idat((*w + 0) >> 1, (*h + 1) >> 1, color);
+    predict += lodepng_get_raw_size_idat((*w + 0), (*h + 0) >> 1, color);
+  }
+  if(!state->error && !ucvector_reserve(&scanlines, predict)) state->error = 83; /*alloc fail*/
+  if(!state->error) {
+    state->error = zlib_decompress(&scanlines.data, &scanlines.size, idat.data,
+                                   idat.size, &state->decoder.zlibsettings);
+    if(!state->error && scanlines.size != predict) state->error = 91; /*decompressed size doesn't match prediction*/
+  }
+  ucvector_cleanup(&idat); /*the compressed data is not needed past this point*/
+
+  if(!state->error) {
+    outsize = lodepng_get_raw_size(*w, *h, &state->info_png.color);
+    *out = (unsigned char*)lodepng_malloc(outsize);
+    if(!*out) state->error = 83; /*alloc fail*/
+  }
+  if(!state->error) {
+    for(i = 0; i < outsize; i++) (*out)[i] = 0; /*zero the whole output before it is filled in*/
+    state->error = postProcessScanlines(*out, scanlines.data, *w, *h, &state->info_png);
+  }
+  ucvector_cleanup(&scanlines);
+}
+
+/*Decodes the PNG in in[0..insize) into a newly allocated pixel buffer *out of size *w x *h.
+With state->decoder.color_convert off, the pixels stay in the PNG's own color mode and that
+mode is copied to state->info_raw. Otherwise the pixels are converted to state->info_raw
+unless that already equals the PNG's color mode.
+Returns the error code (0 = OK), which is also stored in state->error, except for error 56
+(unsupported conversion target): that one is only returned, and *out then still points to
+the unconverted pixels.*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+                        LodePNGState* state,
+                        const unsigned char* in, size_t insize) {
+  unsigned char* decoded;
+  size_t convertedsize;
+
+  *out = 0;
+  decodeGeneric(out, w, h, state, in, insize);
+  if(state->error) return state->error;
+
+  if(!state->decoder.color_convert) {
+    /*no conversion wanted: store the info_png color settings on the info_raw so that the
+    info_raw still reflects what colortype the raw image has to the end user*/
+    state->error = lodepng_color_mode_copy(&state->info_raw, &state->info_png.color);
+    return state->error;
+  }
+
+  if(lodepng_color_mode_equal(&state->info_raw, &state->info_png.color)) {
+    /*same color type, no copying or converting of data needed*/
+    return state->error;
+  }
+
+  /*color conversion needed; sort of copy of the data*/
+  decoded = *out;
+
+  /*TODO: check if this works according to the statement in the documentation: "The converter can convert
+  from grayscale input color type, to 8-bit grayscale or grayscale with alpha"*/
+  if(state->info_raw.colortype != LCT_RGB && state->info_raw.colortype != LCT_RGBA
+     && state->info_raw.bitdepth != 8) {
+    return 56; /*unsupported color mode conversion*/
+  }
+
+  convertedsize = lodepng_get_raw_size(*w, *h, &state->info_raw);
+  *out = (unsigned char*)lodepng_malloc(convertedsize);
+  if(*out) {
+    state->error = lodepng_convert(*out, decoded, &state->info_raw,
+                                   &state->info_png.color, *w, *h);
+  } else {
+    state->error = 83; /*alloc fail*/
+  }
+  lodepng_free(decoded); /*the unconverted pixels are dropped in either case*/
+
+  return state->error;
+}
+
+/*Decodes the PNG in in[0..insize) to the requested colortype and bitdepth using a
+temporary LodePNGState that is initialized and cleaned up here. Returns the error code of
+lodepng_decode (0 = OK).*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in,
+                               size_t insize, LodePNGColorType colortype, unsigned bitdepth) {
+  LodePNGState tmpstate;
+  unsigned result;
+
+  lodepng_state_init(&tmpstate);
+  /*the wanted output format is passed through info_raw*/
+  tmpstate.info_raw.bitdepth = bitdepth;
+  tmpstate.info_raw.colortype = colortype;
+
+  result = lodepng_decode(out, w, h, &tmpstate, in, insize);
+
+  lodepng_state_cleanup(&tmpstate);
+  return result;
+}
+
+/*Same as lodepng_decode_memory, with the output fixed to 8-bit RGBA.*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize) {
+  const unsigned result = lodepng_decode_memory(out, w, h, in, insize, LCT_RGBA, 8);
+  return result;
+}
+
+/*Same as lodepng_decode_memory, with the output fixed to 8-bit RGB.*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize) {
+  const unsigned result = lodepng_decode_memory(out, w, h, in, insize, LCT_RGB, 8);
+  return result;
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+/*Loads the file filename with lodepng_load_file and decodes it with lodepng_decode_memory
+to the requested colortype and bitdepth. The file buffer is freed before returning.
+Returns the first error encountered (0 = OK); *out, *w and *h are 0 if loading fails.*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename,
+                             LodePNGColorType colortype, unsigned bitdepth) {
+  unsigned char* filedata = 0;
+  size_t filesize;
+  unsigned error;
+
+  /* safe output values in case error happens */
+  *out = 0;
+  *h = 0;
+  *w = 0;
+
+  error = lodepng_load_file(&filedata, &filesize, filename);
+  if(error == 0) {
+    error = lodepng_decode_memory(out, w, h, filedata, filesize, colortype, bitdepth);
+  }
+
+  lodepng_free(filedata);
+  return error;
+}
+
+/*Same as lodepng_decode_file, with the output fixed to 8-bit RGBA.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename) {
+  const unsigned result = lodepng_decode_file(out, w, h, filename, LCT_RGBA, 8);
+  return result;
+}
+
+/*Same as lodepng_decode_file, with the output fixed to 8-bit RGB.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename) {
+  const unsigned result = lodepng_decode_file(out, w, h, filename, LCT_RGB, 8);
+  return result;
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/*Fills settings with the decoder defaults: color conversion on, CRC / critical chunk /
+end-of-data errors not ignored, text chunks read, unknown chunks not remembered, and the
+zlib part set up by lodepng_decompress_settings_init.*/
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings) {
+  lodepng_decompress_settings_init(&settings->zlibsettings);
+
+  settings->ignore_crc = 0;
+  settings->ignore_critical = 0;
+  settings->ignore_end = 0;
+  settings->color_convert = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  settings->remember_unknown_chunks = 0;
+  settings->read_text_chunks = 1;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+
+/*Initializes every part of state: the decoder and encoder settings (each only when
+compiled in), the raw color mode and the PNG info. state->error is set to 1.*/
+void lodepng_state_init(LodePNGState* state) {
+  state->error = 1;
+
+#ifdef LODEPNG_COMPILE_DECODER
+  lodepng_decoder_settings_init(&state->decoder);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+  lodepng_encoder_settings_init(&state->encoder);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+  lodepng_color_mode_init(&state->info_raw);
+  lodepng_info_init(&state->info_png);
+}
+
+/*Cleans up the two members of state that have their own cleanup function: the raw color
+mode and the PNG info.*/
+void lodepng_state_cleanup(LodePNGState* state) {
+  LodePNGColorMode* raw = &state->info_raw;
+  LodePNGInfo* png = &state->info_png;
+
+  lodepng_color_mode_cleanup(raw);
+  lodepng_info_cleanup(png);
+}
+
+/*Makes dest a copy of source. dest is cleaned up first, then all plain fields are copied
+by struct assignment; info_raw and info_png are re-initialized and copied with their own
+copy functions instead of keeping the pointers taken over by the struct assignment.
+The result of the copy is left in dest->error (0 = OK).*/
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source) {
+  lodepng_state_cleanup(dest);
+  *dest = *source;
+
+  lodepng_color_mode_init(&dest->info_raw);
+  lodepng_info_init(&dest->info_png);
+
+  dest->error = lodepng_color_mode_copy(&dest->info_raw, &source->info_raw);
+  if(!dest->error) {
+    dest->error = lodepng_info_copy(&dest->info_png, &source->info_png);
+  }
+}
+
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Encoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Appends a chunk named chunkName with length bytes of data to out, using
+lodepng_chunk_create. chunkName must be string of 4 characters.
+Returns 0 if OK, otherwise the error of lodepng_chunk_create.*/
+static unsigned addChunk(ucvector* out, const char* chunkName, const unsigned char* data, size_t length) {
+  unsigned error = lodepng_chunk_create(&out->data, &out->size, (unsigned)length, chunkName, data);
+  if(error) return error;
+
+  /*data and size were changed behind the vector's back: fix the allocsize again*/
+  out->allocsize = out->size;
+  return 0;
+}
+
+/*Appends the 8 bytes PNG signature, aka the magic bytes, to out.*/
+static void writeSignature(ucvector* out) {
+  static const unsigned char signature[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+  size_t i;
+
+  for(i = 0; i != sizeof(signature); ++i) ucvector_push_back(out, signature[i]);
+}
+
+/*Builds the IHDR payload (width, height, bit depth, color type, compression method 0,
+filter method 0, interlace method) and appends it to out as an IHDR chunk.
+Returns the error of addChunk (0 = OK).*/
+static unsigned addChunk_IHDR(ucvector* out, unsigned w, unsigned h,
+                              LodePNGColorType colortype, unsigned bitdepth, unsigned interlace_method) {
+  unsigned result;
+  ucvector ihdr;
+
+  ucvector_init(&ihdr);
+
+  /*the two image dimensions*/
+  lodepng_add32bitInt(&ihdr, w);
+  lodepng_add32bitInt(&ihdr, h);
+
+  /*five single-byte fields*/
+  ucvector_push_back(&ihdr, (unsigned char)bitdepth);
+  ucvector_push_back(&ihdr, (unsigned char)colortype);
+  ucvector_push_back(&ihdr, 0); /*compression method*/
+  ucvector_push_back(&ihdr, 0); /*filter method*/
+  ucvector_push_back(&ihdr, interlace_method);
+
+  result = addChunk(out, "IHDR", ihdr.data, ihdr.size);
+  ucvector_cleanup(&ihdr);
+
+  return result;
+}
+
+/*Appends a PLTE chunk to out holding the R, G and B bytes of every entry of info->palette
+(stored as RGBA; the alpha byte is left out). Returns the error of addChunk (0 = OK).*/
+static unsigned addChunk_PLTE(ucvector* out, const LodePNGColorMode* info) {
+  unsigned result;
+  size_t entry, channel;
+  ucvector rgb;
+
+  ucvector_init(&rgb);
+  for(entry = 0; entry != info->palettesize; ++entry) {
+    /*add all channels except alpha channel*/
+    for(channel = 0; channel != 3; ++channel) {
+      ucvector_push_back(&rgb, info->palette[4 * entry + channel]);
+    }
+  }
+
+  result = addChunk(out, "PLTE", rgb.data, rgb.size);
+  ucvector_cleanup(&rgb);
+
+  return result;
+}
+
+/*Appends a tRNS chunk to out. For palette images it holds the alpha byte of each palette
+entry up to the last entry whose alpha is not 255; for grayscale and RGB images it holds
+the 16-bit big-endian color key when info->key_defined is set. The chunk is written even
+when its payload ends up empty. Returns the error of addChunk (0 = OK).*/
+static unsigned addChunk_tRNS(ucvector* out, const LodePNGColorMode* info) {
+  unsigned result;
+  ucvector payload;
+
+  ucvector_init(&payload);
+
+  if(info->colortype == LCT_PALETTE) {
+    size_t count = info->palettesize;
+    size_t k;
+    /*the tail of palette values that all have 255 as alpha, does not have to be encoded*/
+    while(count != 0 && info->palette[4 * (count - 1) + 3] == 255) --count;
+    /*add only alpha channel*/
+    for(k = 0; k != count; ++k) ucvector_push_back(&payload, info->palette[4 * k + 3]);
+  } else if(info->colortype == LCT_GREY && info->key_defined) {
+    ucvector_push_back(&payload, (unsigned char)(info->key_r >> 8));
+    ucvector_push_back(&payload, (unsigned char)(info->key_r & 255));
+  } else if(info->colortype == LCT_RGB && info->key_defined) {
+    ucvector_push_back(&payload, (unsigned char)(info->key_r >> 8));
+    ucvector_push_back(&payload, (unsigned char)(info->key_r & 255));
+    ucvector_push_back(&payload, (unsigned char)(info->key_g >> 8));
+    ucvector_push_back(&payload, (unsigned char)(info->key_g & 255));
+    ucvector_push_back(&payload, (unsigned char)(info->key_b >> 8));
+    ucvector_push_back(&payload, (unsigned char)(info->key_b & 255));
+  }
+
+  result = addChunk(out, "tRNS", payload.data, payload.size);
+  ucvector_cleanup(&payload);
+
+  return result;
+}
+
+/*Compresses datasize bytes of data with zlib_compress and appends the result to out as
+a single IDAT chunk. Returns the zlib error or the error of addChunk (0 = OK).*/
+static unsigned addChunk_IDAT(ucvector* out, const unsigned char* data, size_t datasize,
+                              LodePNGCompressSettings* zlibsettings) {
+  unsigned result;
+  ucvector compressed;
+
+  ucvector_init(&compressed);
+
+  /*compress with the Zlib compressor; the chunk is only written when that worked*/
+  result = zlib_compress(&compressed.data, &compressed.size, data, datasize, zlibsettings);
+  if(result == 0) {
+    result = addChunk(out, "IDAT", compressed.data, compressed.size);
+  }
+
+  ucvector_cleanup(&compressed);
+  return result;
+}
+
+/*Appends the IEND chunk to out and returns addChunk's error code.*/
+static unsigned addChunk_IEND(ucvector* out) {
+  /*IEND has no payload, hence the null data pointer and zero length*/
+  return addChunk(out, "IEND", 0, 0);
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*Writes a tEXt chunk: keyword, 0 separator, uncompressed text. Returns 89 for a keyword not 1-79 bytes long.*/
+static unsigned addChunk_tEXt(ucvector* out, const char* keyword, const char* textstring) {
+  unsigned error = 0;
+  size_t i;
+  ucvector text;
+  ucvector_init(&text);
+  for(i = 0; keyword[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)keyword[i]);
+  if(i < 1 || i > 79) { ucvector_cleanup(&text); return 89; } /*error: invalid keyword size; free pushed bytes first*/
+  ucvector_push_back(&text, 0); /*0 termination char*/
+  for(i = 0; textstring[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)textstring[i]);
+  error = addChunk(out, "tEXt", text.data, text.size);
+  ucvector_cleanup(&text);
+  return error;
+}
+
+/*Writes a zTXt chunk: keyword, 0 separator, compression method 0, then the zlib-compressed text.
+Returns 89 for a keyword not 1-79 bytes long, else the zlib_compress or addChunk error code.*/
+static unsigned addChunk_zTXt(ucvector* out, const char* keyword, const char* textstring,
+                              LodePNGCompressSettings* zlibsettings) {
+  unsigned error = 0;
+  ucvector data, compressed;
+  size_t i, textsize = strlen(textstring);
+
+  ucvector_init(&data);
+  ucvector_init(&compressed);
+  for(i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+  /*error: invalid keyword size. data holds the pushed keyword and must be freed; compressed is still empty*/
+  if(i < 1 || i > 79) { ucvector_cleanup(&data); return 89; }
+  ucvector_push_back(&data, 0); /*0 termination char*/
+  ucvector_push_back(&data, 0); /*compression method: 0*/
+  error = zlib_compress(&compressed.data, &compressed.size,
+                        (unsigned char*)textstring, textsize, zlibsettings);
+  if(!error) {
+    for(i = 0; i != compressed.size; ++i) ucvector_push_back(&data, compressed.data[i]);
+    error = addChunk(out, "zTXt", data.data, data.size);
+  }
+  ucvector_cleanup(&compressed);
+  ucvector_cleanup(&data);
+  return error;
+}
+
+/*Writes an iTXt chunk: keyword, compression flag and method, language tag, translated keyword, then the
+text (zlib-compressed when compressed is nonzero). Returns 89 for a keyword not 1-79 bytes long.*/
+static unsigned addChunk_iTXt(ucvector* out, unsigned compressed, const char* keyword, const char* langtag,
+                              const char* transkey, const char* textstring, LodePNGCompressSettings* zlibsettings) {
+  unsigned error = 0;
+  ucvector data;
+  size_t i, textsize = strlen(textstring);
+  ucvector_init(&data);
+  for(i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+  if(i < 1 || i > 79) { ucvector_cleanup(&data); return 89; } /*error: invalid keyword size; free pushed bytes first*/
+  ucvector_push_back(&data, 0); /*null termination char*/
+  ucvector_push_back(&data, compressed ? 1 : 0); /*compression flag*/
+  ucvector_push_back(&data, 0); /*compression method*/
+  for(i = 0; langtag[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)langtag[i]);
+  ucvector_push_back(&data, 0); /*null termination char*/
+  for(i = 0; transkey[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)transkey[i]);
+  ucvector_push_back(&data, 0); /*null termination char*/
+
+  if(compressed) {
+    ucvector compressed_data;
+    ucvector_init(&compressed_data);
+    error = zlib_compress(&compressed_data.data, &compressed_data.size,
+                          (unsigned char*)textstring, textsize, zlibsettings);
+    if(!error) {
+      for(i = 0; i != compressed_data.size; ++i) ucvector_push_back(&data, compressed_data.data[i]);
+    }
+    ucvector_cleanup(&compressed_data);
+  } else /*not compressed*/ {
+    for(i = 0; textstring[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)textstring[i]);
+  }
+
+  if(!error) error = addChunk(out, "iTXt", data.data, data.size);
+  ucvector_cleanup(&data);
+  return error;
+}
+
+/*Writes the bKGD chunk: one 16-bit sample for the grey types, three for the RGB types, or a one-byte
+palette index for palette images. Returns addChunk's error code.*/
+static unsigned addChunk_bKGD(ucvector* out, const LodePNGInfo* info) {
+  unsigned result;
+  const LodePNGColorMode* mode = &info->color;
+  ucvector bg;
+  ucvector_init(&bg);
+  if(mode->colortype == LCT_PALETTE) {
+    ucvector_push_back(&bg, (unsigned char)(info->background_r & 255)); /*palette index*/
+  } else if(mode->colortype != LCT_GREY && mode->colortype != LCT_GREY_ALPHA
+            && mode->colortype != LCT_RGB && mode->colortype != LCT_RGBA) {
+    /*none of the handled color types: the payload stays empty*/
+  } else {
+    ucvector_push_back(&bg, (unsigned char)(info->background_r >> 8));
+    ucvector_push_back(&bg, (unsigned char)(info->background_r & 255));
+    if(mode->colortype == LCT_GREY || mode->colortype == LCT_GREY_ALPHA) goto write; /*grey: one sample*/
+    ucvector_push_back(&bg, (unsigned char)(info->background_g >> 8));
+    ucvector_push_back(&bg, (unsigned char)(info->background_g & 255));
+    ucvector_push_back(&bg, (unsigned char)(info->background_b >> 8));
+    ucvector_push_back(&bg, (unsigned char)(info->background_b & 255));
+  }
+write: result = addChunk(out, "bKGD", bg.data, bg.size); ucvector_cleanup(&bg); return result;
+}
+
+/*Writes the 7-byte tIME chunk: 16-bit year, then month, day, hour, minute, second. Returns 83 on alloc fail.*/
+static unsigned addChunk_tIME(ucvector* out, const LodePNGTime* time) {
+  unsigned result;
+  unsigned char* bytes = (unsigned char*)lodepng_malloc(7);
+  if(bytes == 0) return 83; /*alloc fail*/
+  bytes[0] = (unsigned char)(time->year >> 8); bytes[1] = (unsigned char)(time->year & 255); /*high byte first*/
+  bytes[2] = (unsigned char)time->month;
+  bytes[3] = (unsigned char)time->day;
+  bytes[4] = (unsigned char)time->hour;
+  bytes[5] = (unsigned char)time->minute;
+  bytes[6] = (unsigned char)time->second;
+  result = addChunk(out, "tIME", bytes, 7);
+  lodepng_free(bytes);
+  return result;
+}
+
+/*Writes the pHYs chunk: the 32-bit values phys_x and phys_y followed by the one-byte phys_unit.
+Returns addChunk's error code.*/
+static unsigned addChunk_pHYs(ucvector* out, const LodePNGInfo* info) {
+  unsigned result;
+  ucvector payload;
+  ucvector_init(&payload);
+  /*the fields are appended in this fixed order: x, y, unit*/
+  lodepng_add32bitInt(&payload, info->phys_x);
+  lodepng_add32bitInt(&payload, info->phys_y);
+  ucvector_push_back(&payload, info->phys_unit);
+  result = addChunk(out, "pHYs", payload.data, payload.size);
+  ucvector_cleanup(&payload);
+  return result;
+}
+
+/*Writes the gAMA chunk, whose payload is the single 32-bit value info->gama_gamma.
+Returns addChunk's error code.*/
+static unsigned addChunk_gAMA(ucvector* out, const LodePNGInfo* info) {
+  unsigned result;
+  ucvector payload;
+  ucvector_init(&payload);
+  lodepng_add32bitInt(&payload, info->gama_gamma);
+
+  result = addChunk(out, "gAMA", payload.data, payload.size);
+  ucvector_cleanup(&payload);
+  return result;
+}
+
+/*Writes the cHRM chunk: eight 32-bit values, an x and a y each for the white point, red, green and blue.
+Returns addChunk's error code.*/
+static unsigned addChunk_cHRM(ucvector* out, const LodePNGInfo* info) {
+  unsigned result;
+  ucvector payload;
+  ucvector_init(&payload);
+
+  lodepng_add32bitInt(&payload, info->chrm_white_x);
+  lodepng_add32bitInt(&payload, info->chrm_white_y);
+  lodepng_add32bitInt(&payload, info->chrm_red_x);
+  lodepng_add32bitInt(&payload, info->chrm_red_y);
+  lodepng_add32bitInt(&payload, info->chrm_green_x);
+  lodepng_add32bitInt(&payload, info->chrm_green_y);
+  lodepng_add32bitInt(&payload, info->chrm_blue_x);
+  lodepng_add32bitInt(&payload, info->chrm_blue_y);
+  result = addChunk(out, "cHRM", payload.data, payload.size);
+  ucvector_cleanup(&payload);
+  return result;
+}
+
+static unsigned addChunk_sRGB(ucvector* out, const LodePNGInfo* info) {
+  unsigned char intent = info->srgb_intent; /*the whole sRGB payload is this one byte*/
+  return addChunk(out, "sRGB", &intent, 1); /*result is addChunk's error code*/
+}
+
+/*Writes the iCCP chunk: profile name, 0 separator, compression method 0, then the zlib-compressed profile.
+Returns 89 for a name not 1-79 bytes long, else the zlib_compress or addChunk error code.*/
+static unsigned addChunk_iCCP(ucvector* out, const LodePNGInfo* info, LodePNGCompressSettings* zlibsettings) {
+  unsigned error = 0;
+  ucvector data, compressed;
+  size_t i;
+  ucvector_init(&data);
+  ucvector_init(&compressed);
+  for(i = 0; info->iccp_name[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)info->iccp_name[i]);
+  /*error: invalid keyword size. data holds the pushed name and must be freed; compressed is still empty*/
+  if(i < 1 || i > 79) { ucvector_cleanup(&data); return 89; }
+  ucvector_push_back(&data, 0); /*0 termination char*/
+  ucvector_push_back(&data, 0); /*compression method: 0*/
+  error = zlib_compress(&compressed.data, &compressed.size,
+                        info->iccp_profile, info->iccp_profile_size, zlibsettings);
+  if(!error) {
+    for(i = 0; i != compressed.size; ++i) ucvector_push_back(&data, compressed.data[i]);
+    error = addChunk(out, "iCCP", data.data, data.size);
+  }
+  ucvector_cleanup(&compressed);
+  ucvector_cleanup(&data);
+  return error;
+}
+
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Applies one PNG filter to a single scanline.
+out: receives the filtered bytes of the line
+scanline: the unfiltered bytes of this line
+prevline: the unfiltered line above, or null when there is none (its bytes then count as 0)
+length: number of bytes in the scanline
+bytewidth: distance in bytes between a byte and its counterpart in the pixel to the left
+filterType: 0 None, 1 Sub, 2 Up, 3 Average, 4 Paeth; any other value leaves out untouched
+Differences are stored into unsigned char, so they are truncated to that type.
+*/
+static void filterScanline(unsigned char* out, const unsigned char* scanline, const unsigned char* prevline,
+                           size_t length, size_t bytewidth, unsigned char filterType) {
+  size_t k;
+  if(filterType == 0 || (filterType == 2 && !prevline)) {
+    /*None, and Up without a line above, both copy the scanline unchanged*/
+    for(k = 0; k != length; ++k) out[k] = scanline[k];
+  } else if(filterType == 2) {
+    /*Up: subtract the byte directly above*/
+    for(k = 0; k != length; ++k) out[k] = scanline[k] - prevline[k];
+  } else if(filterType == 1 || (filterType == 4 && !prevline)) {
+    /*Sub: subtract the byte bytewidth positions to the left; the first bytewidth bytes are copied.
+    Paeth without a line above is identical: paethPredictor(scanline[k - bytewidth], 0, 0) is always that byte*/
+    for(k = 0; k != bytewidth; ++k) out[k] = scanline[k];
+    for(k = bytewidth; k < length; ++k) out[k] = scanline[k] - scanline[k - bytewidth];
+  } else if(filterType == 3 && prevline) {
+    /*Average: subtract half the sum of the left and above bytes, rounded down;
+    the first bytewidth bytes have no left neighbour, so only half of the above byte is used*/
+    for(k = 0; k != bytewidth; ++k) out[k] = scanline[k] - (prevline[k] >> 1);
+    for(k = bytewidth; k < length; ++k) out[k] = scanline[k] - ((scanline[k - bytewidth] + prevline[k]) >> 1);
+  } else if(filterType == 3) {
+    /*Average without a line above: only half of the left byte is subtracted*/
+    for(k = 0; k != bytewidth; ++k) out[k] = scanline[k];
+    for(k = bytewidth; k < length; ++k) out[k] = scanline[k] - (scanline[k - bytewidth] >> 1);
+  } else if(filterType == 4) {
+    /*Paeth with a line above. For the first bytewidth bytes the predictor is just the byte above:
+    paethPredictor(0, prevline[k], 0) is always prevline[k]*/
+    for(k = 0; k != bytewidth; ++k) out[k] = (scanline[k] - prevline[k]);
+    for(k = bytewidth; k < length; ++k) {
+      out[k] = (scanline[k] - paethPredictor(scanline[k - bytewidth], prevline[k], prevline[k - bytewidth]));
+    }
+  }
+  /*any other filterType: unexisting filter type given, nothing is written*/
+}
+
+/* log2 approximation: divides powers of 16, then of 2, out of f and adds a cubic polynomial for the rest. */
+static float flog2(float f) {
+  float whole = 0;
+  for(; f > 32; f /= 16) whole += 4;
+  for(; f > 2; f /= 2) ++whole;
+  return whole + 1.442695f * (f * f * f / 3 - 3 * f * f / 2 + 3 * f - 1.83333f);
+}
+
+static unsigned filter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h,
+                       const LodePNGColorMode* info, const LodePNGEncoderSettings* settings) {
+  /*
+  For PNG filter method 0: prefixes each of the h scanlines of in with a filter type byte and writes the
+  filtered line after it, so out must hold h * (1 + linebytes) bytes. Returns 0 on success, 31 for an
+  invalid color type, 83 on allocation failure and 88 for an unknown filter strategy.
+  */
+
+  unsigned bpp = lodepng_get_bpp(info);
+  /*the width of a scanline in bytes, not including the filter type*/
+  size_t linebytes = (w * bpp + 7) / 8;
+  /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+  size_t bytewidth = (bpp + 7) / 8;
+  const unsigned char* prevline = 0;
+  unsigned x, y;
+  unsigned error = 0;
+  LodePNGFilterStrategy strategy = settings->filter_strategy;
+
+  /*
+  There is a heuristic called the minimum sum of absolute differences heuristic, suggested by the PNG standard:
+   *  If the image type is Palette, or the bit depth is smaller than 8, then do not filter the image (i.e.
+      use fixed filtering, with the filter None).
+   * (The other case) If the image type is Grayscale or RGB (with or without Alpha), and the bit depth is
+     not smaller than 8, then use adaptive filtering heuristic as follows: independently for each row, apply
+     all five filters and select the filter that produces the smallest sum of absolute values per row.
+  This heuristic is used if filter strategy is LFS_MINSUM and filter_palette_zero is true.
+
+  If filter_palette_zero is true and filter_strategy is not LFS_MINSUM, the above heuristic is followed,
+  but for "the other case", whatever strategy filter_strategy is set to instead of the minimum sum
+  heuristic is used.
+  */
+  if(settings->filter_palette_zero &&
+     (info->colortype == LCT_PALETTE || info->bitdepth < 8)) strategy = LFS_ZERO;
+
+  if(bpp == 0) return 31; /*error: invalid color type*/
+
+  if(strategy == LFS_ZERO) {
+    for(y = 0; y != h; ++y) {
+      size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+      size_t inindex = linebytes * y;
+      out[outindex] = 0; /*filter type byte*/
+      filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, 0);
+      prevline = &in[inindex];
+    }
+  } else if(strategy == LFS_MINSUM) {
+    /*adaptive filtering*/
+    size_t sum[5];
+    unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+    size_t smallest = 0;
+    unsigned char type, bestType = 0;
+
+    for(type = 0; type != 5; ++type) {
+      attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+      if(!attempt[type]) { while(type) lodepng_free(attempt[--type]); return 83; } /*alloc fail: free earlier attempts*/
+    }
+
+    if(!error) {
+      for(y = 0; y != h; ++y) {
+        /*try the 5 filter types*/
+        for(type = 0; type != 5; ++type) {
+          filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+
+          /*calculate the sum of the result*/
+          sum[type] = 0;
+          if(type == 0) {
+            for(x = 0; x != linebytes; ++x) sum[type] += (unsigned char)(attempt[type][x]);
+          } else {
+            for(x = 0; x != linebytes; ++x) {
+              /*For differences, each byte should be treated as signed, values above 127 are negative
+              (converted to signed char). Filtertype 0 isn't a difference though, so use unsigned there.
+              This means filtertype 0 is almost never chosen, but that is justified.*/
+              unsigned char s = attempt[type][x];
+              sum[type] += s < 128 ? s : (255U - s);
+            }
+          }
+
+          /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+          if(type == 0 || sum[type] < smallest) {
+            bestType = type;
+            smallest = sum[type];
+          }
+        }
+
+        prevline = &in[y * linebytes];
+
+        /*now fill the out values*/
+        out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+        for(x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+      }
+    }
+
+    for(type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+  } else if(strategy == LFS_ENTROPY) {
+    float sum[5];
+    unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+    float smallest = 0;
+    unsigned type, bestType = 0;
+    unsigned count[256];
+
+    for(type = 0; type != 5; ++type) {
+      attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+      if(!attempt[type]) { while(type) lodepng_free(attempt[--type]); return 83; } /*alloc fail: free earlier attempts*/
+    }
+
+    for(y = 0; y != h; ++y) {
+      /*try the 5 filter types*/
+      for(type = 0; type != 5; ++type) {
+        filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+        for(x = 0; x != 256; ++x) count[x] = 0;
+        for(x = 0; x != linebytes; ++x) ++count[attempt[type][x]];
+        ++count[type]; /*the filter type itself is part of the scanline*/
+        sum[type] = 0;
+        for(x = 0; x != 256; ++x) {
+          float p = count[x] / (float)(linebytes + 1);
+          sum[type] += count[x] == 0 ? 0 : flog2(1 / p) * p;
+        }
+        /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+        if(type == 0 || sum[type] < smallest) {
+          bestType = type;
+          smallest = sum[type];
+        }
+      }
+
+      prevline = &in[y * linebytes];
+
+      /*now fill the out values*/
+      out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+      for(x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+    }
+
+    for(type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+  } else if(strategy == LFS_PREDEFINED) {
+    for(y = 0; y != h; ++y) {
+      size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+      size_t inindex = linebytes * y;
+      unsigned char type = settings->predefined_filters[y];
+      out[outindex] = type; /*filter type byte*/
+      filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, type);
+      prevline = &in[inindex];
+    }
+  } else if(strategy == LFS_BRUTE_FORCE) {
+    /*brute force filter chooser.
+    deflate the scanline after every filter attempt to see which one deflates best.
+    This is very slow and gives only slightly smaller, sometimes even larger, result*/
+    size_t size[5];
+    unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+    size_t smallest = 0;
+    unsigned type = 0, bestType = 0;
+    unsigned char* dummy;
+    LodePNGCompressSettings zlibsettings = settings->zlibsettings;
+    /*use fixed tree on the attempts so that the tree is not adapted to the filtertype on purpose,
+    to simulate the true case where the tree is the same for the whole image. Sometimes it gives
+    better result with dynamic tree anyway. Using the fixed tree sometimes gives worse, but in rare
+    cases better compression. It does make this a bit less slow, so it's worth doing this.*/
+    zlibsettings.btype = 1;
+    /*a custom encoder likely doesn't read the btype setting and is optimized for complete PNG
+    images only, so disable it*/
+    zlibsettings.custom_zlib = 0;
+    zlibsettings.custom_deflate = 0;
+    for(type = 0; type != 5; ++type) {
+      attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+      if(!attempt[type]) { while(type) lodepng_free(attempt[--type]); return 83; } /*alloc fail: free earlier attempts*/
+    }
+    for(y = 0; y != h; ++y) /*try the 5 filter types*/ {
+      for(type = 0; type != 5; ++type) {
+        unsigned testsize = (unsigned)linebytes;
+        /*if(testsize > 8) testsize /= 8;*/ /*it already works good enough by testing a part of the row*/
+
+        filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+        size[type] = 0;
+        dummy = 0;
+        zlib_compress(&dummy, &size[type], attempt[type], testsize, &zlibsettings); /*NOTE: error code not checked*/
+        lodepng_free(dummy);
+        /*check if this is smallest size (or if type == 0 it's the first case so always store the values)*/
+        if(type == 0 || size[type] < smallest) {
+          bestType = type;
+          smallest = size[type];
+        }
+      }
+      prevline = &in[y * linebytes];
+      out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+      for(x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+    }
+    for(type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+  }
+  else return 88; /* unknown filter strategy */
+
+  return error;
+}
+
+/*The opposite of the removePaddingBits function: copies h rows of ilinebits bits each from in to out and
+follows every row with olinebits - ilinebits zero bits. olinebits must be >= ilinebits.*/
+static void addPaddingBits(unsigned char* out, const unsigned char* in,
+                           size_t olinebits, size_t ilinebits, unsigned h) {
+  const size_t padbits = olinebits - ilinebits;
+  size_t writepos = 0, readpos = 0; /*bit pointers, handed by address to the stream helpers*/
+  unsigned row;
+  for(row = 0; row != h; ++row) {
+    size_t n;
+    for(n = 0; n < ilinebits; ++n) {
+      unsigned char bit = readBitFromReversedStream(&readpos, in);
+      setBitOfReversedStream(&writepos, out, bit);
+    }
+    /*the padding is written as explicit zero bits instead of being skipped, to avoid
+    "Use of uninitialised value of size ###" warning from valgrind*/
+    for(n = 0; n != padbits; ++n) setBitOfReversedStream(&writepos, out, 0);
+  }
+}
+
+/*
+Re-orders the pixels of a non-interlaced image into the seven Adam7 passes.
+in: non-interlaced image with size w*h. It has no padding bits at all, not between
+ scanlines and not anywhere else: its size in bits is exactly w * h * bpp.
+out: the same pixels, but re-ordered according to PNG's Adam7 interlacing, with
+ no padding bits between scanlines, but with padding between reduced images so that
+ each reduced image starts at a byte. out is therefore possibly bigger than in.
+bpp: bits per pixel
+w, h: width and height of the image in pixels
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_interlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp) {
+  unsigned passw[7], passh[7]; /*used below as the pixel width and height of each pass*/
+  size_t filter_passstart[8], padded_passstart[8], passstart[8]; /*only passstart is read in this function*/
+  unsigned i;
+
+  Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+  if(bpp >= 8) { /*whole-byte pixels: copy bytewidth bytes per pixel*/
+    for(i = 0; i != 7; ++i) {
+      unsigned x, y, b;
+      size_t bytewidth = bpp / 8;
+      for(y = 0; y < passh[i]; ++y)
+      for(x = 0; x < passw[i]; ++x) {
+        size_t pixelinstart = ((ADAM7_IY[i] + y * ADAM7_DY[i]) * w + ADAM7_IX[i] + x * ADAM7_DX[i]) * bytewidth;
+        size_t pixeloutstart = passstart[i] + (y * passw[i] + x) * bytewidth; /*passstart[i]: byte offset of pass i in out*/
+        for(b = 0; b < bytewidth; ++b) {
+          out[pixeloutstart + b] = in[pixelinstart + b];
+        }
+      }
+    }
+  } else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/ {
+    for(i = 0; i != 7; ++i) {
+      unsigned x, y, b;
+      unsigned ilinebits = bpp * passw[i]; /*bits per row of pass i (the row stride in out)*/
+      unsigned olinebits = bpp * w; /*bits per row of the full image (the row stride in in)*/
+      size_t obp, ibp; /*bit pointers (for out and in buffer)*/
+      for(y = 0; y < passh[i]; ++y)
+      for(x = 0; x < passw[i]; ++x) {
+        ibp = (ADAM7_IY[i] + y * ADAM7_DY[i]) * olinebits + (ADAM7_IX[i] + x * ADAM7_DX[i]) * bpp;
+        obp = (8 * passstart[i]) + (y * ilinebits + x * bpp); /*pass start in bits plus the offset inside the pass*/
+        for(b = 0; b < bpp; ++b) { /*copy the pixel one bit at a time*/
+          unsigned char bit = readBitFromReversedStream(&ibp, in);
+          setBitOfReversedStream(&obp, out, bit);
+        }
+      }
+    }
+  }
+}
+
+/*Turns the image in (w x h pixels in info_png's color type) into the uncompressed IDAT chunk data.
+*out is allocated here with lodepng_malloc and is not freed by this function. return value is error**/
+static unsigned preProcessScanlines(unsigned char** out, size_t* outsize, const unsigned char* in,
+                                    unsigned w, unsigned h,
+                                    const LodePNGInfo* info_png, const LodePNGEncoderSettings* settings) {
+  /*
+  This function converts the pure 2D image with the PNG's colortype, into filtered-padded-interlaced data. Steps:
+  *) if no Adam7: 1) add padding bits (= posible extra bits per scanline if bpp < 8) 2) filter
+  *) if adam7: 1) Adam7_interlace 2) 7x add padding bits 3) 7x filter
+  */
+  unsigned bpp = lodepng_get_bpp(&info_png->color);
+  unsigned error = 0;
+
+  if(info_png->interlace_method == 0) {
+    /*image size plus an extra byte per scanline + possible padding bits; size_t math so big images cannot wrap*/
+    *outsize = (size_t)h + ((size_t)h * (((size_t)w * bpp + 7u) / 8u));
+    *out = (unsigned char*)lodepng_malloc(*outsize);
+    if(!(*out) && (*outsize)) error = 83; /*alloc fail*/
+    if(!error) {
+      /*non multiple of 8 bits per scanline, padding bits needed per scanline*/
+      if(bpp < 8 && (size_t)w * bpp != (((size_t)w * bpp + 7u) / 8u) * 8u) {
+        unsigned char* padded = (unsigned char*)lodepng_malloc((size_t)h * (((size_t)w * bpp + 7u) / 8u));
+        if(!padded) error = 83; /*alloc fail*/
+        if(!error) {
+          addPaddingBits(padded, in, (((size_t)w * bpp + 7u) / 8u) * 8u, (size_t)w * bpp, h);
+          error = filter(*out, padded, w, h, &info_png->color, settings);
+        }
+        lodepng_free(padded);
+      } else {
+        /*we can immediately filter into the out buffer, no other steps needed*/
+        error = filter(*out, in, w, h, &info_png->color, settings);
+      }
+    }
+  } else /*interlace_method is 1 (Adam7)*/ {
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned char* adam7;
+
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    *outsize = filter_passstart[7]; /*image size plus an extra byte per scanline + possible padding bits*/
+    *out = (unsigned char*)lodepng_malloc(*outsize);
+    if(!(*out)) error = 83; /*alloc fail*/
+
+    adam7 = (unsigned char*)lodepng_malloc(passstart[7]);
+    if(!adam7 && passstart[7]) error = 83; /*alloc fail*/
+
+    if(!error) {
+      unsigned i;
+
+      Adam7_interlace(adam7, in, w, h, bpp);
+      for(i = 0; i != 7; ++i) {
+        if(bpp < 8) {
+          unsigned char* padded = (unsigned char*)lodepng_malloc(padded_passstart[i + 1] - padded_passstart[i]);
+          if(!padded) ERROR_BREAK(83); /*alloc fail*/
+          addPaddingBits(padded, &adam7[passstart[i]],
+                         ((passw[i] * bpp + 7) / 8) * 8, passw[i] * bpp, passh[i]);
+          error = filter(&(*out)[filter_passstart[i]], padded,
+                         passw[i], passh[i], &info_png->color, settings);
+          lodepng_free(padded);
+        } else {
+          error = filter(&(*out)[filter_passstart[i]], &adam7[padded_passstart[i]],
+                         passw[i], passh[i], &info_png->color, settings);
+        }
+
+        if(error) break;
+      }
+    }
+
+    lodepng_free(adam7);
+  }
+
+  return error;
+}
+
+/*
+palette must have 4 * palettesize bytes allocated, and given in format RGBARGBARGBARGBA...
+returns 0 if the palette is opaque,
+returns 1 if exactly one RGB value has alpha 0 and every other entry is opaque ==> color key
+returns 2 otherwise: an alpha other than 0 or 255, several transparent colors, or the key's RGB also opaque.
+*/
+static unsigned getPaletteTranslucency(const unsigned char* palette, size_t palettesize) {
+  size_t i;
+  unsigned key = 0;
+  unsigned r = 0, g = 0, b = 0; /*the value of the color with alpha 0, so long as color keying is possible*/
+  for(i = 0; i != palettesize; ++i) {
+    const unsigned char* c = &palette[4 * i]; /*the R, G, B, A bytes of entry i*/
+    if(!key && c[3] == 0) {
+      r = c[0]; g = c[1]; b = c[2];
+      key = 1;
+      i = (size_t)(-1); /*restart from beginning, to detect earlier opaque colors with key's value*/
+    }
+    else if(c[3] != 0 && c[3] != 255) return 2; /*the key entry itself (alpha 0) must not end up here*/
+    /*when key, an entry must be transparent exactly when it has key's RGB*/
+    else if(key && (r == c[0] && g == c[1] && b == c[2]) != (c[3] == 0)) return 2;
+  }
+  return key;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*Appends every chunk stored back to back in the datasize bytes at data to out; returns 0 once all are appended.*/
+static unsigned addUnknownChunks(ucvector* out, unsigned char* data, size_t datasize) {
+  unsigned char* chunk;
+  for(chunk = data; (size_t)(chunk - data) < datasize; chunk = lodepng_chunk_next(chunk)) {
+    CERROR_TRY_RETURN(lodepng_chunk_append(&out->data, &out->size, chunk));
+    out->allocsize = out->size; /*fix the allocsize again*/
+  }
+  return 0;
+}
+
+static unsigned isGrayICCProfile(const unsigned char* profile, unsigned size) {
+  /*
+  Returns 1 when bytes 16-19 of the ICC profile spell "GRAY", 0 otherwise, including when the profile
+  is shorter than 20 bytes. Nothing else in the profile is parsed or validated. The check exists because
+  the PNG specification requires a gray color model with a "GRAY" profile and a non-gray one with "RGB ",
+  which sadly limits compression opportunities when the input data is grayscale RGB data.
+  */
+  static const unsigned char tag[4] = {'G', 'R', 'A', 'Y'};
+  unsigned k;
+  if(size < 20) return 0;
+  for(k = 0; k != 4; ++k) if(profile[16 + k] != tag[k]) return 0;
+  return 1;
+}
+
+static unsigned isRGBICCProfile(const unsigned char* profile, unsigned size) {
+  /*1 when bytes 16-19 of the ICC profile are "RGB " (trailing space included); profiles under 20 bytes give 0*/
+  return size >= 20
+      && profile[16] == 'R' && profile[17] == 'G' && profile[18] == 'B' && profile[19] == ' ';
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+                        const unsigned char* image, unsigned w, unsigned h,
+                        LodePNGState* state) {
+  unsigned char* data = 0; /*uncompressed version of the IDAT chunk data*/
+  size_t datasize = 0;
+  ucvector outv;
+  LodePNGInfo info;
+
+  ucvector_init(&outv);
+  lodepng_info_init(&info);
+
+  /*provide some proper output values if error will happen*/
+  *out = 0;
+  *outsize = 0;
+  state->error = 0;
+
+  /*check input values validity*/
+  if((state->info_png.color.colortype == LCT_PALETTE || state->encoder.force_palette)
+      && (state->info_png.color.palettesize == 0 || state->info_png.color.palettesize > 256)) {
+    state->error = 68; /*invalid palette size, it is only allowed to be 1-256*/
+    goto cleanup;
+  }
+  if(state->encoder.zlibsettings.btype > 2) {
+    state->error = 61; /*error: unexisting btype*/
+    goto cleanup;
+  }
+  if(state->info_png.interlace_method > 1) {
+    state->error = 71; /*error: unexisting interlace mode*/
+    goto cleanup;
+  }
+  state->error = checkColorValidity(state->info_png.color.colortype, state->info_png.color.bitdepth);
+  if(state->error) goto cleanup; /*error: unexisting color type given*/
+  state->error = checkColorValidity(state->info_raw.colortype, state->info_raw.bitdepth);
+  if(state->error) goto cleanup; /*error: unexisting color type given*/
+
+  /* color convert and compute scanline filter types */
+  lodepng_info_copy(&info, &state->info_png);
+  if(state->encoder.auto_convert) {
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    if(state->info_png.background_defined) {
+      unsigned bg_r = state->info_png.background_r;
+      unsigned bg_g = state->info_png.background_g;
+      unsigned bg_b = state->info_png.background_b;
+      unsigned r = 0, g = 0, b = 0;
+      LodePNGColorProfile prof;
+      LodePNGColorMode mode16 = lodepng_color_mode_make(LCT_RGB, 16);
+      lodepng_convert_rgb(&r, &g, &b, bg_r, bg_g, bg_b, &mode16, &state->info_png.color);
+      lodepng_color_profile_init(&prof);
+      state->error = lodepng_get_color_profile(&prof, image, w, h, &state->info_raw);
+      if(state->error) goto cleanup;
+      lodepng_color_profile_add(&prof, r, g, b, 65535);
+      state->error = auto_choose_color_from_profile(&info.color, &state->info_raw, &prof);
+      if(state->error) goto cleanup;
+      if(lodepng_convert_rgb(&info.background_r, &info.background_g, &info.background_b,
+          bg_r, bg_g, bg_b, &info.color, &state->info_png.color)) {
+        state->error = 104;
+        goto cleanup;
+      }
+    }
+    else
+#endif /* LODEPNG_COMPILE_ANCILLARY_CHUNKS */
+    {
+      state->error = lodepng_auto_choose_color(&info.color, image, w, h, &state->info_raw);
+      if(state->error) goto cleanup;
+    }
+  }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  if(state->info_png.iccp_defined) {
+    unsigned gray_icc = isGrayICCProfile(state->info_png.iccp_profile, state->info_png.iccp_profile_size);
+    unsigned gray_png = info.color.colortype == LCT_GREY || info.color.colortype == LCT_GREY_ALPHA;
+    /* TODO: perhaps instead of giving errors or less optimal compression, we can automatically modify
+    the ICC profile here to say "GRAY" or "RGB " to match the PNG color type, unless this will require
+    non trivial changes to the rest of the ICC profile */
+    if(!gray_icc && !isRGBICCProfile(state->info_png.iccp_profile, state->info_png.iccp_profile_size)) {
+      state->error = 100; /* Disallowed profile color type for PNG */
+      goto cleanup;
+    }
+    if(!state->encoder.auto_convert && gray_icc != gray_png) {
+      /* Non recoverable: encoder not allowed to convert color type, and requested color type not
+      compatible with ICC color type */
+      state->error = 101;
+      goto cleanup;
+    }
+    if(gray_icc && !gray_png) {
+      /* Non recoverable: trying to set grayscale ICC profile while colored pixels were given */
+      state->error = 102;
+      goto cleanup;
+      /* NOTE: this relies on the fact that lodepng_auto_choose_color never returns palette for grayscale pixels */
+    }
+    if(!gray_icc && gray_png) {
+      /* Recoverable but an unfortunate loss in compression density: We have grayscale pixels but
+      are forced to store them in more expensive RGB format that will repeat each value 3 times
+      because the PNG spec does not allow an RGB ICC profile with internal grayscale color data */
+      if(info.color.colortype == LCT_GREY) info.color.colortype = LCT_RGB;
+      if(info.color.colortype == LCT_GREY_ALPHA) info.color.colortype = LCT_RGBA;
+      if(info.color.bitdepth < 8) info.color.bitdepth = 8;
+    }
+  }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+  if(!lodepng_color_mode_equal(&state->info_raw, &info.color)) {
+    unsigned char* converted;
+    size_t size = ((size_t)w * (size_t)h * (size_t)lodepng_get_bpp(&info.color) + 7) / 8;
+
+    converted = (unsigned char*)lodepng_malloc(size);
+    if(!converted && size) state->error = 83; /*alloc fail*/
+    if(!state->error) {
+      state->error = lodepng_convert(converted, image, &info.color, &state->info_raw, w, h);
+    }
+    if(!state->error) preProcessScanlines(&data, &datasize, converted, w, h, &info, &state->encoder);
+    lodepng_free(converted);
+    if(state->error) goto cleanup;
+  }
+  else preProcessScanlines(&data, &datasize, image, w, h, &info, &state->encoder);
+
+  /* output all PNG chunks */ {
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    size_t i;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    /*write signature and chunks*/
+    writeSignature(&outv);
+    /*IHDR*/
+    addChunk_IHDR(&outv, w, h, info.color.colortype, info.color.bitdepth, info.interlace_method);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*unknown chunks between IHDR and PLTE*/
+    if(info.unknown_chunks_data[0]) {
+      state->error = addUnknownChunks(&outv, info.unknown_chunks_data[0], info.unknown_chunks_size[0]);
+      if(state->error) goto cleanup;
+    }
+    /*color profile chunks must come before PLTE */
+    if(info.iccp_defined) addChunk_iCCP(&outv, &info, &state->encoder.zlibsettings);
+    if(info.srgb_defined) addChunk_sRGB(&outv, &info);
+    if(info.gama_defined) addChunk_gAMA(&outv, &info);
+    if(info.chrm_defined) addChunk_cHRM(&outv, &info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    /*PLTE*/
+    if(info.color.colortype == LCT_PALETTE) {
+      addChunk_PLTE(&outv, &info.color);
+    }
+    if(state->encoder.force_palette && (info.color.colortype == LCT_RGB || info.color.colortype == LCT_RGBA)) {
+      addChunk_PLTE(&outv, &info.color);
+    }
+    /*tRNS*/
+    if(info.color.colortype == LCT_PALETTE && getPaletteTranslucency(info.color.palette, info.color.palettesize) != 0) {
+      addChunk_tRNS(&outv, &info.color);
+    }
+    if((info.color.colortype == LCT_GREY || info.color.colortype == LCT_RGB) && info.color.key_defined) {
+      addChunk_tRNS(&outv, &info.color);
+    }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*bKGD (must come between PLTE and the IDAt chunks*/
+    if(info.background_defined) {
+      state->error = addChunk_bKGD(&outv, &info);
+      if(state->error) goto cleanup;
+    }
+    /*pHYs (must come before the IDAT chunks)*/
+    if(info.phys_defined) addChunk_pHYs(&outv, &info);
+
+    /*unknown chunks between PLTE and IDAT*/
+    if(info.unknown_chunks_data[1]) {
+      state->error = addUnknownChunks(&outv, info.unknown_chunks_data[1], info.unknown_chunks_size[1]);
+      if(state->error) goto cleanup;
+    }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    /*IDAT (multiple IDAT chunks must be consecutive)*/
+    state->error = addChunk_IDAT(&outv, data, datasize, &state->encoder.zlibsettings);
+    if(state->error) goto cleanup;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*tIME*/
+    if(info.time_defined) addChunk_tIME(&outv, &info.time);
+    /*tEXt and/or zTXt*/
+    for(i = 0; i != info.text_num; ++i) {
+      if(strlen(info.text_keys[i]) > 79) {
+        state->error = 66; /*text chunk too large*/
+        goto cleanup;
+      }
+      if(strlen(info.text_keys[i]) < 1) {
+        state->error = 67; /*text chunk too small*/
+        goto cleanup;
+      }
+      if(state->encoder.text_compression) {
+        addChunk_zTXt(&outv, info.text_keys[i], info.text_strings[i], &state->encoder.zlibsettings);
+      } else {
+        addChunk_tEXt(&outv, info.text_keys[i], info.text_strings[i]);
+      }
+    }
+    /*LodePNG version id in text chunk*/
+    if(state->encoder.add_id) {
+      unsigned already_added_id_text = 0;
+      for(i = 0; i != info.text_num; ++i) {
+        if(!strcmp(info.text_keys[i], "LodePNG")) {
+          already_added_id_text = 1;
+          break;
+        }
+      }
+      if(already_added_id_text == 0) {
+        addChunk_tEXt(&outv, "LodePNG", LODEPNG_VERSION_STRING); /*it's shorter as tEXt than as zTXt chunk*/
+      }
+    }
+    /*iTXt*/
+    for(i = 0; i != info.itext_num; ++i) {
+      if(strlen(info.itext_keys[i]) > 79) {
+        state->error = 66; /*text chunk too large*/
+        goto cleanup;
+      }
+      if(strlen(info.itext_keys[i]) < 1) {
+        state->error = 67; /*text chunk too small*/
+        goto cleanup;
+      }
+      addChunk_iTXt(&outv, state->encoder.text_compression,
+                    info.itext_keys[i], info.itext_langtags[i], info.itext_transkeys[i], info.itext_strings[i],
+                    &state->encoder.zlibsettings);
+    }
+
+    /*unknown chunks between IDAT and IEND*/
+    if(info.unknown_chunks_data[2]) {
+      state->error = addUnknownChunks(&outv, info.unknown_chunks_data[2], info.unknown_chunks_size[2]);
+      if(state->error) goto cleanup;
+    }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    addChunk_IEND(&outv);
+  }
+
+cleanup:
+  lodepng_info_cleanup(&info);
+  lodepng_free(data);
+
+  /*instead of cleaning the vector up, give it to the output*/
+  *out = outv.data;
+  *outsize = outv.size;
+
+  return state->error;
+}
+
+/*Encodes raw pixels of the given colortype/bitdepth to a PNG in memory, using a temporary LodePNGState.*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize, const unsigned char* image,
+                               unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth) {
+  unsigned result;
+  LodePNGState st;
+  lodepng_state_init(&st);
+  /*the raw input format and the initial PNG color mode both get the caller's colortype and bitdepth*/
+  st.info_png.color.colortype = st.info_raw.colortype = colortype;
+  st.info_png.color.bitdepth = st.info_raw.bitdepth = bitdepth;
+  lodepng_encode(out, outsize, image, w, h, &st);
+  result = st.error; /*read the error code before the state is cleaned up*/
+  lodepng_state_cleanup(&st);
+  return result;
+}
+
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h) {
+  return lodepng_encode_memory(out, outsize, image, w, h, LCT_RGBA, 8); /*raw input is RGBA with 8 bits per channel*/
+}
+
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h) {
+  return lodepng_encode_memory(out, outsize, image, w, h, LCT_RGB, 8); /*raw input is RGB with 8 bits per channel*/
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned lodepng_encode_file(const char* filename, const unsigned char* image, unsigned w, unsigned h,
+                             LodePNGColorType colortype, unsigned bitdepth) {
+  unsigned char* buffer = 0; /*null-initialized so lodepng_free below never sees an indeterminate pointer*/
+  size_t buffersize = 0;
+  unsigned error = lodepng_encode_memory(&buffer, &buffersize, image, w, h, colortype, bitdepth);
+  if(!error) error = lodepng_save_file(buffer, buffersize, filename); /*the file is only written if encoding succeeded*/
+  lodepng_free(buffer); /*the encoded buffer is released on success and on failure alike*/
+  return error;
+}
+
+unsigned lodepng_encode32_file(const char* filename, const unsigned char* image, unsigned w, unsigned h) {
+  return lodepng_encode_file(filename, image, w, h, LCT_RGBA, 8); /*raw input is RGBA with 8 bits per channel*/
+}
+
+unsigned lodepng_encode24_file(const char* filename, const unsigned char* image, unsigned w, unsigned h) {
+  return lodepng_encode_file(filename, image, w, h, LCT_RGB, 8); /*raw input is RGB with 8 bits per channel*/
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings) {
+  lodepng_compress_settings_init(&settings->zlibsettings); /*defaults of the nested compression settings*/
+  /*color conversion and scanline filter defaults*/
+  settings->auto_convert = 1;
+  settings->filter_palette_zero = 1;
+  settings->filter_strategy = LFS_MINSUM;
+  settings->force_palette = 0; settings->predefined_filters = 0; /*no forced palette, no predefined filters*/
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  settings->text_compression = 1;
+  settings->add_id = 0;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*
+Translates a numerical error code into its English description. The cases below double as the documentation of all the error codes.
+*/
+const char* lodepng_error_text(unsigned code) {
+  const char* text = "unknown error code"; /*result for any code that has no case below*/
+  switch(code) {
+    case 0: text = "no error, everything went ok"; break;
+    case 1: text = "nothing done yet"; break; /*the Encoder/Decoder has done nothing yet, error checking makes no sense yet*/
+    case 10: text = "end of input memory reached without huffman end code"; break; /*while huffman decoding*/
+    case 11: text = "error in code tree made it jump outside of huffman tree"; break; /*while huffman decoding*/
+    case 13: text = "problem while processing dynamic deflate block"; break;
+    case 14: text = "problem while processing dynamic deflate block"; break;
+    case 15: text = "problem while processing dynamic deflate block"; break;
+    case 16: text = "unexisting code while processing dynamic deflate block"; break;
+    case 17: text = "end of out buffer memory reached while inflating"; break;
+    case 18: text = "invalid distance code while inflating"; break;
+    case 19: text = "end of out buffer memory reached while inflating"; break;
+    case 20: text = "invalid deflate block BTYPE encountered while decoding"; break;
+    case 21: text = "NLEN is not ones complement of LEN in a deflate block"; break;
+
+    /*end of out buffer memory reached while inflating:
+    This can happen if the inflated deflate data is longer than the amount of bytes required to fill up
+    all the pixels of the image, given the color depth and image dimensions. Something that doesn't
+    happen in a normal, well encoded, PNG image.*/
+    case 22: text = "end of out buffer memory reached while inflating"; break;
+    case 23: text = "end of in buffer memory reached while inflating"; break;
+    case 24: text = "invalid FCHECK in zlib header"; break;
+    case 25: text = "invalid compression method in zlib header"; break;
+    case 26: text = "FDICT encountered in zlib header while it's not used for PNG"; break;
+    case 27: text = "PNG file is smaller than a PNG header"; break;
+    /*Checks the magic file header, the first 8 bytes of the PNG file*/
+    case 28: text = "incorrect PNG signature, it's no PNG or corrupted"; break;
+    case 29: text = "first chunk is not the header chunk"; break;
+    case 30: text = "chunk length too large, chunk broken off at end of file"; break;
+    case 31: text = "illegal PNG color type or bpp"; break;
+    case 32: text = "illegal PNG compression method"; break;
+    case 33: text = "illegal PNG filter method"; break;
+    case 34: text = "illegal PNG interlace method"; break;
+    case 35: text = "chunk length of a chunk is too large or the chunk too small"; break;
+    case 36: text = "illegal PNG filter type encountered"; break;
+    case 37: text = "illegal bit depth for this color type given"; break;
+    case 38: text = "the palette is too big"; break; /*more than 256 colors*/
+    case 39: text = "tRNS chunk before PLTE or has more entries than palette size"; break;
+    case 40: text = "tRNS chunk has wrong size for grayscale image"; break;
+    case 41: text = "tRNS chunk has wrong size for RGB image"; break;
+    case 42: text = "tRNS chunk appeared while it was not allowed for this color type"; break;
+    case 43: text = "bKGD chunk has wrong size for palette image"; break;
+    case 44: text = "bKGD chunk has wrong size for grayscale image"; break;
+    case 45: text = "bKGD chunk has wrong size for RGB image"; break;
+    case 48: text = "empty input buffer given to decoder. Maybe caused by non-existing file?"; break;
+    case 49: text = "jumped past memory while generating dynamic huffman tree"; break;
+    case 50: text = "jumped past memory while generating dynamic huffman tree"; break;
+    case 51: text = "jumped past memory while inflating huffman block"; break;
+    case 52: text = "jumped past memory while inflating"; break;
+    case 53: text = "size of zlib data too small"; break;
+    case 54: text = "repeat symbol in tree while there was no value symbol yet"; break;
+    /*jumped past tree while generating huffman tree, this could be when the
+    tree will have more leaves than symbols after generating it out of the
+    given lengths. They call this an oversubscribed dynamic bit lengths tree in zlib.*/
+    case 55: text = "jumped past tree while generating huffman tree"; break;
+    case 56: text = "given output image colortype or bitdepth not supported for color conversion"; break;
+    case 57: text = "invalid CRC encountered (checking CRC can be disabled)"; break;
+    case 58: text = "invalid ADLER32 encountered (checking ADLER32 can be disabled)"; break;
+    case 59: text = "requested color conversion not supported"; break;
+    case 60: text = "invalid window size given in the settings of the encoder (must be 0-32768)"; break;
+    case 61: text = "invalid BTYPE given in the settings of the encoder (only 0, 1 and 2 are allowed)"; break;
+    /*LodePNG leaves the choice of RGB to grayscale conversion formula to the user.*/
+    case 62: text = "conversion from color to grayscale not supported"; break;
+    /*(2^31-1)*/
+    case 63: text = "length of a chunk too long, max allowed for PNG is 2147483647 bytes per chunk"; break;
+    /*this would result in the inability of a deflated block to ever contain an end code. It must be at least 1.*/
+    case 64: text = "the length of the END symbol 256 in the Huffman tree is 0"; break;
+    case 66: text = "the length of a text chunk keyword given to the encoder is longer than the maximum of 79 bytes"; break;
+    case 67: text = "the length of a text chunk keyword given to the encoder is smaller than the minimum of 1 byte"; break;
+    case 68: text = "tried to encode a PLTE chunk with a palette that has less than 1 or more than 256 colors"; break;
+    case 69: text = "unknown chunk type with 'critical' flag encountered by the decoder"; break;
+    case 71: text = "unexisting interlace mode given to encoder (must be 0 or 1)"; break;
+    case 72: text = "while decoding, unexisting compression method encountering in zTXt or iTXt chunk (it must be 0)"; break;
+    case 73: text = "invalid tIME chunk size"; break;
+    case 74: text = "invalid pHYs chunk size"; break;
+    /*length could be wrong, or data chopped off*/
+    case 75: text = "no null termination char found while decoding text chunk"; break;
+    case 76: text = "iTXt chunk too short to contain required bytes"; break;
+    case 77: text = "integer overflow in buffer size"; break;
+    case 78: text = "failed to open file for reading"; break; /*file doesn't exist or couldn't be opened for reading*/
+    case 79: text = "failed to open file for writing"; break;
+    case 80: text = "tried creating a tree of 0 symbols"; break;
+    case 81: text = "lazy matching at pos 0 is impossible"; break;
+    case 82: text = "color conversion to palette requested while a color isn't in palette, or index out of bounds"; break;
+    case 83: text = "memory allocation failed"; break;
+    case 84: text = "given image too small to contain all pixels to be encoded"; break;
+    case 86: text = "impossible offset in lz77 encoding (internal bug)"; break;
+    case 87: text = "must provide custom zlib function pointer if LODEPNG_COMPILE_ZLIB is not defined"; break;
+    case 88: text = "invalid filter strategy given for LodePNGEncoderSettings.filter_strategy"; break;
+    case 89: text = "text chunk keyword too short or long: must have size 1-79"; break;
+    /*the windowsize in the LodePNGCompressSettings. Requiring POT(==> & instead of %) makes encoding 12% faster.*/
+    case 90: text = "windowsize must be a power of two"; break;
+    case 91: text = "invalid decompressed idat size"; break;
+    case 92: text = "integer overflow due to too many pixels"; break;
+    case 93: text = "zero width or height is invalid"; break;
+    case 94: text = "header chunk must have a size of 13 bytes"; break;
+    case 95: text = "integer overflow with combined idat chunk size"; break;
+    case 96: text = "invalid gAMA chunk size"; break;
+    case 97: text = "invalid cHRM chunk size"; break;
+    case 98: text = "invalid sRGB chunk size"; break;
+    case 99: text = "invalid sRGB rendering intent"; break;
+    case 100: text = "invalid ICC profile color type, the PNG specification only allows RGB or GRAY"; break;
+    case 101: text = "PNG specification does not allow RGB ICC profile on gray color types and vice versa"; break;
+    case 102: text = "not allowed to set grayscale ICC profile with colored pixels by PNG specification"; break;
+    case 103: text = "invalid palette index in bKGD chunk. Maybe it came before PLTE chunk?"; break;
+    case 104: text = "invalid bKGD color while encoding (e.g. palette index out of range)"; break;
+  }
+  return text;
+}
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // C++ Wrapper                                                          // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng {
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename) {
+  const long nbytes = lodepng_filesize(filename.c_str());
+  if(nbytes < 0) return 78; /*error 78: failed to open file for reading*/
+  buffer.resize((size_t)nbytes);
+  return nbytes > 0 ? lodepng_buffer_file(&buffer[0], (size_t)nbytes, filename.c_str()) : 0; /*empty file: nothing to read*/
+}
+
+/*Write the given buffer to the file, overwriting the file; it doesn't append to it. Returns the code from lodepng_save_file.*/
+unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename) {
+  return lodepng_save_file(buffer.empty() ? 0 : &buffer[0], buffer.size(), filename.c_str()); /*empty vector: null pointer, size 0*/
+}
+#endif /* LODEPNG_COMPILE_DISK */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+                    const LodePNGDecompressSettings& settings) {
+  unsigned char* inflated = 0;
+  size_t inflated_size = 0;
+  const unsigned status = zlib_decompress(&inflated, &inflated_size, in, insize, &settings);
+  if(!inflated) return status; /*no buffer was produced, so there is nothing to append or free*/
+  /*append whatever was produced to out (the status is not consulted here), then release the C buffer*/
+  out.insert(out.end(), inflated, inflated + inflated_size);
+  lodepng_free(inflated);
+  return status;
+}
+
+unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+                    const LodePNGDecompressSettings& settings) {
+  return decompress(out, in.empty() ? 0 : &in[0], in.size(), settings); /*forwards to the pointer overload; empty input: null pointer, size 0*/
+}
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+                  const LodePNGCompressSettings& settings) {
+  unsigned char* deflated = 0;
+  size_t deflated_size = 0;
+  const unsigned status = zlib_compress(&deflated, &deflated_size, in, insize, &settings);
+  if(!deflated) return status; /*no buffer was produced, so there is nothing to append or free*/
+  /*append whatever was produced to out (the status is not consulted here), then release the C buffer*/
+  out.insert(out.end(), deflated, deflated + deflated_size);
+  lodepng_free(deflated);
+  return status;
+}
+
+unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+                  const LodePNGCompressSettings& settings) {
+  return compress(out, in.empty() ? 0 : &in[0], in.size(), settings); /*forwards to the pointer overload; empty input: null pointer, size 0*/
+}
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+
+
+#ifdef LODEPNG_COMPILE_PNG
+
+State::State() {
+  lodepng_state_init(this); /*default construction just runs the C initializer on this object*/
+}
+
+State::State(const State& other) {
+  lodepng_state_init(this); /*initialize first, so the copy below operates on an initialized destination*/
+  lodepng_state_copy(this, &other);
+}
+
+State::~State() {
+  lodepng_state_cleanup(this); /*destruction runs the C cleanup function on this object*/
+}
+
+State& State::operator=(const State& other) {
+  if(this != &other) lodepng_state_copy(this, &other); /*self-assignment: nothing to copy, and source == dest is never passed on*/
+  return *this;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const unsigned char* in,
+                size_t insize, LodePNGColorType colortype, unsigned bitdepth) {
+  unsigned char* buffer = 0; /*null-initialized: it is tested and freed below even if the decoder never sets it*/
+  unsigned error = lodepng_decode_memory(&buffer, &w, &h, in, insize, colortype, bitdepth);
+  if(buffer && !error) {
+    State state; /*only used to describe the raw format for the size computation*/
+    state.info_raw.colortype = colortype;
+    state.info_raw.bitdepth = bitdepth;
+    size_t buffersize = lodepng_get_raw_size(w, h, &state.info_raw);
+    out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+  }
+  lodepng_free(buffer); /*released on every path, as in the State-based overload, so an error cannot leak it*/
+  return error;
+}
+
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                const std::vector<unsigned char>& in, LodePNGColorType colortype, unsigned bitdepth) {
+  return decode(out, w, h, in.empty() ? 0 : &in[0], in.size(), colortype, bitdepth); /*insize is size_t: pass the size without narrowing*/
+}
+
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                State& state,
+                const unsigned char* in, size_t insize) {
+  unsigned char* pixels = NULL;
+  const unsigned status = lodepng_decode(&pixels, &w, &h, &state, in, insize);
+  if(status == 0 && pixels != NULL) { /*only a successful decode appends pixels to out*/
+    const size_t nbytes = lodepng_get_raw_size(w, h, &state.info_raw);
+    out.insert(out.end(), pixels, pixels + nbytes);
+  }
+  lodepng_free(pixels); /*freed unconditionally, including when it is still NULL*/
+  return status;
+}
+
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                State& state,
+                const std::vector<unsigned char>& in) {
+  return decode(out, w, h, state, in.empty() ? 0 : &in[0], in.size()); /*forwards to the pointer overload; empty input: null pointer, size 0*/
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const std::string& filename,
+                LodePNGColorType colortype, unsigned bitdepth) {
+  std::vector<unsigned char> png;
+  w = 0; /* the dimensions stay zero if the file cannot be loaded */
+  h = 0;
+  const unsigned load_error = load_file(png, filename);
+  /* a load failure is returned as-is; otherwise the loaded bytes are decoded */
+  return load_error ? load_error : decode(out, w, h, png, colortype, bitdepth);
+}
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+unsigned encode(std::vector<unsigned char>& out, const unsigned char* in, unsigned w, unsigned h,
+                LodePNGColorType colortype, unsigned bitdepth) {
+  unsigned char* buffer = 0; /*null-initialized, as in compress(): the check below must never read an indeterminate pointer*/
+  size_t buffersize = 0;
+  unsigned error = lodepng_encode_memory(&buffer, &buffersize, in, w, h, colortype, bitdepth);
+  if(buffer) { /*any output that exists is appended even when error is nonzero; callers must check the return value*/
+    out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+    lodepng_free(buffer);
+  }
+  return error;
+}
+
+unsigned encode(std::vector<unsigned char>& out, const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                LodePNGColorType colortype, unsigned bitdepth) {
+  const size_t needed = lodepng_get_raw_size_lct(w, h, colortype, bitdepth);
+  if(in.size() < needed) return 84; /*error 84: given image too small to contain all pixels to be encoded*/
+  return encode(out, in.empty() ? 0 : &in[0], w, h, colortype, bitdepth);
+}
+
+unsigned encode(std::vector<unsigned char>& out,
+                const unsigned char* in, unsigned w, unsigned h,
+                State& state) {
+  unsigned char* buffer = 0; /*null-initialized, as in compress(): the check below must never read an indeterminate pointer*/
+  size_t buffersize = 0;
+  unsigned error = lodepng_encode(&buffer, &buffersize, in, w, h, &state);
+  if(buffer) { /*any output that exists is appended even when error is nonzero; callers must check the return value*/
+    out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+    lodepng_free(buffer);
+  }
+  return error;
+}
+
+unsigned encode(std::vector<unsigned char>& out, const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                State& state) {
+  const size_t needed = lodepng_get_raw_size(w, h, &state.info_raw);
+  if(in.size() < needed) return 84; /*error 84: given image too small to contain all pixels to be encoded*/
+  return encode(out, in.empty() ? 0 : &in[0], w, h, state);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned encode(const std::string& filename,
+                const unsigned char* in, unsigned w, unsigned h,
+                LodePNGColorType colortype, unsigned bitdepth) {
+  std::vector<unsigned char> png;
+  const unsigned encode_error = encode(png, in, w, h, colortype, bitdepth);
+  if(encode_error) return encode_error; /*nothing is written to disk when encoding failed*/
+  return save_file(png, filename);
+}
+
+unsigned encode(const std::string& filename, const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                LodePNGColorType colortype, unsigned bitdepth) {
+  const size_t needed = lodepng_get_raw_size_lct(w, h, colortype, bitdepth);
+  if(in.size() < needed) return 84; /*error 84: given image too small to contain all pixels to be encoded*/
+  return encode(filename, in.empty() ? 0 : &in[0], w, h, colortype, bitdepth);
+}
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_PNG */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
diff --git a/thirdparty/basis_universal/encoder/lodepng.h b/thirdparty/basis_universal/encoder/lodepng.h
new file mode 100644
index 0000000000..476a2061e2
--- /dev/null
+++ b/thirdparty/basis_universal/encoder/lodepng.h
@@ -0,0 +1,1930 @@
+/*
+LodePNG version 20190210
+
+Copyright (c) 2005-2019 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+#ifndef LODEPNG_H
+#define LODEPNG_H
+
+#include <string.h> /*for size_t*/
+
+extern const char* LODEPNG_VERSION_STRING;
+
+/*
+The following #defines are used to create code sections. They can be disabled
+to disable code sections, which can give faster compile time and smaller binary.
+The "NO_COMPILE" defines are designed to be used to pass as defines to the
+compiler command to disable them without modifying this header, e.g.
+-DLODEPNG_NO_COMPILE_ZLIB for gcc.
+In addition to those below, you can also define LODEPNG_NO_COMPILE_CRC to
+allow implementing a custom lodepng_crc32.
+*/
+/*deflate & zlib. If disabled, you must specify alternative zlib functions in
+the custom_zlib field of the compress and decompress settings*/
+#ifndef LODEPNG_NO_COMPILE_ZLIB
+#define LODEPNG_COMPILE_ZLIB
+#endif
+
+/*png encoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_PNG
+#define LODEPNG_COMPILE_PNG
+#endif
+
+/*deflate&zlib decoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_DECODER
+#define LODEPNG_COMPILE_DECODER
+#endif
+
+/*deflate&zlib encoder and png encoder*/
+#ifndef LODEPNG_NO_COMPILE_ENCODER
+#define LODEPNG_COMPILE_ENCODER
+#endif
+
+/*the optional built in harddisk file loading and saving functions*/
+#ifndef LODEPNG_NO_COMPILE_DISK
+#define LODEPNG_COMPILE_DISK
+#endif
+
+/*support for chunks other than IHDR, IDAT, PLTE, tRNS, IEND: ancillary and unknown chunks*/
+#ifndef LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
+#define LODEPNG_COMPILE_ANCILLARY_CHUNKS
+#endif
+
+/*ability to convert error numerical codes to English text string*/
+#ifndef LODEPNG_NO_COMPILE_ERROR_TEXT
+#define LODEPNG_COMPILE_ERROR_TEXT
+#endif
+
+/*Compile the default allocators (C's free, malloc and realloc). If you disable this,
+you can define the functions lodepng_free, lodepng_malloc and lodepng_realloc in your
+source files with custom allocators.*/
+#ifndef LODEPNG_NO_COMPILE_ALLOCATORS
+#define LODEPNG_COMPILE_ALLOCATORS
+#endif
+
+/*compile the C++ version (you can disable the C++ wrapper here even when compiling for C++)*/
+#ifdef __cplusplus
+#ifndef LODEPNG_NO_COMPILE_CPP
+#define LODEPNG_COMPILE_CPP
+#endif
+#endif
+
+#ifdef LODEPNG_COMPILE_CPP
+#include <vector>
+#include <string>
+#endif /*LODEPNG_COMPILE_CPP*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*The PNG color types (also used for raw images). The encoder passes this value on as the color type of the IHDR chunk.*/
+typedef enum LodePNGColorType {
+  LCT_GREY = 0, /*grayscale: 1,2,4,8,16 bit*/
+  LCT_RGB = 2, /*RGB: 8,16 bit*/
+  LCT_PALETTE = 3, /*palette: 1,2,4,8 bit*/
+  LCT_GREY_ALPHA = 4, /*grayscale with alpha: 8,16 bit*/
+  LCT_RGBA = 6 /*RGB with alpha: 8,16 bit*/
+} LodePNGColorType;
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Converts PNG data in memory to raw pixel data.
+out: Output parameter. Pointer to buffer that will contain the raw pixel data.
+     After decoding, its size is w * h * (bytes per pixel) bytes larger than
+     initially. Bytes per pixel depends on colortype and bitdepth.
+     Must be freed after usage with free(*out).
+     Note: for 16-bit per channel colors, uses big endian format like PNG does.
+w: Output parameter. Pointer to width of pixel data.
+h: Output parameter. Pointer to height of pixel data.
+in: Memory buffer with the PNG file.
+insize: size of the in buffer.
+colortype: the desired color type for the raw output image. See explanation on PNG color types.
+bitdepth: the desired bit depth for the raw output image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h,
+                               const unsigned char* in, size_t insize,
+                               LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_memory, but always decodes to 32-bit RGBA raw image*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h,
+                          const unsigned char* in, size_t insize);
+
+/*Same as lodepng_decode_memory, but always decodes to 24-bit RGB raw image*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h,
+                          const unsigned char* in, size_t insize);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load PNG from disk, from file with given name.
+Same as the other decode functions, but instead takes a filename as input.
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h,
+                             const char* filename,
+                             LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_file, but always decodes to 32-bit RGBA raw image.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h,
+                               const char* filename);
+
+/*Same as lodepng_decode_file, but always decodes to 24-bit RGB raw image.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h,
+                               const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Converts raw pixel data into a PNG image in memory. The colortype and bitdepth
+  of the output PNG image cannot be chosen, they are automatically determined
+  by the colortype, bitdepth and content of the input pixel data.
+  Note: for 16-bit per channel colors, needs big endian format like PNG does.
+out: Output parameter. Pointer to buffer that will contain the PNG image data.
+     Must be freed after usage with free(*out).
+outsize: Output parameter. Pointer to the size in bytes of the out buffer.
+image: The raw pixel data to encode. The size of this buffer should be
+       w * h * (bytes per pixel), bytes per pixel depends on colortype and bitdepth.
+w: width of the raw pixel data in pixels.
+h: height of the raw pixel data in pixels.
+colortype: the color type of the raw input image. See explanation on PNG color types.
+bitdepth: the bit depth of the raw input image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize,
+                               const unsigned char* image, unsigned w, unsigned h,
+                               LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_memory, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize,
+                          const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_memory, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize,
+                          const unsigned char* image, unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts raw pixel data into a PNG file on disk.
+Same as the other encode functions, but instead takes a filename as output.
+NOTE: This overwrites existing files without warning!
+*/
+unsigned lodepng_encode_file(const char* filename,
+                             const unsigned char* image, unsigned w, unsigned h,
+                             LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_file, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32_file(const char* filename,
+                               const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_file, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24_file(const char* filename,
+                               const unsigned char* image, unsigned w, unsigned h);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng {
+#ifdef LODEPNG_COMPILE_DECODER
+/*Same as lodepng_decode_memory, but decodes to an std::vector. The colortype
+is the format to output the pixels to. Default is RGBA 8-bit per channel.*/
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                const unsigned char* in, size_t insize,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                const std::vector<unsigned char>& in,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts PNG file from disk to raw pixel data in memory.
+Same as the other decode functions, but instead takes a filename as input.
+*/
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                const std::string& filename,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Same as lodepng_encode_memory, but encodes to an std::vector. colortype
+is that of the raw input data. The output PNG color type will be auto chosen.*/
+unsigned encode(std::vector<unsigned char>& out,
+                const unsigned char* in, unsigned w, unsigned h,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+unsigned encode(std::vector<unsigned char>& out,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts 32-bit RGBA raw pixel data into a PNG file on disk.
+Same as the other encode functions, but instead takes a filename as output.
+NOTE: This overwrites existing files without warning!
+*/
+unsigned encode(const std::string& filename,
+                const unsigned char* in, unsigned w, unsigned h,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+unsigned encode(const std::string& filename,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*Returns an English description of the numerical error code.*/
+const char* lodepng_error_text(unsigned code);
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Settings for zlib decompression*/
+typedef struct LodePNGDecompressSettings LodePNGDecompressSettings;
+struct LodePNGDecompressSettings {
+  /* Check LodePNGDecoderSettings for more ignorable errors such as ignore_crc */
+  unsigned ignore_adler32; /*if 1, continue and don't give an error message if the Adler32 checksum is corrupted*/
+
+  /*use custom zlib decoder instead of built in one (default: null)*/
+  unsigned (*custom_zlib)(unsigned char**, size_t*,
+                          const unsigned char*, size_t,
+                          const LodePNGDecompressSettings*);
+  /*use custom inflate (deflate decoder) instead of built in one (default: null)
+  if custom_zlib is used, custom_inflate is ignored since only the built in
+  zlib function will call custom_inflate*/
+  unsigned (*custom_inflate)(unsigned char**, size_t*,
+                             const unsigned char*, size_t,
+                             const LodePNGDecompressSettings*);
+
+  const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGDecompressSettings lodepng_default_decompress_settings;
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Settings for zlib compression. Tweaking these settings tweaks the balance
+between speed and compression ratio.
+*/
+typedef struct LodePNGCompressSettings LodePNGCompressSettings;
+struct LodePNGCompressSettings /*deflate = compress*/ {
+  /*LZ77 related settings*/
+  unsigned btype; /*the block type for LZ (0, 1, 2 or 3, see zlib standard). Should be 2 for proper compression.*/
+  unsigned use_lz77; /*whether or not to use LZ77. Should be 1 for proper compression.*/
+  unsigned windowsize; /*must be a power of two <= 32768. higher compresses more but is slower. Default value: 2048.*/
+  unsigned minmatch; /*minimum lz77 length. 3 is normally best, 6 can be better for some PNGs. Default: 0*/
+  unsigned nicematch; /*stop searching if >= this length found. Set to 258 for best compression. Default: 128*/
+  unsigned lazymatching; /*use lazy matching: better compression but a bit slower. Default: true*/
+
+  /*use custom zlib encoder instead of built in one (default: null)*/
+  unsigned (*custom_zlib)(unsigned char**, size_t*,
+                          const unsigned char*, size_t,
+                          const LodePNGCompressSettings*);
+  /*use custom deflate encoder instead of built in one (default: null)
+  if custom_zlib is used, custom_deflate is ignored since only the built in
+  zlib function will call custom_deflate*/
+  unsigned (*custom_deflate)(unsigned char**, size_t*,
+                             const unsigned char*, size_t,
+                             const LodePNGCompressSettings*);
+
+  const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGCompressSettings lodepng_default_compress_settings;
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*
+Color mode of an image. Contains all information required to decode the pixel
+bits to RGBA colors. This information is the same as used in the PNG file
+format, and is used both for PNG and raw image data in LodePNG.
+*/
+typedef struct LodePNGColorMode {
+  /*header (IHDR)*/
+  LodePNGColorType colortype; /*color type, see PNG standard or documentation further in this header file*/
+  unsigned bitdepth;  /*bits per sample, see PNG standard or documentation further in this header file*/
+
+  /*
+  palette (PLTE and tRNS)
+
+  Dynamically allocated with the colors of the palette, including alpha.
+  When encoding a PNG, to store your colors in the palette of the LodePNGColorMode, first use
+  lodepng_palette_clear, then for each color use lodepng_palette_add.
+  If you encode an image without alpha with palette, don't forget to put value 255 in each A byte of the palette.
+
+  When decoding, by default you can ignore this palette, since LodePNG already
+  fills the palette colors in the pixels of the raw RGBA output.
+
+  The palette is only supported for color type 3.
+  */
+  unsigned char* palette; /*palette in RGBARGBA... order. Must be either 0 (null), or point to a buffer of size 1024*/
+  size_t palettesize; /*palette size in number of colors (amount of bytes is 4 * palettesize)*/
+
+  /*
+  transparent color key (tRNS)
+
+  This color uses the same bit depth as the bitdepth value in this struct, which can be 1-bit to 16-bit.
+  For grayscale PNGs, r, g and b will all three be set to the same value.
+
+  When decoding, by default you can ignore this information, since LodePNG sets
+  pixels with this key to transparent already in the raw RGBA output.
+
+  The color key is only supported for color types 0 and 2.
+  */
+  unsigned key_defined; /*is a transparent color key given? 0 = false, 1 = true*/
+  unsigned key_r;       /*red/grayscale component of color key*/
+  unsigned key_g;       /*green component of color key*/
+  unsigned key_b;       /*blue component of color key*/
+} LodePNGColorMode;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_color_mode_init(LodePNGColorMode* info);
+void lodepng_color_mode_cleanup(LodePNGColorMode* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source);
+/* Makes a temporary LodePNGColorMode that does not need cleanup (no palette) */
+LodePNGColorMode lodepng_color_mode_make(LodePNGColorType colortype, unsigned bitdepth);
+
+void lodepng_palette_clear(LodePNGColorMode* info);
+/*add 1 color to the palette*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+                             unsigned char r, unsigned char g, unsigned char b, unsigned char a);
+
+/*get the total amount of bits per pixel, based on colortype and bitdepth in the struct*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info);
+/*get the amount of color channels used, based on colortype in the struct.
+If a palette is used, it counts as 1 channel.*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info);
+/*is it a grayscale type? (only colortype 0 or 4)*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info);
+/*has it got an alpha channel? (only colortype 4 or 6)*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info);
+/*has it got a palette? (only colortype 3)*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info);
+/*only returns true if there is a palette and there is a value in the palette with alpha < 255.
+Loops through the palette to check this.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info);
+/*
+Check if the given color info indicates the possibility of having non-opaque pixels in the PNG image.
+Returns true if the image can have translucent or invisible pixels (it may still be opaque if it doesn't use such pixels).
+Returns false if the image can only have opaque pixels.
+In detail, it returns true only if it's a color type with alpha, or has a palette with non-opaque values,
+or if "key_defined" is true.
+*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info);
+/*Returns the byte size of a raw image buffer with given width, height and color mode*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*The information of a tIME chunk in PNG.*/
+typedef struct LodePNGTime {
+  unsigned year;    /*2 bytes used (0-65535)*/
+  unsigned month;   /*1-12*/
+  unsigned day;     /*1-31*/
+  unsigned hour;    /*0-23*/
+  unsigned minute;  /*0-59*/
+  unsigned second;  /*0-60 (to allow for leap seconds)*/
+} LodePNGTime;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Information about the PNG image, except pixels, width and height.*/
+typedef struct LodePNGInfo {
+  /*header (IHDR), palette (PLTE) and transparency (tRNS) chunks*/
+  unsigned compression_method;/*compression method of the original file. Always 0.*/
+  unsigned filter_method;     /*filter method of the original file*/
+  unsigned interlace_method;  /*interlace method of the original file: 0=none, 1=Adam7*/
+  LodePNGColorMode color;     /*color type and bits, palette and transparency of the PNG file*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  /*
+  Suggested background color chunk (bKGD)
+
+  This uses the same color mode and bit depth as the PNG (except no alpha channel),
+  with values truncated to the bit depth in the unsigned integer.
+
+  For grayscale and palette PNGs, the value is stored in background_r. The values
+  in background_g and background_b are then unused.
+
+  So when decoding, you may get these in a different color mode than the one you requested
+  for the raw pixels.
+
+  When encoding with auto_convert, you must use the color model defined in info_png.color for
+  these values. The encoder normally ignores info_png.color when auto_convert is on, but will
+  use it to interpret these values (and convert copies of them to its chosen color model).
+
+  When encoding, avoid setting this to an expensive color, such as a non-gray value
+  when the image is gray, or the compression will be worse since it will be forced to
+  write the PNG with a more expensive color mode (when auto_convert is on).
+
+  The decoder does not use this background color to edit the color of pixels. This is a
+  completely optional metadata feature.
+  */
+  unsigned background_defined; /*is a suggested background color given?*/
+  unsigned background_r;       /*red/gray/palette component of suggested background color*/
+  unsigned background_g;       /*green component of suggested background color*/
+  unsigned background_b;       /*blue component of suggested background color*/
+
+  /*
+  non-international text chunks (tEXt and zTXt)
+
+  The char** arrays each contain num strings. The actual messages are in
+  text_strings, while text_keys are keywords that give a short description what
+  the actual text represents, e.g. Title, Author, Description, or anything else.
+
+  All the string fields below including keys, names and language tags are null terminated.
+  The PNG specification uses null characters for the keys, names and tags, and forbids null
+  characters to appear in the main text which is why we can use null termination everywhere here.
+
+  A keyword is minimum 1 character and maximum 79 characters long. It's
+  discouraged to use a single line length longer than 79 characters for texts.
+
+  Don't allocate these text buffers yourself. Use the init/cleanup functions
+  correctly and use lodepng_add_text and lodepng_clear_text.
+  */
+  size_t text_num; /*the amount of texts in these char** buffers (there may be more texts in itext)*/
+  char** text_keys; /*the keyword of a text chunk (e.g. "Comment")*/
+  char** text_strings; /*the actual text*/
+
+  /*
+  international text chunks (iTXt)
+  Similar to the non-international text chunks, but with additional strings
+  "langtags" and "transkeys".
+  */
+  size_t itext_num; /*the amount of international texts in this PNG*/
+  char** itext_keys; /*the English keyword of the text chunk (e.g. "Comment")*/
+  char** itext_langtags; /*language tag for this text's language, ISO/IEC 646 string, e.g. ISO 639 language tag*/
+  char** itext_transkeys; /*keyword translated to the international language - UTF-8 string*/
+  char** itext_strings; /*the actual international text - UTF-8 string*/
+
+  /*time chunk (tIME)*/
+  unsigned time_defined; /*set to 1 to make the encoder generate a tIME chunk*/
+  LodePNGTime time;
+
+  /*phys chunk (pHYs)*/
+  unsigned phys_defined; /*if 0, there is no pHYs chunk and the values below are undefined; if 1, there is one*/
+  unsigned phys_x; /*pixels per unit in x direction*/
+  unsigned phys_y; /*pixels per unit in y direction*/
+  unsigned phys_unit; /*may be 0 (unknown unit) or 1 (metre)*/
+
+  /*
+  Color profile related chunks: gAMA, cHRM, sRGB, iCCP
+
+  LodePNG does not apply any color conversions on pixels in the encoder or decoder and does not interpret these color
+  profile values. It merely passes on the information. If you wish to use color profiles and convert colors, please
+  use these values with a color management library.
+
+  See the PNG, ICC and sRGB specifications for more information about the meaning of these values.
+  */
+
+  /* gAMA chunk: optional, overridden by sRGB or iCCP if those are present. */
+  unsigned gama_defined; /* Whether a gAMA chunk is present (0 = not present, 1 = present). */
+  unsigned gama_gamma;   /* Gamma exponent times 100000 */
+
+  /* cHRM chunk: optional, overridden by sRGB or iCCP if those are present. */
+  unsigned chrm_defined; /* Whether a cHRM chunk is present (0 = not present, 1 = present). */
+  unsigned chrm_white_x; /* White Point x times 100000 */
+  unsigned chrm_white_y; /* White Point y times 100000 */
+  unsigned chrm_red_x;   /* Red x times 100000 */
+  unsigned chrm_red_y;   /* Red y times 100000 */
+  unsigned chrm_green_x; /* Green x times 100000 */
+  unsigned chrm_green_y; /* Green y times 100000 */
+  unsigned chrm_blue_x;  /* Blue x times 100000 */
+  unsigned chrm_blue_y;  /* Blue y times 100000 */
+
+  /*
+  sRGB chunk: optional. May not appear at the same time as iCCP.
+  If gAMA is also present gAMA must contain value 45455.
+  If cHRM is also present cHRM must contain respectively 31270,32900,64000,33000,30000,60000,15000,6000.
+  */
+  unsigned srgb_defined; /* Whether an sRGB chunk is present (0 = not present, 1 = present). */
+  unsigned srgb_intent;  /* Rendering intent: 0=perceptual, 1=rel. colorimetric, 2=saturation, 3=abs. colorimetric */
+
+  /*
+  iCCP chunk: optional. May not appear at the same time as sRGB.
+
+  LodePNG does not parse or use the ICC profile (except its color space header field for an edge case), a
+  separate library to handle the ICC data (not included in LodePNG) format is needed to use it for color
+  management and conversions.
+
+  For encoding, if iCCP is present, gAMA and cHRM are recommended to be added as well with values that match the ICC
+  profile as closely as possible, if you wish to do this you should provide the correct values for gAMA and cHRM and
+  enable their '_defined' flags since LodePNG will not automatically compute them from the ICC profile.
+
+  For encoding, the ICC profile is required by the PNG specification to be an "RGB" profile for non-gray
+  PNG color types and a "GRAY" profile for gray PNG color types. If you disable auto_convert, you must ensure
+  the ICC profile type matches your requested color type, else the encoder gives an error. If auto_convert is
+  enabled (the default), and the ICC profile is not a good match for the pixel data, this will result in an encoder
+  error if the pixel data has non-gray pixels for a GRAY profile, or a silent less-optimal compression of the pixel
+  data if the pixels could be encoded as grayscale but the ICC profile is RGB.
+
+  To avoid this do not set an ICC profile in the image unless there is a good reason for it, and when doing so
+  make sure you compute it carefully to avoid the above problems.
+  */
+  unsigned iccp_defined;      /* Whether an iCCP chunk is present (0 = not present, 1 = present). */
+  char* iccp_name;            /* Null terminated string with profile name, 1-79 bytes */
+  /*
+  The ICC profile in iccp_profile_size bytes.
+  Don't allocate this buffer yourself. Use the init/cleanup functions
+  correctly and use lodepng_set_icc and lodepng_clear_icc.
+  */
+  unsigned char* iccp_profile;
+  unsigned iccp_profile_size; /* The size of iccp_profile in bytes */
+
+  /* End of color profile related chunks */
+
+
+  /*
+  unknown chunks: chunks not known by LodePNG, passed on byte for byte.
+
+  There are 3 buffers, one for each position in the PNG where unknown chunks can appear.
+  Each buffer contains all unknown chunks for that position consecutively.
+  The 3 positions are:
+  0: between IHDR and PLTE, 1: between PLTE and IDAT, 2: between IDAT and IEND.
+
+  For encoding, do not store critical chunks or known chunks that are enabled with a "_defined" flag
+  above in here, since the encoder will blindly follow this and could then encode an invalid PNG file
+  (such as one with two IHDR chunks or the disallowed combination of sRGB with iCCP). But do use
+  this if you wish to store an ancillary chunk that is not supported by LodePNG (such as sPLT or hIST),
+  or any non-standard PNG chunk.
+
+  Do not allocate or traverse this data yourself. Use the chunk traversing functions declared
+  later, such as lodepng_chunk_next and lodepng_chunk_append, to read/write this struct.
+  */
+  unsigned char* unknown_chunks_data[3];
+  size_t unknown_chunks_size[3]; /*size in bytes of the unknown chunks, given for protection*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGInfo;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_info_init(LodePNGInfo* info);
+void lodepng_info_cleanup(LodePNGInfo* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str); /*push back both texts at once*/
+void lodepng_clear_text(LodePNGInfo* info); /*use this to clear the texts again after you filled them in*/
+
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+                           const char* transkey, const char* str); /*push back the 4 texts of 1 chunk at once*/
+void lodepng_clear_itext(LodePNGInfo* info); /*use this to clear the itexts again after you filled them in*/
+
+/*replaces if exists*/
+unsigned lodepng_set_icc(LodePNGInfo* info, const char* name, const unsigned char* profile, unsigned profile_size);
+void lodepng_clear_icc(LodePNGInfo* info); /*use this to clear the ICC profile again after you set it*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Converts raw buffer from one color type to another color type, based on
+LodePNGColorMode structs to describe the input and output color type.
+See the reference manual at the end of this header file to see which color conversions are supported.
+return value = LodePNG error code (0 if all went ok, an error if the conversion isn't supported)
+The out buffer must have size (w * h * bpp + 7) / 8, where bpp is the bits per pixel
+of the output color type (lodepng_get_bpp).
+For < 8 bpp images, there should not be padding bits at the end of scanlines.
+For 16-bit per channel colors, uses big endian format like PNG does.
+Return value is LodePNG error code
+*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+                         const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+                         unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Settings for the decoder. This contains settings for the PNG and the Zlib
+decoder, but not the Info settings from the Info structs.
+*/
+typedef struct LodePNGDecoderSettings {
+  LodePNGDecompressSettings zlibsettings; /*in here is the setting to ignore Adler32 checksums*/
+
+  /* Check LodePNGDecompressSettings for more ignorable errors such as ignore_adler32 */
+  unsigned ignore_crc; /*ignore CRC checksums*/
+  unsigned ignore_critical; /*ignore unknown critical chunks*/
+  unsigned ignore_end; /*ignore issues at end of file if possible (missing IEND chunk, too large chunk, ...)*/
+  /* TODO: make a system involving warnings with levels and a strict mode instead. Other potentially recoverable
+     errors: srgb rendering intent value, size of content of ancillary chunks, more than 79 characters for some
+     strings, placement/combination rules for ancillary chunks, crc of unknown chunks, allowed characters
+     in string keys, etc... */
+
+  unsigned color_convert; /*whether to convert the PNG to the color type you want. Default: yes*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  unsigned read_text_chunks; /*if false but remember_unknown_chunks is true, text chunks are stored in the unknown chunks*/
+  /*store all bytes from unknown chunks in the LodePNGInfo (off by default, useful for a png editor)*/
+  unsigned remember_unknown_chunks;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGDecoderSettings;
+
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Strategy the encoder uses to choose the PNG filter type of each scanline (see filter_strategy in LodePNGEncoderSettings)*/
+typedef enum LodePNGFilterStrategy {
+  /*every filter at zero*/
+  LFS_ZERO,
+  /*Use filter that gives minimum sum, as described in the official PNG filter heuristic.*/
+  LFS_MINSUM,
+  /*Use the filter type that gives smallest Shannon entropy for this scanline. Depending
+  on the image, this is better or worse than minsum.*/
+  LFS_ENTROPY,
+  /*
+  Brute-force-search PNG filters by compressing each filter for each scanline.
+  Experimental, very slow, and only rarely gives better compression than MINSUM.
+  */
+  LFS_BRUTE_FORCE,
+  /*use predefined_filters buffer: you specify the filter type for each scanline*/
+  LFS_PREDEFINED
+} LodePNGFilterStrategy;
+
+/*Gives characteristics about the integer RGBA colors of the image (count, alpha channel usage, bit depth, ...),
+which helps decide which color model to use for encoding.
+Used internally by default if "auto_convert" is enabled. Public because it's useful for custom algorithms.
+NOTE: This is not related to the ICC color profile, search "iccp_profile" instead to find the ICC/chromaticity/...
+fields in this header file.*/
+typedef struct LodePNGColorProfile {
+  unsigned colored; /*not grayscale*/
+  unsigned key; /*image is not opaque and color key is possible instead of full alpha*/
+  unsigned short key_r; /*key values, always as 16-bit, in 8-bit case the byte is duplicated, e.g. 65535 means 255*/
+  unsigned short key_g;
+  unsigned short key_b;
+  unsigned alpha; /*image is not opaque and alpha channel or alpha palette required*/
+  unsigned numcolors; /*amount of colors, up to 257. Not valid if bits == 16.*/
+  unsigned char palette[1024]; /*Remembers up to the first 256 RGBA colors, in no particular order*/
+  unsigned bits; /*bits per channel (not for palette). 1,2 or 4 for grayscale only. 16 if 16-bit per channel required.*/
+  size_t numpixels;
+} LodePNGColorProfile;
+
+void lodepng_color_profile_init(LodePNGColorProfile* profile);
+
+/*Get a LodePNGColorProfile of the image. The profile must already have been inited.
+NOTE: This is not related to the ICC color profile, search "iccp_profile" instead to find the ICC/chromaticity/...
+fields in this header file.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+                                   const unsigned char* image, unsigned w, unsigned h,
+                                   const LodePNGColorMode* mode_in);
+/*The function LodePNG uses internally to decide the PNG color with auto_convert.
+Chooses an optimal color model, e.g. gray if only gray pixels, palette if < 256 colors, ...*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+                                   const unsigned char* image, unsigned w, unsigned h,
+                                   const LodePNGColorMode* mode_in);
+
+/*Settings for the encoder.*/
+typedef struct LodePNGEncoderSettings {
+  LodePNGCompressSettings zlibsettings; /*settings for the zlib encoder, such as window size, ...*/
+
+  unsigned auto_convert; /*automatically choose output PNG color type. Default: true*/
+
+  /*If true, follows the official PNG heuristic: if the PNG uses a palette or lower than
+  8 bit depth, set all filters to zero. Otherwise use the filter_strategy. Note that to
+  completely follow the official PNG heuristic, filter_palette_zero must be true and
+  filter_strategy must be LFS_MINSUM*/
+  unsigned filter_palette_zero;
+  /*Which filter strategy to use when not using zeroes due to filter_palette_zero.
+  Set filter_palette_zero to 0 to ensure always using your chosen strategy. Default: LFS_MINSUM*/
+  LodePNGFilterStrategy filter_strategy;
+  /*used if filter_strategy is LFS_PREDEFINED. In that case, this must point to a buffer with
+  the same length as the amount of scanlines in the image, and each value must be a PNG filter type (0-4).
+  You have to cleanup this buffer, LodePNG will never free it. Don't forget that filter_palette_zero
+  must be set to 0 to ensure this is also used on palette or low bitdepth images.*/
+  const unsigned char* predefined_filters;
+
+  /*force creating a PLTE chunk if colortype is 2 or 6 (= a suggested palette).
+  If colortype is 3, PLTE is _always_ created.*/
+  unsigned force_palette;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  /*add LodePNG identifier and version as a text chunk, for debugging*/
+  unsigned add_id;
+  /*encode text chunks as zTXt chunks instead of tEXt chunks, and use compression in iTXt chunks*/
+  unsigned text_compression;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGEncoderSettings;
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+/*The settings, state and information for extended encoding and decoding.*/
+typedef struct LodePNGState {
+#ifdef LODEPNG_COMPILE_DECODER
+  LodePNGDecoderSettings decoder; /*the decoding settings*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+  LodePNGEncoderSettings encoder; /*the encoding settings*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+  LodePNGColorMode info_raw; /*specifies the format in which you would like to get the raw pixel buffer*/
+  LodePNGInfo info_png; /*info of the PNG image obtained after decoding*/
+  unsigned error; /*LodePNG error code; NOTE(review): presumably that of the last operation using this state - confirm*/
+#ifdef LODEPNG_COMPILE_CPP
+  /* For the lodepng::State subclass. */
+  virtual ~LodePNGState(){}
+#endif
+} LodePNGState;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_state_init(LodePNGState* state);
+void lodepng_state_cleanup(LodePNGState* state);
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source);
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Same as lodepng_decode_memory, but uses a LodePNGState to allow custom settings and
+getting much more information about the PNG image and color mode.
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+                        LodePNGState* state,
+                        const unsigned char* in, size_t insize);
+
+/*
+Read the PNG header, but not the actual data. This returns only the information
+that is in the IHDR chunk of the PNG, such as width, height and color type. The
+information is placed in the info_png field of the LodePNGState.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h,
+                         LodePNGState* state,
+                         const unsigned char* in, size_t insize);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/*
+Reads one metadata chunk (other than IHDR) of the PNG file and outputs what it
+read in the state. Returns error code on failure.
+Use lodepng_inspect first with a new state, then e.g. lodepng_chunk_find_const
+to find the desired chunk type, and if non null use lodepng_inspect_chunk (with
+chunk_pointer - start_of_file as pos).
+Supports most metadata chunks from the PNG standard (gAMA, bKGD, tEXt, ...).
+Ignores unsupported, unknown, non-metadata or IHDR chunks (without error).
+Requirements: &in[pos] must point to start of a chunk, must use regular
+lodepng_inspect first since format of most other chunks depends on IHDR, and if
+there is a PLTE chunk, that one must be inspected before tRNS or bKGD.
+*/
+unsigned lodepng_inspect_chunk(LodePNGState* state, size_t pos,
+                               const unsigned char* in, size_t insize);
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*This function allocates the out buffer with standard malloc and stores the size in *outsize.*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+                        const unsigned char* image, unsigned w, unsigned h,
+                        LodePNGState* state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*
+The lodepng_chunk functions are normally not needed, except to traverse the
+unknown chunks stored in the LodePNGInfo struct, or add new ones to it.
+It also allows traversing the chunks of an encoded PNG file yourself.
+
+The chunk pointer always points to the beginning of the chunk itself, that is
+the first byte of the 4 length bytes.
+
+In the PNG file format, chunks have the following format:
+-4 bytes length: length of the data of the chunk in bytes (chunk itself is 12 bytes longer)
+-4 bytes chunk type (ASCII a-z,A-Z only, see below)
+-length bytes of data (may be 0 bytes if length was 0)
+-4 bytes of CRC, computed on chunk name + data
+
+The first chunk starts at the 8th byte of the PNG file, the entire rest of the file
+consists of concatenated chunks with the above format.
+
+PNG standard chunk ASCII naming conventions:
+-First byte: uppercase = critical, lowercase = ancillary
+-Second byte: uppercase = public, lowercase = private
+-Third byte: must be uppercase
+-Fourth byte: uppercase = unsafe to copy, lowercase = safe to copy
+*/
+
+/*
+Gets the length of the data of the chunk. Total chunk length has 12 bytes more.
+There must be at least 4 bytes to read from. If the result value is too large,
+it may be corrupt data.
+*/
+unsigned lodepng_chunk_length(const unsigned char* chunk);
+
+/*puts the 4-byte type in null terminated string*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk);
+
+/*check if the type is the given type*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type);
+
+/*0: it's one of the critical chunk types, 1: it's an ancillary chunk (see PNG standard)*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk);
+
+/*0: public, 1: private (see PNG standard)*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk);
+
+/*0: the chunk is unsafe to copy, 1: the chunk is safe to copy (see PNG standard)*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk);
+
+/*get pointer to the data of the chunk, where the input points to the header of the chunk*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk);
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk);
+
+/*returns 0 if the crc is correct, 1 if it's incorrect (0 for OK as usual!)*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk);
+
+/*generates the correct CRC from the data and puts it in the last 4 bytes of the chunk*/
+void lodepng_chunk_generate_crc(unsigned char* chunk);
+
+/*
+Iterate to next chunks, allows iterating through all chunks of the PNG file.
+Input must be at the beginning of a chunk (result of a previous lodepng_chunk_next call,
+or the 8th byte of a PNG file which always has the first chunk), or alternatively may
+point to the first byte of the PNG file (which is not a chunk but the magic header, the
+function will then skip over it and return the first real chunk).
+Expects at least 8 readable bytes of memory in the input pointer.
+Will output pointer to the start of the next chunk or the end of the file if there
+is no more chunk after this. Start this process at the 8th byte of the PNG file.
+In a non-corrupt PNG file, the last chunk should have name "IEND".
+*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk);
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk);
+
+/*Finds the first chunk with the given type in the range [chunk, end), or returns NULL if not found.*/
+unsigned char* lodepng_chunk_find(unsigned char* chunk, const unsigned char* end, const char type[5]);
+const unsigned char* lodepng_chunk_find_const(const unsigned char* chunk, const unsigned char* end, const char type[5]);
+
+/*
+Appends chunk to the data in out. The given chunk should already have its chunk header.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk);
+
+/*
+Appends new chunk to out. The chunk to append is given by giving its length, type
+and data separately. The type is a 4-letter string.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+                              const char* type, const unsigned char* data);
+
+
+/*Calculate CRC32 of buffer*/
+unsigned lodepng_crc32(const unsigned char* buf, size_t len);
+#endif /*LODEPNG_COMPILE_PNG*/
+
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*
+This zlib part can be used independently to zlib compress and decompress a
+buffer. It cannot be used to create gzip files however, and it only supports the
+part of zlib that is required for PNG, it does not support dictionaries.
+*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Inflate a buffer. Inflate is the decompression step of deflate. Out buffer must be freed after use.*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+                         const unsigned char* in, size_t insize,
+                         const LodePNGDecompressSettings* settings);
+
+/*
+Decompresses Zlib data. Reallocates the out buffer and appends the data. The
+data must be according to the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize,
+                                 const unsigned char* in, size_t insize,
+                                 const LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Compresses data with Zlib. Reallocates the out buffer and appends the data.
+Zlib adds a small header and trailer around the deflate data.
+The data is output in the format of the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize,
+                               const unsigned char* in, size_t insize,
+                               const LodePNGCompressSettings* settings);
+
+/*
+Find length-limited Huffman code for given frequencies. This function is in the
+public interface only for tests, it's used internally by lodepng_deflate.
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+                                      size_t numcodes, unsigned maxbitlen);
+
+/*Compress a buffer with deflate. See RFC 1951. Out buffer must be freed after use.*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+                         const unsigned char* in, size_t insize,
+                         const LodePNGCompressSettings* settings);
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load a file from disk into buffer. The function allocates the out buffer, and
+after usage you should free it.
+out: output parameter, contains pointer to loaded buffer.
+outsize: output parameter, size of the allocated out buffer
+filename: the path to the file to load
+return value: error code (0 means ok)
+*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename);
+
+/*
+Save a file from buffer to disk. Warning, if it exists, this function overwrites
+the file without warning!
+buffer: the buffer to write
+buffersize: size of the buffer to write
+filename: the path to the file to save to
+return value: error code (0 means ok)
+*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+
+#ifdef LODEPNG_COMPILE_CPP
+/* The LodePNG C++ wrapper uses std::vectors instead of manually allocated memory buffers. */
+namespace lodepng {
+#ifdef LODEPNG_COMPILE_PNG
+class State : public LodePNGState { /* C++ counterpart of the C struct LodePNGState, adding constructors, a destructor and assignment */
+  public:
+    State(); /* default constructor */
+    State(const State& other); /* copy constructor */
+    virtual ~State(); /* virtual destructor */
+    State& operator=(const State& other); /* copy assignment */
+};
+
+#ifdef LODEPNG_COMPILE_DECODER
+/* Same as other lodepng::decode, but using a State for more settings and information. */
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                State& state,
+                const unsigned char* in, size_t insize);
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                State& state,
+                const std::vector<unsigned char>& in);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/* Same as other lodepng::encode, but using a State for more settings and information. */
+unsigned encode(std::vector<unsigned char>& out,
+                const unsigned char* in, unsigned w, unsigned h,
+                State& state);
+unsigned encode(std::vector<unsigned char>& out,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                State& state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load a file from disk into an std::vector.
+return value: error code (0 means ok)
+*/
+unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename);
+
+/*
+Save the binary data in an std::vector to a file on disk. The file is overwritten
+without warning.
+*/
+unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_PNG */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+/* Zlib-decompress an unsigned char buffer */
+unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+                    const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+
+/* Zlib-decompress an std::vector */
+unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+                    const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/* Zlib-compress an unsigned char buffer */
+unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+                  const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+
+/* Zlib-compress an std::vector */
+unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+                  const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+
+/*
+TODO:
+[.] test if there are no memory leaks or security exploits - done a lot but needs to be checked often
+[.] check compatibility with various compilers  - done but needs to be redone for every newer version
+[X] converting color to 16-bit per channel types
+[X] support color profile chunk types (but never let them touch RGB values by default)
+[ ] support all public PNG chunk types (almost done except sBIT, sPLT and hIST)
+[ ] make sure encoder generates no chunks with size > (2^31)-1
+[ ] partial decoding (stream processing)
+[X] let the "isFullyOpaque" function check color keys and transparent palettes too
+[X] better name for the variables "codes", "codesD", "codelengthcodes", "clcl" and "lldl"
+[ ] allow treating some errors like warnings, when image is recoverable (e.g. 69, 57, 58)
+[ ] make warnings like: oob palette, checksum fail, data after iend, wrong/unknown crit chunk, no null terminator in text, ...
+[ ] error messages with line numbers (and version)
+[ ] errors in state instead of as return code?
+[ ] new errors/warnings like suspiciously big decompressed ztxt or iccp chunk
+[ ] let the C++ wrapper catch exceptions coming from the standard library and return LodePNG error codes
+[ ] allow user to provide custom color conversion functions, e.g. for premultiplied alpha, padding bits or not, ...
+[ ] allow user to give data (void*) to custom allocator
+[ ] provide alternatives for C library functions not present on some platforms (memcpy, ...)
+[ ] rename "grey" to "gray" everywhere since "color" also uses US spelling (keep "grey" copies for backwards compatibility)
+*/
+
+#endif /*LODEPNG_H inclusion guard*/
+
+/*
+LodePNG Documentation
+---------------------
+
+0. table of contents
+--------------------
+
+  1. about
+   1.1. supported features
+   1.2. features not supported
+  2. C and C++ version
+  3. security
+  4. decoding
+  5. encoding
+  6. color conversions
+    6.1. PNG color types
+    6.2. color conversions
+    6.3. padding bits
+    6.4. A note about 16-bits per channel and endianness
+  7. error values
+  8. chunks and PNG editing
+  9. compiler support
+  10. examples
+   10.1. decoder C++ example
+   10.2. decoder C example
+  11. state settings reference
+  12. changes
+  13. contact information
+
+
+1. about
+--------
+
+PNG is a file format to store raster images losslessly with good compression,
+supporting different color types and alpha channel.
+
+LodePNG is a PNG codec according to the Portable Network Graphics (PNG)
+Specification (Second Edition) - W3C Recommendation 10 November 2003.
+
+The specifications used are:
+
+*) Portable Network Graphics (PNG) Specification (Second Edition):
+     http://www.w3.org/TR/2003/REC-PNG-20031110
+*) RFC 1950 ZLIB Compressed Data Format version 3.3:
+     http://www.gzip.org/zlib/rfc-zlib.html
+*) RFC 1951 DEFLATE Compressed Data Format Specification ver 1.3:
+     http://www.gzip.org/zlib/rfc-deflate.html
+
+The most recent version of LodePNG can currently be found at
+http://lodev.org/lodepng/
+
+LodePNG works both in C (ISO C90) and C++, with a C++ wrapper that adds
+extra functionality.
+
+LodePNG consists of two files:
+-lodepng.h: the header file for both C and C++
+-lodepng.c(pp): give it the name lodepng.c or lodepng.cpp (or .cc) depending on your usage
+
+If you want to start using LodePNG right away without reading this doc, get the
+examples from the LodePNG website to see how to use it in code, or check the
+smaller examples in chapter 10 here.
+
+LodePNG is simple but only supports the basic requirements. To achieve
+simplicity, the following design choices were made: There are no dependencies
+on any external library. There are functions to decode and encode a PNG with
+a single function call, and extended versions of these functions taking a
+LodePNGState struct allowing to specify or get more information. By default
+the colors of the raw image are always RGB or RGBA, no matter what color type
+the PNG file uses. To read and write files, there are simple functions to
+convert the files to/from buffers in memory.
+
+This all makes LodePNG suitable for loading textures in games, demos and small
+programs, ... It's less suitable for full fledged image editors, loading PNGs
+over network (it requires all the image data to be available before decoding can
+begin), life-critical systems, ...
+
+1.1. supported features
+-----------------------
+
+The following features are supported by the decoder and encoder:
+
+*) decoding of PNGs with any color type, bit depth and interlace mode, to a 24- or 32-bit color raw image,
+   or the same color type as the PNG
+*) encoding of PNGs, from any raw image to 24- or 32-bit color, or the same color type as the raw image
+*) Adam7 interlace and deinterlace for any color type
+*) loading the image from harddisk or decoding it from a buffer from other sources than harddisk
+*) support for alpha channels, including RGBA color model, translucent palettes and color keying
+*) zlib decompression (inflate)
+*) zlib compression (deflate)
+*) CRC32 and ADLER32 checksums
+*) colorimetric color profile conversions: currently experimentally available in lodepng_util.cpp only,
+   plus alternatively ability to pass on chroma/gamma/ICC profile information to other color management system.
+*) handling of unknown chunks, allowing making a PNG editor that stores custom and unknown chunks.
+*) the following chunks are supported by both encoder and decoder:
+    IHDR: header information
+    PLTE: color palette
+    IDAT: pixel data
+    IEND: the final chunk
+    tRNS: transparency for palettized images
+    tEXt: textual information
+    zTXt: compressed textual information
+    iTXt: international textual information
+    bKGD: suggested background color
+    pHYs: physical dimensions
+    tIME: modification time
+    cHRM: RGB chromaticities
+    gAMA: RGB gamma correction
+    iCCP: ICC color profile
+    sRGB: rendering intent
+
+1.2. features not supported
+---------------------------
+
+The following features are _not_ supported:
+
+*) some features needed to make a conformant PNG-Editor might be still missing.
+*) partial loading/stream processing. All data must be available and is processed in one call.
+*) The following public chunks are not (yet) supported but treated as unknown chunks by LodePNG:
+    sBIT
+    hIST
+    sPLT
+
+
+2. C and C++ version
+--------------------
+
+The C version uses buffers allocated with alloc that you need to free()
+yourself. You need to use init and cleanup functions for each struct whenever
+using a struct from the C version to avoid exploits and memory leaks.
+
+The C++ version has extra functions with std::vectors in the interface and the
+lodepng::State class which is a LodePNGState with constructor and destructor.
+
+These files work without modification for both C and C++ compilers because all
+the additional C++ code is in "#ifdef __cplusplus" blocks that make C-compilers
+ignore it, and the C code is made to compile both with strict ISO C90 and C++.
+
+To use the C++ version, you need to rename the source file to lodepng.cpp
+(instead of lodepng.c), and compile it with a C++ compiler.
+
+To use the C version, you need to rename the source file to lodepng.c (instead
+of lodepng.cpp), and compile it with a C compiler.
+
+
+3. Security
+-----------
+
+Even if carefully designed, it's always possible that LodePNG contains possible
+exploits. If you discover one, please let me know, and it will be fixed.
+
+When using LodePNG, care has to be taken with the C version of LodePNG, as well
+as the C-style structs when working with C++. The following conventions are used
+for all C-style structs:
+
+-if a struct has a corresponding init function, always call the init function when making a new one
+-if a struct has a corresponding cleanup function, call it before the struct disappears to avoid memory leaks
+-if a struct has a corresponding copy function, use the copy function instead of "=".
+ The destination must also be inited already.
+
+
+4. Decoding
+-----------
+
+Decoding converts a PNG compressed image to a raw pixel buffer.
+
+Most documentation on using the decoder is at its declarations in the header
+above. For C, simple decoding can be done with functions such as
+lodepng_decode32, and more advanced decoding can be done with the struct
+LodePNGState and lodepng_decode. For C++, all decoding can be done with the
+various lodepng::decode functions, and lodepng::State can be used for advanced
+features.
+
+When using the LodePNGState, it uses the following fields for decoding:
+*) LodePNGInfo info_png: it stores extra information about the PNG (the input) in here
+*) LodePNGColorMode info_raw: here you can say what color mode of the raw image (the output) you want to get
+*) LodePNGDecoderSettings decoder: you can specify a few extra settings for the decoder to use
+
+LodePNGInfo info_png
+--------------------
+
+After decoding, this contains extra information of the PNG image, except the actual
+pixels, width and height because these are already gotten directly from the decoder
+functions.
+
+It contains for example the original color type of the PNG image, text comments,
+suggested background color, etc... More details about the LodePNGInfo struct are
+at its declaration documentation.
+
+LodePNGColorMode info_raw
+-------------------------
+
+When decoding, here you can specify which color type you want
+the resulting raw image to be. If this is different from the colortype of the
+PNG, then the decoder will automatically convert the result. This conversion
+always works, except if you want it to convert a color PNG to grayscale or to
+a palette with missing colors.
+
+By default, 32-bit color is used for the result.
+
+LodePNGDecoderSettings decoder
+------------------------------
+
+The settings can be used to ignore the errors created by invalid CRC and Adler32
+chunks, and to disable the decoding of tEXt chunks.
+
+There's also a setting color_convert, true by default. If false, no conversion
+is done, the resulting data will be as it was in the PNG (after decompression)
+and you'll have to puzzle the colors of the pixels together yourself using the
+color type information in the LodePNGInfo.
+
+
+5. Encoding
+-----------
+
+Encoding converts a raw pixel buffer to a PNG compressed image.
+
+Most documentation on using the encoder is at its declarations in the header
+above. For C, simple encoding can be done with functions such as
+lodepng_encode32, and more advanced encoding can be done with the struct
+LodePNGState and lodepng_encode. For C++, all encoding can be done with the
+various lodepng::encode functions, and lodepng::State can be used for advanced
+features.
+
+Like the decoder, the encoder can also give errors. However it gives fewer errors
+since the encoder input is trusted, the decoder input (a PNG image that could
+be forged by anyone) is not trusted.
+
+When using the LodePNGState, it uses the following fields for encoding:
+*) LodePNGInfo info_png: here you specify how you want the PNG (the output) to be.
+*) LodePNGColorMode info_raw: here you say what color type the raw image (the input) has
+*) LodePNGEncoderSettings encoder: you can specify a few settings for the encoder to use
+
+LodePNGInfo info_png
+--------------------
+
+When encoding, you use this the opposite way as when decoding: for encoding,
+you fill in the values you want the PNG to have before encoding. By default it's
+not needed to specify a color type for the PNG since it's automatically chosen,
+but it's possible to choose it yourself given the right settings.
+
+The encoder will not always exactly match the LodePNGInfo struct you give,
+it tries as close as possible. Some things are ignored by the encoder. The
+encoder uses, for example, the following settings from it when applicable:
+colortype and bitdepth, text chunks, time chunk, the color key, the palette, the
+background color, the interlace method, unknown chunks, ...
+
+When encoding to a PNG with colortype 3, the encoder will generate a PLTE chunk.
+If the palette contains any colors for which the alpha channel is not 255 (so
+there are translucent colors in the palette), it'll add a tRNS chunk.
+
+LodePNGColorMode info_raw
+-------------------------
+
+You specify the color type of the raw image that you give to the input here,
+including a possible transparent color key and palette you happen to be using in
+your raw image data.
+
+By default, 32-bit color is assumed, meaning your input has to be in RGBA
+format with 4 bytes (unsigned chars) per pixel.
+
+LodePNGEncoderSettings encoder
+------------------------------
+
+The following settings are supported (some are in sub-structs):
+*) auto_convert: when this option is enabled, the encoder will
+automatically choose the smallest possible color mode (including color key) that
+can encode the colors of all pixels without information loss.
+*) btype: the block type for LZ77. 0 = uncompressed, 1 = fixed huffman tree,
+   2 = dynamic huffman tree (best compression). Should be 2 for proper
+   compression.
+*) use_lz77: whether or not to use LZ77 for compressed block types. Should be
+   true for proper compression.
+*) windowsize: the window size used by the LZ77 encoder (1 - 32768). Has value
+   2048 by default, but can be set to 32768 for better, but slow, compression.
+*) force_palette: if colortype is 2 or 6, you can make the encoder write a PLTE
+   chunk if force_palette is true. This can be used as a suggested palette to convert
+   to by viewers that don't support more than 256 colors (if those still exist)
+*) add_id: add text chunk "Encoder: LodePNG <version>" to the image.
+*) text_compression: default 1. If 1, it'll store texts as zTXt instead of tEXt chunks.
+  zTXt chunks use zlib compression on the text. This gives a smaller result on
+  large texts but a larger result on small texts (such as a single program name).
+  It's all tEXt or all zTXt though, there's no separate setting per text yet.
+
+
+6. color conversions
+--------------------
+
+An important thing to note about LodePNG, is that the color type of the PNG, and
+the color type of the raw image, are completely independent. By default, when
+you decode a PNG, you get the result as a raw image in the color type you want,
+no matter whether the PNG was encoded with a palette, grayscale or RGBA color.
+And if you encode an image, by default LodePNG will automatically choose the PNG
+color type that gives good compression based on the values of colors and amount
+of colors in the image. It can be configured to let you control it instead as
+well, though.
+
+To be able to do this, LodePNG does conversions from one color mode to another.
+It can convert from almost any color type to any other color type, except the
+following conversions: RGB to grayscale is not supported, and converting to a
+palette when the palette doesn't have a required color is not supported. This is
+not supported on purpose: this is information loss which requires a color
+reduction algorithm that is beyond the scope of a PNG encoder (yes, RGB to gray
+is easy, but there are multiple ways if you want to give some channels more
+weight).
+
+By default, when decoding, you get the raw image in 32-bit RGBA or 24-bit RGB
+color, no matter what color type the PNG has. And by default when encoding,
+LodePNG automatically picks the best color model for the output PNG, and expects
+the input image to be 32-bit RGBA or 24-bit RGB. So, unless you want to control
+the color format of the images yourself, you can skip this chapter.
+
+6.1. PNG color types
+--------------------
+
+A PNG image can have many color types, ranging from 1-bit color to 64-bit color,
+as well as palettized color modes. After the zlib decompression and unfiltering
+in the PNG image is done, the raw pixel data will have that color type and thus
+a certain amount of bits per pixel. If you want the output raw image after
+decoding to have another color type, a conversion is done by LodePNG.
+
+The PNG specification gives the following color types:
+
+0: grayscale, bit depths 1, 2, 4, 8, 16
+2: RGB, bit depths 8 and 16
+3: palette, bit depths 1, 2, 4 and 8
+4: grayscale with alpha, bit depths 8 and 16
+6: RGBA, bit depths 8 and 16
+
+Bit depth is the amount of bits per pixel per color channel. So the total amount
+of bits per pixel is: amount of channels * bitdepth.
+
+6.2. color conversions
+----------------------
+
+As explained in the sections about the encoder and decoder, you can specify
+color types and bit depths in info_png and info_raw to change the default
+behaviour.
+
+If, when decoding, you want the raw image to be something else than the default,
+you need to set the color type and bit depth you want in the LodePNGColorMode,
+or the parameters colortype and bitdepth of the simple decoding function.
+
+If, when encoding, you use another color type than the default in the raw input
+image, you need to specify its color type and bit depth in the LodePNGColorMode
+of the raw image, or use the parameters colortype and bitdepth of the simple
+encoding function.
+
+If, when encoding, you don't want LodePNG to choose the output PNG color type
+but control it yourself, you need to set auto_convert in the encoder settings
+to false, and specify the color type you want in the LodePNGInfo of the
+encoder (including palette: it can generate a palette if auto_convert is true,
+otherwise not).
+
+If the input and output color type differ (whether user chosen or auto chosen),
+LodePNG will do a color conversion, which follows the rules below, and may
+sometimes result in an error.
+
+To avoid some confusion:
+-the decoder converts from PNG to raw image
+-the encoder converts from raw image to PNG
+-the colortype and bitdepth in LodePNGColorMode info_raw, are those of the raw image
+-the colortype and bitdepth in the color field of LodePNGInfo info_png, are those of the PNG
+-when encoding, the color type in LodePNGInfo is ignored if auto_convert
+ is enabled, it is automatically generated instead
+-when decoding, the color type in LodePNGInfo is set by the decoder to that of the original
+ PNG image, but it can be ignored since the raw image has the color type you requested instead
+-if the color type of the LodePNGColorMode and PNG image aren't the same, a conversion
+ between the color types is done if the color types are supported. If it is not
+ supported, an error is returned. If the types are the same, no conversion is done.
+-even though some conversions aren't supported, LodePNG supports loading PNGs from any
+ colortype and saving PNGs to any colortype, sometimes it just requires preparing
+ the raw image correctly before encoding.
+-both encoder and decoder use the same color converter.
+
+The function lodepng_convert does the color conversion. It is available in the
+interface but normally isn't needed since the encoder and decoder already call
+it.
+
+Non supported color conversions:
+-color to grayscale when non-gray pixels are present: no error is thrown, but
+the result will look ugly because only the red channel is taken (it assumes all
+three channels are the same in this case so ignores green and blue). The reason
+no error is given is to allow converting from three-channel grayscale images to
+one-channel even if there are numerical imprecisions.
+-anything to palette when the palette does not have an exact match for a from-color
+in it: in this case an error is thrown
+
+Supported color conversions:
+-anything to 8-bit RGB, 8-bit RGBA, 16-bit RGB, 16-bit RGBA
+-any gray or gray+alpha, to gray or gray+alpha
+-anything to a palette, as long as the palette has the requested colors in it
+-removing alpha channel
+-higher to smaller bitdepth, and vice versa
+
+If you want no color conversion to be done (e.g. for speed or control):
+-In the encoder, you can make it save a PNG with any color type by giving the
+raw color mode and LodePNGInfo the same color mode, and setting auto_convert to
+false.
+-In the decoder, you can make it store the pixel data in the same color type
+as the PNG has, by setting the color_convert setting to false. Settings in
+info_raw are then ignored.
+
+6.3. padding bits
+-----------------
+
+In the PNG file format, if a less than 8-bit per pixel color type is used and the scanlines
+have a bit amount that isn't a multiple of 8, then padding bits are used so that each
+scanline starts at a fresh byte. But that is NOT true for the LodePNG raw input and output.
+The raw input image you give to the encoder, and the raw output image you get from the decoder
+will NOT have these padding bits, e.g. in the case of a 1-bit image with a width
+of 7 pixels, the first pixel of the second scanline will be the 8th bit of the first byte,
+not the first bit of a new byte.
+
+6.4. A note about 16-bits per channel and endianness
+----------------------------------------------------
+
+LodePNG uses unsigned char arrays for 16-bit per channel colors too, just like
+for any other color format. The 16-bit values are stored in big endian (most
+significant byte first) in these arrays. This is the opposite order of the
+little endian used by x86 CPU's.
+
+LodePNG always uses big endian because the PNG file format does so internally.
+Conversions to formats other than those PNG uses internally are not supported by
+LodePNG on purpose, there are myriads of formats, including endianness of 16-bit
+colors, the order in which you store R, G, B and A, and so on. Supporting and
+converting to/from all that is outside the scope of LodePNG.
+
+This may mean that, depending on your use case, you may want to convert the big
+endian output of LodePNG to little endian with a for loop. This is certainly not
+always needed, many applications and libraries support big endian 16-bit colors
+anyway, but it means you cannot simply cast the unsigned char* buffer to an
+unsigned short* buffer on x86 CPUs.
+
+
+7. error values
+---------------
+
+All functions in LodePNG that return an error code, return 0 if everything went
+OK, or a non-zero code if there was an error.
+
+The meaning of the LodePNG error values can be retrieved with the function
+lodepng_error_text: given the numerical error code, it returns a description
+of the error in English as a string.
+
+Check the implementation of lodepng_error_text to see the meaning of each code.
+
+
+8. chunks and PNG editing
+-------------------------
+
+If you want to add extra chunks to a PNG you encode, or use LodePNG for a PNG
+editor that should follow the rules about handling of unknown chunks, or if your
+program is able to read other types of chunks than the ones handled by LodePNG,
+then that's possible with the chunk functions of LodePNG.
+
+A PNG chunk has the following layout:
+
+4 bytes length
+4 bytes type name
+length bytes data
+4 bytes CRC
+
+8.1. iterating through chunks
+-----------------------------
+
+If you have a buffer containing the PNG image data, then the first chunk (the
+IHDR chunk) starts at byte number 8 of that buffer. The first 8 bytes are the
+signature of the PNG and are not part of a chunk. But if you start at byte 8
+then you have a chunk, and can check the following things of it.
+
+NOTE: none of these functions check for memory buffer boundaries. To avoid
+exploits, always make sure the buffer contains all the data of the chunks.
+When using lodepng_chunk_next, make sure the returned value is within the
+allocated memory.
+
+unsigned lodepng_chunk_length(const unsigned char* chunk):
+
+Get the length of the chunk's data. The total chunk length is this length + 12.
+
+void lodepng_chunk_type(char type[5], const unsigned char* chunk):
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type):
+
+Get the type of the chunk or compare if it's a certain type
+
+unsigned char lodepng_chunk_critical(const unsigned char* chunk):
+unsigned char lodepng_chunk_private(const unsigned char* chunk):
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk):
+
+Check if the chunk is critical in the PNG standard (only IHDR, PLTE, IDAT and IEND are).
+Check if the chunk is private (public chunks are part of the standard, private ones not).
+Check if the chunk is safe to copy. If it's not, then, when modifying data in a critical
+chunk, unsafe to copy chunks of the old image may NOT be saved in the new one if your
+program doesn't handle that type of unknown chunk.
+
+unsigned char* lodepng_chunk_data(unsigned char* chunk):
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk):
+
+Get a pointer to the start of the data of the chunk.
+
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk):
+void lodepng_chunk_generate_crc(unsigned char* chunk):
+
+Check if the crc is correct or generate a correct one.
+
+unsigned char* lodepng_chunk_next(unsigned char* chunk):
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk):
+
+Iterate to the next chunk. This works if you have a buffer with consecutive chunks. Note that these
+functions do no boundary checking of the allocated data whatsoever, so make sure there is enough
+data available in the buffer to be able to go to the next chunk.
+
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk):
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+                              const char* type, const unsigned char* data):
+
+These functions are used to create new chunks that are appended to the data in *out that has
+length *outlength. The append function appends an existing chunk to the new data. The create
+function creates a new chunk with the given parameters and appends it. Type is the 4-letter
+name of the chunk.
+
+8.2. chunks in info_png
+-----------------------
+
+The LodePNGInfo struct contains fields with the unknown chunk in it. It has 3
+buffers (each with size) to contain 3 types of unknown chunks:
+the ones that come before the PLTE chunk, the ones that come between the PLTE
+and the IDAT chunks, and the ones that come after the IDAT chunks.
+It's necessary to make the distinction between these 3 cases because the PNG
+standard forces to keep the ordering of unknown chunks compared to the critical
+chunks, but does not force any other ordering rules.
+
+info_png.unknown_chunks_data[0] is the chunks before PLTE
+info_png.unknown_chunks_data[1] is the chunks after PLTE, before IDAT
+info_png.unknown_chunks_data[2] is the chunks after IDAT
+
+The chunks in these 3 buffers can be iterated through and read in the same
+way as described in the previous subchapter.
+
+When using the decoder to decode a PNG, you can make it store all unknown chunks
+if you set the option settings.remember_unknown_chunks to 1. By default, this
+option is off (0).
+
+The encoder will always encode unknown chunks that are stored in the info_png.
+If you need it to add a particular chunk that isn't known by LodePNG, you can
+use lodepng_chunk_append or lodepng_chunk_create to add it to the chunk data in
+info_png.unknown_chunks_data[x].
+
+Chunks that are known by LodePNG should not be added in that way. E.g. to make
+LodePNG add a bKGD chunk, set background_defined to true and add the correct
+parameters there instead.
+
+
+9. compiler support
+-------------------
+
+No libraries other than the current standard C library are needed to compile
+LodePNG. For the C++ version, only the standard C++ library is needed on top.
+Add the files lodepng.c(pp) and lodepng.h to your project, include
+lodepng.h where needed, and your program can read/write PNG files.
+
+It is compatible with C90 and up, and C++03 and up.
+
+If performance is important, use optimization when compiling! For both the
+encoder and decoder, this makes a large difference.
+
+Make sure that LodePNG is compiled with the same compiler of the same version
+and with the same settings as the rest of the program, or the interfaces with
+std::vectors and std::strings in C++ can be incompatible.
+
+CHAR_BIT must be 8 or higher, because LodePNG uses unsigned chars for octets.
+
+*) gcc and g++
+
+LodePNG is developed in gcc so this compiler is natively supported. It gives no
+warnings with compiler options "-Wall -Wextra -pedantic -ansi", with gcc and g++
+version 4.7.1 on Linux, 32-bit and 64-bit.
+
+*) Clang
+
+Fully supported and warning-free.
+
+*) Mingw
+
+The Mingw compiler (a port of gcc for Windows) should be fully supported by
+LodePNG.
+
+*) Visual Studio and Visual C++ Express Edition
+
+LodePNG should be warning-free with warning level W4. Two warnings were disabled
+with pragmas though: warning 4244 about implicit conversions, and warning 4996
+where it wants to use a non-standard function fopen_s instead of the standard C
+fopen.
+
+Visual Studio may want "stdafx.h" files to be included in each source file and
+give an error "unexpected end of file while looking for precompiled header".
+This is not standard C++ and will not be added to the stock LodePNG. You can
+disable it for lodepng.cpp only by right clicking it, Properties, C/C++,
+Precompiled Headers, and set it to Not Using Precompiled Headers there.
+
+NOTE: Modern versions of VS should be fully supported, but old versions, e.g.
+VS6, are not guaranteed to work.
+
+*) Compilers on Macintosh
+
+LodePNG has been reported to work both with gcc and LLVM for Macintosh, both for
+C and C++.
+
+*) Other Compilers
+
+If you encounter problems on any compilers, feel free to let me know and I may
+try to fix it if the compiler is modern and standards compliant.
+
+
+10. examples
+------------
+
+This decoder example shows the most basic usage of LodePNG. More complex
+examples can be found on the LodePNG website.
+
+10.1. decoder C++ example
+-------------------------
+
+#include "lodepng.h"
+#include <iostream>
+
+int main(int argc, char *argv[]) {
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  //load and decode
+  std::vector<unsigned char> image;
+  unsigned width, height;
+  unsigned error = lodepng::decode(image, width, height, filename);
+
+  //if there's an error, display it
+  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
+
+  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
+}
+
+10.2. decoder C example
+-----------------------
+
+#include "lodepng.h"
+
+int main(int argc, char *argv[]) {
+  unsigned error;
+  unsigned char* image;
+  unsigned width, height;
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  error = lodepng_decode32_file(&image, &width, &height, filename);
+
+  if(error) printf("decoder error %u: %s\n", error, lodepng_error_text(error));
+
+  / * use image here * /
+
+  free(image);
+  return 0;
+}
+
+11. state settings reference
+----------------------------
+
+A quick reference of some settings to set on the LodePNGState
+
+For decoding:
+
+state.decoder.zlibsettings.ignore_adler32: ignore ADLER32 checksums
+state.decoder.zlibsettings.custom_...: use custom inflate function
+state.decoder.ignore_crc: ignore CRC checksums
+state.decoder.ignore_critical: ignore unknown critical chunks
+state.decoder.ignore_end: ignore missing IEND chunk. May fail if this corruption causes other errors
+state.decoder.color_convert: convert internal PNG color to chosen one
+state.decoder.read_text_chunks: whether to read in text metadata chunks
+state.decoder.remember_unknown_chunks: whether to read in unknown chunks
+state.info_raw.colortype: desired color type for decoded image
+state.info_raw.bitdepth: desired bit depth for decoded image
+state.info_raw....: more color settings, see struct LodePNGColorMode
+state.info_png....: no settings for decoder but output, see struct LodePNGInfo
+
+For encoding:
+
+state.encoder.zlibsettings.btype: disable compression by setting it to 0
+state.encoder.zlibsettings.use_lz77: use LZ77 in compression
+state.encoder.zlibsettings.windowsize: tweak LZ77 windowsize
+state.encoder.zlibsettings.minmatch: tweak min LZ77 length to match
+state.encoder.zlibsettings.nicematch: tweak LZ77 match where to stop searching
+state.encoder.zlibsettings.lazymatching: try one more LZ77 matching
+state.encoder.zlibsettings.custom_...: use custom deflate function
+state.encoder.auto_convert: choose optimal PNG color type, if 0 uses info_png
+state.encoder.filter_palette_zero: PNG filter strategy for palette
+state.encoder.filter_strategy: PNG filter strategy to encode with
+state.encoder.force_palette: add palette even if not encoding to one
+state.encoder.add_id: add LodePNG identifier and version as a text chunk
+state.encoder.text_compression: use compressed text chunks for metadata
+state.info_raw.colortype: color type of raw input image you provide
+state.info_raw.bitdepth: bit depth of raw input image you provide
+state.info_raw: more color settings, see struct LodePNGColorMode
+state.info_png.color.colortype: desired color type if auto_convert is false
+state.info_png.color.bitdepth: desired bit depth if auto_convert is false
+state.info_png.color....: more color settings, see struct LodePNGColorMode
+state.info_png....: more PNG related settings, see struct LodePNGInfo
+
+
+12. changes
+-----------
+
+The version number of LodePNG is the date of the change given in the format
+yyyymmdd.
+
+Some changes aren't backwards compatible. Those are indicated with a (!)
+symbol.
+
+*) 30 dec 2018: code style changes only: removed newlines before opening braces.
+*) 10 sep 2018: added way to inspect metadata chunks without full decoding.
+*) 19 aug 2018 (!): fixed color mode bKGD is encoded with and made it use
+   palette index in case of palette.
+*) 10 aug 2018 (!): added support for gAMA, cHRM, sRGB and iCCP chunks. This
+   change is backwards compatible unless you relied on unknown_chunks for those.
+*) 11 jun 2018: less restrictive check for pixel size integer overflow
+*) 14 jan 2018: allow optionally ignoring a few more recoverable errors
+*) 17 sep 2017: fix memory leak for some encoder input error cases
+*) 27 nov 2016: grey+alpha auto color model detection bugfix
+*) 18 apr 2016: Changed qsort to custom stable sort (for platforms w/o qsort).
+*) 09 apr 2016: Fixed colorkey usage detection, and better file loading (within
+   the limits of pure C90).
+*) 08 dec 2015: Made load_file function return error if file can't be opened.
+*) 24 okt 2015: Bugfix with decoding to palette output.
+*) 18 apr 2015: Boundary PM instead of just package-merge for faster encoding.
+*) 23 aug 2014: Reduced needless memory usage of decoder.
+*) 28 jun 2014: Removed fix_png setting, always support palette OOB for
+    simplicity. Made ColorProfile public.
+*) 09 jun 2014: Faster encoder by fixing hash bug and more zeros optimization.
+*) 22 dec 2013: Power of two windowsize required for optimization.
+*) 15 apr 2013: Fixed bug with LAC_ALPHA and color key.
+*) 25 mar 2013: Added an optional feature to ignore some PNG errors (fix_png).
+*) 11 mar 2013 (!): Bugfix with custom free. Changed from "my" to "lodepng_"
+    prefix for the custom allocators and made it possible with a new #define to
+    use custom ones in your project without needing to change lodepng's code.
+*) 28 jan 2013: Bugfix with color key.
+*) 27 okt 2012: Tweaks in text chunk keyword length error handling.
+*) 8 okt 2012 (!): Added new filter strategy (entropy) and new auto color mode.
+    (no palette). Better deflate tree encoding. New compression tweak settings.
+    Faster color conversions while decoding. Some internal cleanups.
+*) 23 sep 2012: Reduced warnings in Visual Studio a little bit.
+*) 1 sep 2012 (!): Removed #define's for giving custom (de)compression functions
+    and made it work with function pointers instead.
+*) 23 jun 2012: Added more filter strategies. Made it easier to use custom alloc
+    and free functions and toggle #defines from compiler flags. Small fixes.
+*) 6 may 2012 (!): Made plugging in custom zlib/deflate functions more flexible.
+*) 22 apr 2012 (!): Made interface more consistent, renaming a lot. Removed
+    redundant C++ codec classes. Reduced amount of structs. Everything changed,
+    but it is cleaner now imho and functionality remains the same. Also fixed
+    several bugs and shrunk the implementation code. Made new samples.
+*) 6 nov 2011 (!): By default, the encoder now automatically chooses the best
+    PNG color model and bit depth, based on the amount and type of colors of the
+    raw image. For this, autoLeaveOutAlphaChannel replaced by auto_choose_color.
+*) 9 okt 2011: simpler hash chain implementation for the encoder.
+*) 8 sep 2011: lz77 encoder lazy matching instead of greedy matching.
+*) 23 aug 2011: tweaked the zlib compression parameters after benchmarking.
+    A bug with the PNG filtertype heuristic was fixed, so that it chooses much
+    better ones (it's quite significant). A setting to do an experimental, slow,
+    brute force search for PNG filter types is added.
+*) 17 aug 2011 (!): changed some C zlib related function names.
+*) 16 aug 2011: made the code less wide (max 120 characters per line).
+*) 17 apr 2011: code cleanup. Bugfixes. Convert low to 16-bit per sample colors.
+*) 21 feb 2011: fixed compiling for C90. Fixed compiling with sections disabled.
+*) 11 dec 2010: encoding is made faster, based on suggestion by Peter Eastman
+    to optimize long sequences of zeros.
+*) 13 nov 2010: added LodePNG_InfoColor_hasPaletteAlpha and
+    LodePNG_InfoColor_canHaveAlpha functions for convenience.
+*) 7 nov 2010: added LodePNG_error_text function to get error code description.
+*) 30 okt 2010: made decoding slightly faster
+*) 26 okt 2010: (!) changed some C function and struct names (more consistent).
+     Reorganized the documentation and the declaration order in the header.
+*) 08 aug 2010: only changed some comments and external samples.
+*) 05 jul 2010: fixed bug thanks to warnings in the new gcc version.
+*) 14 mar 2010: fixed bug where too much memory was allocated for char buffers.
+*) 02 sep 2008: fixed bug where it could create empty tree that linux apps could
+    read by ignoring the problem but windows apps couldn't.
+*) 06 jun 2008: added more error checks for out of memory cases.
+*) 26 apr 2008: added a few more checks here and there to ensure more safety.
+*) 06 mar 2008: crash with encoding of strings fixed
+*) 02 feb 2008: support for international text chunks added (iTXt)
+*) 23 jan 2008: small cleanups, and #defines to divide code in sections
+*) 20 jan 2008: support for unknown chunks allowing using LodePNG for an editor.
+*) 18 jan 2008: support for tIME and pHYs chunks added to encoder and decoder.
+*) 17 jan 2008: ability to encode and decode compressed zTXt chunks added
+    Also various fixes, such as in the deflate and the padding bits code.
+*) 13 jan 2008: Added ability to encode Adam7-interlaced images. Improved
+    filtering code of encoder.
+*) 07 jan 2008: (!) changed LodePNG to use ISO C90 instead of C++. A
+    C++ wrapper around this provides an interface almost identical to before.
+    Having LodePNG be pure ISO C90 makes it more portable. The C and C++ code
+    are together in these files but it works both for C and C++ compilers.
+*) 29 dec 2007: (!) changed most integer types to unsigned int + other tweaks
+*) 30 aug 2007: bug fixed which makes this Borland C++ compatible
+*) 09 aug 2007: some VS2005 warnings removed again
+*) 21 jul 2007: deflate code placed in new namespace separate from zlib code
+*) 08 jun 2007: fixed bug with 2- and 4-bit color, and small interlaced images
+*) 04 jun 2007: improved support for Visual Studio 2005: crash with accessing
+    invalid std::vector element [0] fixed, and level 3 and 4 warnings removed
+*) 02 jun 2007: made the encoder add a tag with version by default
+*) 27 may 2007: zlib and png code separated (but still in the same file),
+    simple encoder/decoder functions added for more simple usage cases
+*) 19 may 2007: minor fixes, some code cleaning, new error added (error 69),
+    moved some examples from here to lodepng_examples.cpp
+*) 12 may 2007: palette decoding bug fixed
+*) 24 apr 2007: changed the license from BSD to the zlib license
+*) 11 mar 2007: very simple addition: ability to encode bKGD chunks.
+*) 04 mar 2007: (!) tEXt chunk related fixes, and support for encoding
+    palettized PNG images. Plus little interface change with palette and texts.
+*) 03 mar 2007: Made it encode dynamic Huffman shorter with repeat codes.
+    Fixed a bug where the end code of a block had length 0 in the Huffman tree.
+*) 26 feb 2007: Huffman compression with dynamic trees (BTYPE 2) now implemented
+    and supported by the encoder, resulting in smaller PNGs at the output.
+*) 27 jan 2007: Made the Adler-32 test faster so that a timewaste is gone.
+*) 24 jan 2007: gave encoder an error interface. Added color conversion from any
+    greyscale type to 8-bit greyscale with or without alpha.
+*) 21 jan 2007: (!) Totally changed the interface. It allows more color types
+    to convert to and is more uniform. See the manual for how it works now.
+*) 07 jan 2007: Some cleanup & fixes, and a few changes over the last days:
+    encode/decode custom tEXt chunks, separate classes for zlib & deflate, and
+    at last made the decoder give errors for incorrect Adler32 or Crc.
+*) 01 jan 2007: Fixed bug with encoding PNGs with less than 8 bits per channel.
+*) 29 dec 2006: Added support for encoding images without alpha channel, and
+    cleaned out code as well as making certain parts faster.
+*) 28 dec 2006: Added "Settings" to the encoder.
+*) 26 dec 2006: The encoder now does LZ77 encoding and produces much smaller files now.
+    Removed some code duplication in the decoder. Fixed little bug in an example.
+*) 09 dec 2006: (!) Placed output parameters of public functions as first parameter.
+    Fixed a bug of the decoder with 16-bit per color.
+*) 15 okt 2006: Changed documentation structure
+*) 09 okt 2006: Encoder class added. It encodes a valid PNG image from the
+    given image buffer, however for now it's not compressed.
+*) 08 sep 2006: (!) Changed to interface with a Decoder class
+*) 30 jul 2006: (!) LodePNG_InfoPng , width and height are now retrieved in different
+    way. Renamed decodePNG to decodePNGGeneric.
+*) 29 jul 2006: (!) Changed the interface: image info is now returned as a
+    struct of type LodePNG::LodePNG_Info, instead of a vector, which was a bit clumsy.
+*) 28 jul 2006: Cleaned the code and added new error checks.
+    Corrected terminology "deflate" into "inflate".
+*) 23 jun 2006: Added SDL example in the documentation in the header, this
+    example allows easy debugging by displaying the PNG and its transparency.
+*) 22 jun 2006: (!) Changed way to obtain error value. Added
+    loadFile function for convenience. Made decodePNG32 faster.
+*) 21 jun 2006: (!) Changed type of info vector to unsigned.
+    Changed position of palette in info vector. Fixed an important bug that
+    happened on PNGs with an uncompressed block.
+*) 16 jun 2006: Internally changed unsigned into unsigned where
+    needed, and performed some optimizations.
+*) 07 jun 2006: (!) Renamed functions to decodePNG and placed them
+    in LodePNG namespace. Changed the order of the parameters. Rewrote the
+    documentation in the header. Renamed files to lodepng.cpp and lodepng.h
+*) 22 apr 2006: Optimized and improved some code
+*) 07 sep 2005: (!) Changed to std::vector interface
+*) 12 aug 2005: Initial release (C++, decoder only)
+
+
+13. contact information
+-----------------------
+
+Feel free to contact me with suggestions, problems, comments, ... concerning
+LodePNG. If you encounter a PNG image that doesn't work properly with this
+decoder, feel free to send it and I'll use it to find and fix the problem.
+
+My email address is (puzzle the account and domain together with an @ symbol):
+Domain: gmail dot com.
+Account: lode dot vandevenne.
+
+
+Copyright (c) 2005-2019 Lode Vandevenne
+*/