37 files changed, 867 insertions, 4626 deletions
diff --git a/thirdparty/README.md b/thirdparty/README.md
index cacc0275dd..33ce2423d9 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -89,17 +89,19 @@ will limit its functionality to IPv4 only.
 ## etcpak
 
 - Upstream: https://github.com/wolfpld/etcpak
-- Version: git (403d38b3f1cb347c196d845d0a05e44a00d17169, 2021)
+- Version: git (f27daea656ff77671580f838a889e33049430ebd, 2021)
 - License: BSD-3-Clause
 
-Important: Some Godot-made changes, see `patches` folders.
-
 Files extracted from upstream source:
 
-- All `.cpp` and `.hpp` files in the root folder except `Application.cpp`.
-- `lz4` folder.
+- Only the files relevant for compression (i.e. `Process*.cpp` and their deps):
+  ```
+  Dither.{cpp,hpp} ForceInline.hpp Math.hpp ProcessCommon.hpp ProcessRGB.{cpp,hpp}
+  ProcessDxtc.{cpp,hpp} Tables.{cpp,hpp} Vector.hpp
+  ```
 - `AUTHORS.txt` and `LICENSE.txt`
 
+
 ## fonts
 
 - `NotoSans*.ttf`, `NotoNaskhArabicUI_Regular.ttf`:
@@ -342,7 +344,7 @@ File extracted from upstream release tarball:
 ## meshoptimizer
 
 - Upstream: https://github.com/zeux/meshoptimizer
-- Version: git (e3f53f66e7a35b9b8764bee478589d79e34fa698, 2021)
+- Version: 0.16 (95893c0566646434dd675b708d293fcb2d526d08, 2021)
 - License: MIT
 
 Files extracted from upstream repository:
diff --git a/thirdparty/etcpak/Bitmap.cpp b/thirdparty/etcpak/Bitmap.cpp
deleted file mode 100644
index ef318318ac..0000000000
--- a/thirdparty/etcpak/Bitmap.cpp
+++ /dev/null
@@ -1,216 +0,0 @@
-#include <ctype.h>
-#include <stdlib.h>
-#include <string.h>
-#include <assert.h>
-
-#include <png.h>
-#include "lz4/lz4.h"
-
-#include "Bitmap.hpp"
-#include "Debug.hpp"
-
-Bitmap::Bitmap( const char* fn, unsigned int lines, bool bgr )
-    : m_block( nullptr )
-    , m_lines( lines )
-    , m_alpha( true )
-    , m_sema( 0 )
-{
-    FILE* f = fopen( fn, "rb" );
-    assert( f );
-
-    char buf[4];
-    fread( buf, 1, 4, f );
-    if( memcmp( buf, "raw4", 4 ) == 0 )
-    {
-        uint8_t a;
-        fread( &a, 1, 1, f );
-        m_alpha = a == 1;
-        uint32_t d;
-        fread( &d, 1, 4, f );
-        m_size.x = d;
-        fread( &d, 1, 4, f );
-        m_size.y = d;
-        DBGPRINT( "Raw bitmap " << fn << "  " << m_size.x << "x" << m_size.y );
-
-        assert( m_size.x % 4 == 0 );
-        assert( m_size.y % 4 == 0 );
-
-        int32_t csize;
-        fread( &csize, 1, 4, f );
-        char* cbuf = new char[csize];
-        fread( cbuf, 1, csize, f );
-        fclose( f );
-
-        m_block = m_data = new uint32_t[m_size.x*m_size.y];
-        m_linesLeft = m_size.y / 4;
-
-        LZ4_decompress_fast( cbuf, (char*)m_data, m_size.x*m_size.y*4 );
-        delete[] cbuf;
-
-        for( int i=0; i<m_size.y/4; i++ )
-        {
-            m_sema.unlock();
-        }
-    }
-    else
-    {
-        fseek( f, 0, SEEK_SET );
-
-        unsigned int sig_read = 0;
-        int bit_depth, color_type, interlace_type;
-
-        png_structp png_ptr = png_create_read_struct( PNG_LIBPNG_VER_STRING, NULL, NULL, NULL );
-        png_infop info_ptr = png_create_info_struct( png_ptr );
-        setjmp( png_jmpbuf( png_ptr ) );
-
-        png_init_io( png_ptr, f );
-        png_set_sig_bytes( png_ptr, sig_read );
-
-        png_uint_32 w, h;
-
-        png_read_info( png_ptr, info_ptr );
-        png_get_IHDR( png_ptr, info_ptr, &w, &h, &bit_depth, &color_type, &interlace_type, NULL, NULL );
-
-        m_size = v2i( w, h );
-
-        png_set_strip_16( png_ptr );
-        if( color_type == PNG_COLOR_TYPE_PALETTE )
-        {
-            png_set_palette_to_rgb( png_ptr );
-        }
-        else if( color_type == PNG_COLOR_TYPE_GRAY && bit_depth < 8 )
-        {
-            png_set_expand_gray_1_2_4_to_8( png_ptr );
-        }
-        if( png_get_valid( png_ptr, info_ptr, PNG_INFO_tRNS ) )
-        {
-            png_set_tRNS_to_alpha( png_ptr );
-        }
-        if( color_type == PNG_COLOR_TYPE_GRAY_ALPHA )
-        {
-            png_set_gray_to_rgb(png_ptr);
-        }
-        if( bgr )
-        {
-            png_set_bgr(png_ptr);
-        }
-
-        switch( color_type )
-        {
-        case PNG_COLOR_TYPE_PALETTE:
-            if( !png_get_valid( png_ptr, info_ptr, PNG_INFO_tRNS ) )
-            {
-                png_set_filler( png_ptr, 0xff, PNG_FILLER_AFTER );
-                m_alpha = false;
-            }
-            break;
-        case PNG_COLOR_TYPE_GRAY_ALPHA:
-            png_set_gray_to_rgb( png_ptr );
-            break;
-        case PNG_COLOR_TYPE_RGB:
-            png_set_filler( png_ptr, 0xff, PNG_FILLER_AFTER );
-            m_alpha = false;
-            break;
-        default:
-            break;
-        }
-
-        DBGPRINT( "Bitmap " << fn << "  " << w << "x" << h );
-
-        assert( w % 4 == 0 );
-        assert( h % 4 == 0 );
-
-        m_block = m_data = new uint32_t[w*h];
-        m_linesLeft = h / 4;
-
-        m_load = std::async( std::launch::async, [this, f, png_ptr, info_ptr]() mutable
-        {
-            auto ptr = m_data;
-            unsigned int lines = 0;
-            for( int i=0; i<m_size.y / 4; i++ )
-            {
-                for( int j=0; j<4; j++ )
-                {
-                    png_read_rows( png_ptr, (png_bytepp)&ptr, NULL, 1 );
-                    ptr += m_size.x;
-                }
-                lines++;
-                if( lines >= m_lines )
-                {
-                    lines = 0;
-                    m_sema.unlock();
-                }
-            }
-
-            if( lines != 0 )
-            {
-                m_sema.unlock();
-            }
-
-            png_read_end( png_ptr, info_ptr );
-            png_destroy_read_struct( &png_ptr, &info_ptr, NULL );
-            fclose( f );
-        } );
-    }
-}
-
-Bitmap::Bitmap( const v2i& size )
-    : m_data( new uint32_t[size.x*size.y] )
-    , m_block( nullptr )
-    , m_lines( 1 )
-    , m_linesLeft( size.y / 4 )
-    , m_size( size )
-    , m_sema( 0 )
-{
-}
-
-Bitmap::Bitmap( const Bitmap& src, unsigned int lines )
-    : m_lines( lines )
-    , m_alpha( src.Alpha() )
-    , m_sema( 0 )
-{
-}
-
-Bitmap::~Bitmap()
-{
-    delete[] m_data;
-}
-
-void Bitmap::Write( const char* fn )
-{
-    FILE* f = fopen( fn, "wb" );
-    assert( f );
-
-    png_structp png_ptr = png_create_write_struct( PNG_LIBPNG_VER_STRING, NULL, NULL, NULL );
-    png_infop info_ptr = png_create_info_struct( png_ptr );
-    setjmp( png_jmpbuf( png_ptr ) );
-    png_init_io( png_ptr, f );
-
-    png_set_IHDR( png_ptr, info_ptr, m_size.x, m_size.y, 8, PNG_COLOR_TYPE_RGB_ALPHA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE );
-
-    png_write_info( png_ptr, info_ptr );
-
-    uint32_t* ptr = m_data;
-    for( int i=0; i<m_size.y; i++ )
-    {
-        png_write_rows( png_ptr, (png_bytepp)(&ptr), 1 );
-        ptr += m_size.x;
-    }
-
-    png_write_end( png_ptr, info_ptr );
-    png_destroy_write_struct( &png_ptr, &info_ptr );
-
-    fclose( f );
-}
-
-const uint32_t* Bitmap::NextBlock( unsigned int& lines, bool& done )
-{
-    std::lock_guard<std::mutex> lock( m_lock );
-    lines = std::min( m_lines, m_linesLeft );
-    auto ret = m_block;
-    m_sema.lock();
-    m_block += m_size.x * 4 * lines;
-    m_linesLeft -= lines;
-    done = m_linesLeft == 0;
-    return ret;
-}
diff --git a/thirdparty/etcpak/Bitmap.hpp b/thirdparty/etcpak/Bitmap.hpp
deleted file mode 100644
index fae8c936ed..0000000000
--- a/thirdparty/etcpak/Bitmap.hpp
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef __DARKRL__BITMAP_HPP__
-#define __DARKRL__BITMAP_HPP__
-
-#include <future>
-#include <memory>
-#include <mutex>
-#include <stdint.h>
-
-#include "Semaphore.hpp"
-#include "Vector.hpp"
-
-enum class Channels
-{
-    RGB,
-    Alpha
-};
-
-class Bitmap
-{
-public:
-    Bitmap( const char* fn, unsigned int lines, bool bgr );
-    Bitmap( const v2i& size );
-    virtual ~Bitmap();
-
-    void Write( const char* fn );
-
-    uint32_t* Data() { if( m_load.valid() ) m_load.wait(); return m_data; }
-    const uint32_t* Data() const { if( m_load.valid() ) m_load.wait(); return m_data; }
-    const v2i& Size() const { return m_size; }
-    bool Alpha() const { return m_alpha; }
-
-    const uint32_t* NextBlock( unsigned int& lines, bool& done );
-
-protected:
-    Bitmap( const Bitmap& src, unsigned int lines );
-
-    uint32_t* m_data;
-    uint32_t* m_block;
-    unsigned int m_lines;
-    unsigned int m_linesLeft;
-    v2i m_size;
-    bool m_alpha;
-    Semaphore m_sema;
-    std::mutex m_lock;
-    std::future<void> m_load;
-};
-
-typedef std::shared_ptr<Bitmap> BitmapPtr;
-
-#endif
diff --git a/thirdparty/etcpak/BitmapDownsampled.cpp b/thirdparty/etcpak/BitmapDownsampled.cpp
deleted file mode 100644
index 0eb0d81185..0000000000
--- a/thirdparty/etcpak/BitmapDownsampled.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-#include <string.h>
-#include <utility>
-
-#include "BitmapDownsampled.hpp"
-#include "Debug.hpp"
-
-BitmapDownsampled::BitmapDownsampled( const Bitmap& bmp, unsigned int lines )
-    : Bitmap( bmp, lines )
-{
-    m_size.x = std::max( 1, bmp.Size().x / 2 );
-    m_size.y = std::max( 1, bmp.Size().y / 2 );
-
-    int w = std::max( m_size.x, 4 );
-    int h = std::max( m_size.y, 4 );
-
-    DBGPRINT( "Subbitmap " << m_size.x << "x" << m_size.y );
-
-    m_block = m_data = new uint32_t[w*h];
-
-    if( m_size.x < w || m_size.y < h )
-    {
-        memset( m_data, 0, w*h*sizeof( uint32_t ) );
-        m_linesLeft = h / 4;
-        unsigned int lines = 0;
-        for( int i=0; i<h/4; i++ )
-        {
-            for( int j=0; j<4; j++ )
-            {
-                lines++;
-                if( lines > m_lines )
-                {
-                    lines = 0;
-                    m_sema.unlock();
-                }
-            }
-        }
-        if( lines != 0 )
-        {
-            m_sema.unlock();
-        }
-    }
-    else
-    {
-        m_linesLeft = h / 4;
-        m_load = std::async( std::launch::async, [this, &bmp, w, h]() mutable
-        {
-            auto ptr = m_data;
-            auto src1 = bmp.Data();
-            auto src2 = src1 + bmp.Size().x;
-            unsigned int lines = 0;
-            for( int i=0; i<h/4; i++ )
-            {
-                for( int j=0; j<4; j++ )
-                {
-                    for( int k=0; k<m_size.x; k++ )
-                    {
-                        int r = ( ( *src1 & 0x000000FF ) + ( *(src1+1) & 0x000000FF ) + ( *src2 & 0x000000FF ) + ( *(src2+1) & 0x000000FF ) ) / 4;
-                        int g = ( ( ( *src1 & 0x0000FF00 ) + ( *(src1+1) & 0x0000FF00 ) + ( *src2 & 0x0000FF00 ) + ( *(src2+1) & 0x0000FF00 ) ) / 4 ) & 0x0000FF00;
-                        int b = ( ( ( *src1 & 0x00FF0000 ) + ( *(src1+1) & 0x00FF0000 ) + ( *src2 & 0x00FF0000 ) + ( *(src2+1) & 0x00FF0000 ) ) / 4 ) & 0x00FF0000;
-                        int a = ( ( ( ( ( *src1 & 0xFF000000 ) >> 8 ) + ( ( *(src1+1) & 0xFF000000 ) >> 8 ) + ( ( *src2 & 0xFF000000 ) >> 8 ) + ( ( *(src2+1) & 0xFF000000 ) >> 8 ) ) / 4 ) & 0x00FF0000 ) << 8;
-                        *ptr++ = r | g | b | a;
-                        src1 += 2;
-                        src2 += 2;
-                    }
-                    src1 += m_size.x * 2;
-                    src2 += m_size.x * 2;
-                }
-                lines++;
-                if( lines >= m_lines )
-                {
-                    lines = 0;
-                    m_sema.unlock();
-                }
-            }
-
-            if( lines != 0 )
-            {
-                m_sema.unlock();
-            }
-        } );
-    }
-}
-
-BitmapDownsampled::~BitmapDownsampled()
-{
-}
diff --git a/thirdparty/etcpak/BitmapDownsampled.hpp b/thirdparty/etcpak/BitmapDownsampled.hpp
deleted file mode 100644
index b7313808df..0000000000
--- a/thirdparty/etcpak/BitmapDownsampled.hpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __DARKRL__BITMAPDOWNSAMPLED_HPP__
-#define __DARKRL__BITMAPDOWNSAMPLED_HPP__
-
-#include "Bitmap.hpp"
-
-class BitmapDownsampled : public Bitmap
-{
-public:
-    BitmapDownsampled( const Bitmap& bmp, unsigned int lines );
-    ~BitmapDownsampled();
-};
-
-#endif
diff --git a/thirdparty/etcpak/BlockData.cpp b/thirdparty/etcpak/BlockData.cpp
deleted file mode 100644
index 4906e69492..0000000000
--- a/thirdparty/etcpak/BlockData.cpp
+++ /dev/null
@@ -1,1296 +0,0 @@
-#include <assert.h>
-#include <string.h>
-
-#include "BlockData.hpp"
-#include "ColorSpace.hpp"
-#include "Debug.hpp"
-#include "MipMap.hpp"
-#include "mmap.hpp"
-#include "ProcessRGB.hpp"
-#include "ProcessDxtc.hpp"
-#include "Tables.hpp"
-#include "TaskDispatch.hpp"
-
-#ifdef __ARM_NEON
-#  include <arm_neon.h>
-#endif
-
-#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
-#  ifdef _MSC_VER
-#    include <intrin.h>
-#    include <Windows.h>
-#    define _bswap(x) _byteswap_ulong(x)
-#    define _bswap64(x) _byteswap_uint64(x)
-#  else
-#    include <x86intrin.h>
-#  endif
-#endif
-
-#ifndef _bswap
-#  define _bswap(x) __builtin_bswap32(x)
-#  define _bswap64(x) __builtin_bswap64(x)
-#endif
-
-static uint8_t table59T58H[8] = { 3,6,11,16,23,32,41,64 };
-
-BlockData::BlockData( const char* fn )
-    : m_file( fopen( fn, "rb" ) )
-{
-    assert( m_file );
-    fseek( m_file, 0, SEEK_END );
-    m_maplen = ftell( m_file );
-    fseek( m_file, 0, SEEK_SET );
-    m_data = (uint8_t*)mmap( nullptr, m_maplen, PROT_READ, MAP_SHARED, fileno( m_file ), 0 );
-
-    auto data32 = (uint32_t*)m_data;
-    if( *data32 == 0x03525650 )
-    {
-        // PVR
-        switch( *(data32+2) )
-        {
-        case 6:
-            m_type = Etc1;
-            break;
-        case 7:
-            m_type = Dxt1;
-            break;
-        case 11:
-            m_type = Dxt5;
-            break;
-        case 22:
-            m_type = Etc2_RGB;
-            break;
-        case 23:
-            m_type = Etc2_RGBA;
-            break;
-        default:
-            assert( false );
-            break;
-        }
-
-        m_size.y = *(data32+6);
-        m_size.x = *(data32+7);
-        m_dataOffset = 52 + *(data32+12);
-    }
-    else if( *data32 == 0x58544BAB )
-    {
-        // KTX
-        switch( *(data32+7) )
-        {
-        case 0x9274:
-            m_type = Etc2_RGB;
-            break;
-        case 0x9278:
-            m_type = Etc2_RGBA;
-            break;
-        default:
-            assert( false );
-            break;
-        }
-
-        m_size.x = *(data32+9);
-        m_size.y = *(data32+10);
-        m_dataOffset = sizeof( uint32_t ) * 17 + *(data32+15);
-    }
-    else
-    {
-        assert( false );
-    }
-}
-
-static uint8_t* OpenForWriting( const char* fn, size_t len, const v2i& size, FILE** f, int levels, BlockData::Type type )
-{
-    *f = fopen( fn, "wb+" );
-    assert( *f );
-    fseek( *f, len - 1, SEEK_SET );
-    const char zero = 0;
-    fwrite( &zero, 1, 1, *f );
-    fseek( *f, 0, SEEK_SET );
-
-    auto ret = (uint8_t*)mmap( nullptr, len, PROT_WRITE, MAP_SHARED, fileno( *f ), 0 );
-    auto dst = (uint32_t*)ret;
-
-    *dst++ = 0x03525650;  // version
-    *dst++ = 0;           // flags
-    switch( type )        // pixelformat[0]
-    {
-    case BlockData::Etc1:
-        *dst++ = 6;
-        break;
-    case BlockData::Etc2_RGB:
-        *dst++ = 22;
-        break;
-    case BlockData::Etc2_RGBA:
-        *dst++ = 23;
-        break;
-    case BlockData::Dxt1:
-        *dst++ = 7;
-        break;
-    case BlockData::Dxt5:
-        *dst++ = 11;
-        break;
-    default:
-        assert( false );
-        break;
-    }
-    *dst++ = 0;           // pixelformat[1]
-    *dst++ = 0;           // colourspace
-    *dst++ = 0;           // channel type
-    *dst++ = size.y;      // height
-    *dst++ = size.x;      // width
-    *dst++ = 1;           // depth
-    *dst++ = 1;           // num surfs
-    *dst++ = 1;           // num faces
-    *dst++ = levels;      // mipmap count
-    *dst++ = 0;           // metadata size
-
-    return ret;
-}
-
-static int AdjustSizeForMipmaps( const v2i& size, int levels )
-{
-    int len = 0;
-    v2i current = size;
-    for( int i=1; i<levels; i++ )
-    {
-        assert( current.x != 1 || current.y != 1 );
-        current.x = std::max( 1, current.x / 2 );
-        current.y = std::max( 1, current.y / 2 );
-        len += std::max( 4, current.x ) * std::max( 4, current.y ) / 2;
-    }
-    assert( current.x == 1 && current.y == 1 );
-    return len;
-}
-
-BlockData::BlockData( const char* fn, const v2i& size, bool mipmap, Type type )
-    : m_size( size )
-    , m_dataOffset( 52 )
-    , m_maplen( m_size.x*m_size.y/2 )
-    , m_type( type )
-{
-    assert( m_size.x%4 == 0 && m_size.y%4 == 0 );
-
-    uint32_t cnt = m_size.x * m_size.y / 16;
-    DBGPRINT( cnt << " blocks" );
-
-    int levels = 1;
-
-    if( mipmap )
-    {
-        levels = NumberOfMipLevels( size );
-        DBGPRINT( "Number of mipmaps: " << levels );
-        m_maplen += AdjustSizeForMipmaps( size, levels );
-    }
-
-    if( type == Etc2_RGBA || type == Dxt5 ) m_maplen *= 2;
-
-    m_maplen += m_dataOffset;
-    m_data = OpenForWriting( fn, m_maplen, m_size, &m_file, levels, type );
-}
-
-BlockData::BlockData( const v2i& size, bool mipmap, Type type )
-    : m_size( size )
-    , m_dataOffset( 52 )
-    , m_file( nullptr )
-    , m_maplen( m_size.x*m_size.y/2 )
-    , m_type( type )
-{
-    assert( m_size.x%4 == 0 && m_size.y%4 == 0 );
-    if( mipmap )
-    {
-        const int levels = NumberOfMipLevels( size );
-        m_maplen += AdjustSizeForMipmaps( size, levels );
-    }
-
-    if( type == Etc2_RGBA || type == Dxt5 ) m_maplen *= 2;
-
-    m_maplen += m_dataOffset;
-    m_data = new uint8_t[m_maplen];
-}
-
-BlockData::~BlockData()
-{
-    if( m_file )
-    {
-        munmap( m_data, m_maplen );
-        fclose( m_file );
-    }
-    else
-    {
-        delete[] m_data;
-    }
-}
-
-void BlockData::Process( const uint32_t* src, uint32_t blocks, size_t offset, size_t width, Channels type, bool dither )
-{
-    auto dst = ((uint64_t*)( m_data + m_dataOffset )) + offset;
-
-    if( type == Channels::Alpha )
-    {
-        if( m_type != Etc1 )
-        {
-            CompressEtc2Alpha( src, dst, blocks, width );
-        }
-        else
-        {
-            CompressEtc1Alpha( src, dst, blocks, width );
-        }
-    }
-    else
-    {
-        switch( m_type )
-        {
-        case Etc1:
-            if( dither )
-            {
-                CompressEtc1RgbDither( src, dst, blocks, width );
-            }
-            else
-            {
-                CompressEtc1Rgb( src, dst, blocks, width );
-            }
-            break;
-        case Etc2_RGB:
-            CompressEtc2Rgb( src, dst, blocks, width );
-            break;
-        case Dxt1:
-            if( dither )
-            {
-                CompressDxt1Dither( src, dst, blocks, width );
-            }
-            else
-            {
-                CompressDxt1( src, dst, blocks, width );
-            }
-            break;
-        default:
-            assert( false );
-            break;
-        }
-    }
-}
-
-void BlockData::ProcessRGBA( const uint32_t* src, uint32_t blocks, size_t offset, size_t width )
-{
-    auto dst = ((uint64_t*)( m_data + m_dataOffset )) + offset * 2;
-
-    switch( m_type )
-    {
-    case Etc2_RGBA:
-        CompressEtc2Rgba( src, dst, blocks, width );
-        break;
-    case Dxt5:
-        CompressDxt5( src, dst, blocks, width );
-        break;
-    default:
-        assert( false );
-        break;
-    }
-}
-
-namespace
-{
-
-static etcpak_force_inline int32_t expand6(uint32_t value)
-{
-    return (value << 2) | (value >> 4);
-}
-
-static etcpak_force_inline int32_t expand7(uint32_t value)
-{
-    return (value << 1) | (value >> 6);
-}
-
-static etcpak_force_inline void DecodeT( uint64_t block, uint32_t* dst, uint32_t w )
-{
-    const auto r0 = ( block >> 24 ) & 0x1B;
-    const auto rh0 = ( r0 >> 3 ) & 0x3;
-    const auto rl0 = r0 & 0x3;
-    const auto g0 = ( block >> 20 ) & 0xF;
-    const auto b0 = ( block >> 16 ) & 0xF;
-
-    const auto r1 = ( block >> 12 ) & 0xF;
-    const auto g1 = ( block >> 8 ) & 0xF;
-    const auto b1 = ( block >> 4 ) & 0xF;
-
-    const auto cr0 = ( ( rh0 << 6 ) | ( rl0 << 4 ) | ( rh0 << 2 ) | rl0);
-    const auto cg0 = ( g0 << 4 ) | g0;
-    const auto cb0 = ( b0 << 4 ) | b0;
-
-    const auto cr1 = ( r1 << 4 ) | r1;
-    const auto cg1 = ( g1 << 4 ) | g1;
-    const auto cb1 = ( b1 << 4 ) | b1;
-
-    const auto codeword_hi = ( block >> 2 ) & 0x3;
-    const auto codeword_lo = block & 0x1;
-    const auto codeword = ( codeword_hi << 1 ) | codeword_lo;
-
-    const auto c2r = clampu8( cr1 + table59T58H[codeword] );
-    const auto c2g = clampu8( cg1 + table59T58H[codeword] );
-    const auto c2b = clampu8( cb1 + table59T58H[codeword] );
-
-    const auto c3r = clampu8( cr1 - table59T58H[codeword] );
-    const auto c3g = clampu8( cg1 - table59T58H[codeword] );
-    const auto c3b = clampu8( cb1 - table59T58H[codeword] );
-
-    const uint32_t col_tab[4] = {
-        uint32_t(cr0 | ( cg0 << 8 ) | ( cb0 << 16 ) | 0xFF000000),
-        uint32_t(c2r | ( c2g << 8 ) | ( c2b << 16 ) | 0xFF000000),
-        uint32_t(cr1 | ( cg1 << 8 ) | ( cb1 << 16 ) | 0xFF000000),
-        uint32_t(c3r | ( c3g << 8 ) | ( c3b << 16 ) | 0xFF000000)
-    };
-
-    const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
-    for( uint8_t j = 0; j < 4; j++ )
-    {
-        for( uint8_t i = 0; i < 4; i++ )
-        {
-            //2bit indices distributed on two lane 16bit numbers
-            const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1) | ( ( indexes >> ( j + i * 4 ) ) & 0x1);
-            dst[j * w + i] = col_tab[index];
-        }
-    }
-}
-
-static etcpak_force_inline void DecodeTAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
-{
-    const auto r0 = ( block >> 24 ) & 0x1B;
-    const auto rh0 = ( r0 >> 3 ) & 0x3;
-    const auto rl0 = r0 & 0x3;
-    const auto g0 = ( block >> 20 ) & 0xF;
-    const auto b0 = ( block >> 16 ) & 0xF;
-
-    const auto r1 = ( block >> 12 ) & 0xF;
-    const auto g1 = ( block >> 8 ) & 0xF;
-    const auto b1 = ( block >> 4 ) & 0xF;
-
-    const auto cr0 = ( ( rh0 << 6 ) | ( rl0 << 4 ) | ( rh0 << 2 ) | rl0);
-    const auto cg0 = ( g0 << 4 ) | g0;
-    const auto cb0 = ( b0 << 4 ) | b0;
-
-    const auto cr1 = ( r1 << 4 ) | r1;
-    const auto cg1 = ( g1 << 4 ) | g1;
-    const auto cb1 = ( b1 << 4 ) | b1;
-
-    const auto codeword_hi = ( block >> 2 ) & 0x3;
-    const auto codeword_lo = block & 0x1;
-    const auto codeword = (codeword_hi << 1) | codeword_lo;
-
-    const int32_t base = alpha >> 56;
-    const int32_t mul = ( alpha >> 52 ) & 0xF;
-    const auto tbl = g_alpha[( alpha >> 48 ) & 0xF];
-
-    const auto c2r = clampu8( cr1 + table59T58H[codeword] );
-    const auto c2g = clampu8( cg1 + table59T58H[codeword] );
-    const auto c2b = clampu8( cb1 + table59T58H[codeword] );
-
-    const auto c3r = clampu8( cr1 - table59T58H[codeword] );
-    const auto c3g = clampu8( cg1 - table59T58H[codeword] );
-    const auto c3b = clampu8( cb1 - table59T58H[codeword] );
-
-    const uint32_t col_tab[4] = {
-        uint32_t(cr0 | ( cg0 << 8 ) | ( cb0 << 16 )),
-        uint32_t(c2r | ( c2g << 8 ) | ( c2b << 16 )),
-        uint32_t(cr1 | ( cg1 << 8 ) | ( cb1 << 16 )),
-        uint32_t(c3r | ( c3g << 8 ) | ( c3b << 16 ))
-    };
-
-    const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
-    for( uint8_t j = 0; j < 4; j++ )
-    {
-        for( uint8_t i = 0; i < 4; i++ )
-        {
-            //2bit indices distributed on two lane 16bit numbers
-            const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
-            const auto amod = tbl[( alpha >> ( 45 - j * 3 - i * 12 ) ) & 0x7];
-            const uint32_t a = clampu8( base + amod * mul );
-            dst[j * w + i] = col_tab[index] | ( a << 24 );
-        }
-    }
-}
-
-static etcpak_force_inline void DecodeH( uint64_t block, uint32_t* dst, uint32_t w )
-{
-    const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
-
-    const auto r0444 = ( block >> 27 ) & 0xF;
-    const auto g0444 = ( ( block >> 20 ) & 0x1 ) | ( ( ( block >> 24 ) & 0x7 ) << 1 );
-    const auto b0444 = ( ( block >> 15 ) & 0x7 ) | ( ( ( block >> 19 ) & 0x1 ) << 3 );
-
-    const auto r1444 = ( block >> 11 ) & 0xF;
-    const auto g1444 = ( block >> 7 ) & 0xF;
-    const auto b1444 = ( block >> 3 ) & 0xF;
-
-    const auto r0 = ( r0444 << 4 ) | r0444;
-    const auto g0 = ( g0444 << 4 ) | g0444;
-    const auto b0 = ( b0444 << 4 ) | b0444;
-
-    const auto r1 = ( r1444 << 4 ) | r1444;
-    const auto g1 = ( g1444 << 4 ) | g1444;
-    const auto b1 = ( b1444 << 4 ) | b1444;
-
-    const auto codeword_hi = ( ( block & 0x1 ) << 1 ) | ( ( block & 0x4 ) );
-    const auto c0 = ( r0444 << 8 ) | ( g0444 << 4 ) | ( b0444 << 0 );
-    const auto c1 = ( block >> 3 ) & ( ( 1 << 12 ) - 1 );
-    const auto codeword_lo = ( c0 >= c1 ) ? 1 : 0;
-    const auto codeword = codeword_hi | codeword_lo;
-
-    const uint32_t col_tab[] = {
-        uint32_t(clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 )),
-        uint32_t(clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 )),
-        uint32_t(clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 )),
-        uint32_t(clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 ))
-    };
-
-    for( uint8_t j = 0; j < 4; j++ )
-    {
-        for( uint8_t i = 0; i < 4; i++ )
-        {
-            const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
-            dst[j * w + i] = col_tab[index] | 0xFF000000;
-        }
-    }
-}
-
-static etcpak_force_inline void DecodeHAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
-{
-    const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
-
-    const auto r0444 = ( block >> 27 ) & 0xF;
-    const auto g0444 = ( ( block >> 20 ) & 0x1 ) | ( ( ( block >> 24 ) & 0x7 ) << 1 );
-    const auto b0444 = ( ( block >> 15 ) & 0x7 ) | ( ( ( block >> 19 ) & 0x1 ) << 3 );
-
-    const auto r1444 = ( block >> 11 ) & 0xF;
-    const auto g1444 = ( block >> 7 ) & 0xF;
-    const auto b1444 = ( block >> 3 ) & 0xF;
-
-    const auto r0 = ( r0444 << 4 ) | r0444;
-    const auto g0 = ( g0444 << 4 ) | g0444;
-    const auto b0 = ( b0444 << 4 ) | b0444;
-
-    const auto r1 = ( r1444 << 4 ) | r1444;
-    const auto g1 = ( g1444 << 4 ) | g1444;
-    const auto b1 = ( b1444 << 4 ) | b1444;
-
-    const auto codeword_hi = ( ( block & 0x1 ) << 1 ) | ( ( block & 0x4 ) );
-    const auto c0 = ( r0444 << 8 ) | ( g0444 << 4 ) | ( b0444 << 0 );
-    const auto c1 = ( block >> 3 ) & ( ( 1 << 12 ) - 1 );
-    const auto codeword_lo = ( c0 >= c1 ) ? 1 : 0;
-    const auto codeword = codeword_hi | codeword_lo;
-
-    const int32_t base = alpha >> 56;
-    const int32_t mul = ( alpha >> 52 ) & 0xF;
-    const auto tbl = g_alpha[(alpha >> 48) & 0xF];
-
-    const uint32_t col_tab[] = {
-        uint32_t(clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 )),
-        uint32_t(clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 )),
-        uint32_t(clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 )),
-        uint32_t(clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 ))
-    };
-
-    for( uint8_t j = 0; j < 4; j++ )
-    {
-        for( uint8_t i = 0; i < 4; i++ )
-        {
-            const uint8_t index = ( ( ( indexes >> ( j + i * 4 + 16 ) ) & 0x1 ) << 1 ) | ( ( indexes >> ( j + i * 4 ) ) & 0x1 );
-            const auto amod = tbl[( alpha >> ( 45 - j * 3 - i * 12) ) & 0x7];
-            const uint32_t a = clampu8( base + amod * mul );
-            dst[j * w + i] = col_tab[index] | ( a << 24 );
-        }
-    }
-}
-
-static etcpak_force_inline void DecodePlanar( uint64_t block, uint32_t* dst, uint32_t w )
-{
-    const auto bv = expand6((block >> ( 0 + 32)) & 0x3F);
-    const auto gv = expand7((block >> ( 6 + 32)) & 0x7F);
-    const auto rv = expand6((block >> (13 + 32)) & 0x3F);
-
-    const auto bh = expand6((block >> (19 + 32)) & 0x3F);
-    const auto gh = expand7((block >> (25 + 32)) & 0x7F);
-
-    const auto rh0 = (block >> (32 - 32)) & 0x01;
-    const auto rh1 = ((block >> (34 - 32)) & 0x1F) << 1;
-    const auto rh = expand6(rh0 | rh1);
-
-    const auto bo0 = (block >> (39 - 32)) & 0x07;
-    const auto bo1 = ((block >> (43 - 32)) & 0x3) << 3;
-    const auto bo2 = ((block >> (48 - 32)) & 0x1) << 5;
-    const auto bo = expand6(bo0 | bo1 | bo2);
-    const auto go0 = (block >> (49 - 32)) & 0x3F;
-    const auto go1 = ((block >> (56 - 32)) & 0x01) << 6;
-    const auto go = expand7(go0 | go1);
-    const auto ro = expand6((block >> (57 - 32)) & 0x3F);
-
-#ifdef __ARM_NEON
-    uint64_t init = uint64_t(uint16_t(rh-ro)) | ( uint64_t(uint16_t(gh-go)) << 16 ) | ( uint64_t(uint16_t(bh-bo)) << 32 );
-    int16x8_t chco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
-    init = uint64_t(uint16_t( (rv-ro) - 4 * (rh-ro) )) | ( uint64_t(uint16_t( (gv-go) - 4 * (gh-go) )) << 16 ) | ( uint64_t(uint16_t( (bv-bo) - 4 * (bh-bo) )) << 32 );
-    int16x8_t cvco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
-    init = uint64_t(4*ro+2) | ( uint64_t(4*go+2) << 16 ) | ( uint64_t(4*bo+2) << 32 ) | ( uint64_t(0xFFF) << 48 );
-    int16x8_t col = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
-
-    for( int j=0; j<4; j++ )
-    {
-        for( int i=0; i<4; i++ )
-        {
-            uint8x8_t c = vqshrun_n_s16( col, 2 );
-            vst1_lane_u32( dst+j*w+i, vreinterpret_u32_u8( c ), 0 );
-            col = vaddq_s16( col, chco );
-        }
-        col = vaddq_s16( col, cvco );
-    }
-#elif defined __AVX2__
-    const auto R0 = 4*ro+2;
-    const auto G0 = 4*go+2;
-    const auto B0 = 4*bo+2;
-    const auto RHO = rh-ro;
-    const auto GHO = gh-go;
-    const auto BHO = bh-bo;
-
-    __m256i cvco = _mm256_setr_epi16( rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0, rv - ro, gv - go, bv - bo, 0 );
-    __m256i col = _mm256_setr_epi16( R0, G0, B0, 0xFFF, R0+RHO, G0+GHO, B0+BHO, 0xFFF, R0+2*RHO, G0+2*GHO, B0+2*BHO, 0xFFF, R0+3*RHO, G0+3*GHO, B0+3*BHO, 0xFFF );
-
-    for( int j=0; j<4; j++ )
-    {
-        __m256i c = _mm256_srai_epi16( col, 2 );
-        __m128i s = _mm_packus_epi16( _mm256_castsi256_si128( c ), _mm256_extracti128_si256( c, 1 ) );
-        _mm_storeu_si128( (__m128i*)(dst+j*w), s );
-        col = _mm256_add_epi16( col, cvco );
-    }
-#elif defined __SSE4_1__
-    __m128i chco = _mm_setr_epi16( rh - ro, gh - go, bh - bo, 0, 0, 0, 0, 0 );
-    __m128i cvco = _mm_setr_epi16( (rv - ro) - 4 * (rh - ro), (gv - go) - 4 * (gh - go), (bv - bo) - 4 * (bh - bo), 0, 0, 0, 0, 0 );
-    __m128i col = _mm_setr_epi16( 4*ro+2, 4*go+2, 4*bo+2, 0xFFF, 0, 0, 0, 0 );
-
-    for( int j=0; j<4; j++ )
-    {
-        for( int i=0; i<4; i++ )
-        {
-            __m128i c = _mm_srai_epi16( col, 2 );
-            __m128i s = _mm_packus_epi16( c, c );
-            dst[j*w+i] = _mm_cvtsi128_si32( s );
-            col = _mm_add_epi16( col, chco );
-        }
-        col = _mm_add_epi16( col, cvco );
-    }
-#else
-    for( int j=0; j<4; j++ )
-    {
-        for( int i=0; i<4; i++ )
-        {
-            const uint32_t r = (i * (rh - ro) + j * (rv - ro) + 4 * ro + 2) >> 2;
-            const uint32_t g = (i * (gh - go) + j * (gv - go) + 4 * go + 2) >> 2;
-            const uint32_t b = (i * (bh - bo) + j * (bv - bo) + 4 * bo + 2) >> 2;
-            if( ( ( r | g | b ) & ~0xFF ) == 0 )
-            {
-                dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
-            }
-            else
-            {
-                const auto rc = clampu8( r );
-                const auto gc = clampu8( g );
-                const auto bc = clampu8( b );
-                dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
-            }
-        }
-    }
-#endif
-}
-
-static etcpak_force_inline void DecodePlanarAlpha( uint64_t block, uint64_t alpha, uint32_t* dst, uint32_t w )
-{
-    const auto bv = expand6((block >> ( 0 + 32)) & 0x3F);
-    const auto gv = expand7((block >> ( 6 + 32)) & 0x7F);
-    const auto rv = expand6((block >> (13 + 32)) & 0x3F);
-
-    const auto bh = expand6((block >> (19 + 32)) & 0x3F);
-    const auto gh = expand7((block >> (25 + 32)) & 0x7F);
-
-    const auto rh0 = (block >> (32 - 32)) & 0x01;
-    const auto rh1 = ((block >> (34 - 32)) & 0x1F) << 1;
-    const auto rh = expand6(rh0 | rh1);
-
-    const auto bo0 = (block >> (39 - 32)) & 0x07;
-    const auto bo1 = ((block >> (43 - 32)) & 0x3) << 3;
-    const auto bo2 = ((block >> (48 - 32)) & 0x1) << 5;
-    const auto bo = expand6(bo0 | bo1 | bo2);
-    const auto go0 = (block >> (49 - 32)) & 0x3F;
-    const auto go1 = ((block >> (56 - 32)) & 0x01) << 6;
-    const auto go = expand7(go0 | go1);
-    const auto ro = expand6((block >> (57 - 32)) & 0x3F);
-
-    const int32_t base = alpha >> 56;
-    const int32_t mul = ( alpha >> 52 ) & 0xF;
-    const auto tbl = g_alpha[( alpha >> 48 ) & 0xF];
-
-#ifdef __ARM_NEON
-    uint64_t init = uint64_t(uint16_t(rh-ro)) | ( uint64_t(uint16_t(gh-go)) << 16 ) | ( uint64_t(uint16_t(bh-bo)) << 32 );
-    int16x8_t chco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
-    init = uint64_t(uint16_t( (rv-ro) - 4 * (rh-ro) )) | ( uint64_t(uint16_t( (gv-go) - 4 * (gh-go) )) << 16 ) | ( uint64_t(uint16_t( (bv-bo) - 4 * (bh-bo) )) << 32 );
-    int16x8_t cvco = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
-    init = uint64_t(4*ro+2) | ( uint64_t(4*go+2) << 16 ) | ( uint64_t(4*bo+2) << 32 );
-    int16x8_t col = vreinterpretq_s16_u64( vdupq_n_u64( init ) );
-
-    for( int j=0; j<4; j++ )
-    {
-        for( int i=0; i<4; i++ )
-        {
-            const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
-            const uint32_t a = clampu8( base + amod * mul );
-            uint8x8_t c = vqshrun_n_s16( col, 2 );
-            dst[j*w+i] = vget_lane_u32( vreinterpret_u32_u8( c ), 0 ) | ( a << 24 );
-            col = vaddq_s16( col, chco );
-        }
-        col = vaddq_s16( col, cvco );
-    }
-#elif defined __SSE4_1__
-    __m128i chco = _mm_setr_epi16( rh - ro, gh - go, bh - bo, 0, 0, 0, 0, 0 );
-    __m128i cvco = _mm_setr_epi16( (rv - ro) - 4 * (rh - ro), (gv - go) - 4 * (gh - go), (bv - bo) - 4 * (bh - bo), 0, 0, 0, 0, 0 );
-    __m128i col = _mm_setr_epi16( 4*ro+2, 4*go+2, 4*bo+2, 0, 0, 0, 0, 0 );
-
-    for( int j=0; j<4; j++ )
-    {
-        for( int i=0; i<4; i++ )
-        {
-            const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
-            const uint32_t a = clampu8( base + amod * mul );
-            __m128i c = _mm_srai_epi16( col, 2 );
-            __m128i s = _mm_packus_epi16( c, c );
-            dst[j*w+i] = _mm_cvtsi128_si32( s ) | ( a << 24 );
-            col = _mm_add_epi16( col, chco );
-        }
-        col = _mm_add_epi16( col, cvco );
-    }
-#else
-    for (auto j = 0; j < 4; j++)
-    {
-        for (auto i = 0; i < 4; i++)
-        {
-            const uint32_t r = (i * (rh - ro) + j * (rv - ro) + 4 * ro + 2) >> 2;
-            const uint32_t g = (i * (gh - go) + j * (gv - go) + 4 * go + 2) >> 2;
-            const uint32_t b = (i * (bh - bo) + j * (bv - bo) + 4 * bo + 2) >> 2;
-            const auto amod = tbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
-            const uint32_t a = clampu8( base + amod * mul );
-            if( ( ( r | g | b ) & ~0xFF ) == 0 )
-            {
-                dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
-            }
-            else
-            {
-                const auto rc = clampu8( r );
-                const auto gc = clampu8( g );
-                const auto bc = clampu8( b );
-                dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
-            }
-        }
-    }
-#endif
-}
-
-}
-
-BitmapPtr BlockData::Decode()
-{
-    switch( m_type )
-    {
-    case Etc1:
-    case Etc2_RGB:
-        return DecodeRGB();
-    case Etc2_RGBA:
-        return DecodeRGBA();
-    case Dxt1:
-        return DecodeDxt1();
-    case Dxt5:
-        return DecodeDxt5();
-    default:
-        assert( false );
-        return nullptr;
-    }
-}
-
-static etcpak_force_inline uint64_t ConvertByteOrder( uint64_t d )
-{
-    uint32_t word[2];
-    memcpy( word, &d, 8 );
-    word[0] = _bswap( word[0] );
-    word[1] = _bswap( word[1] );
-    memcpy( &d, word, 8 );
-    return d;
-}
-
-static etcpak_force_inline void DecodeRGBPart( uint64_t d, uint32_t* dst, uint32_t w )
-{
-    d = ConvertByteOrder( d );
-
-    uint32_t br[2], bg[2], bb[2];
-
-    if( d & 0x2 )
-    {
-        int32_t dr, dg, db;
-
-        uint32_t r0 = ( d & 0xF8000000 ) >> 27;
-        uint32_t g0 = ( d & 0x00F80000 ) >> 19;
-        uint32_t b0 = ( d & 0x0000F800 ) >> 11;
-
-        dr = ( int32_t(d) << 5 ) >> 29;
-        dg = ( int32_t(d) << 13 ) >> 29;
-        db = ( int32_t(d) << 21 ) >> 29;
-
-        int32_t r1 = int32_t(r0) + dr;
-        int32_t g1 = int32_t(g0) + dg;
-        int32_t b1 = int32_t(b0) + db;
-
-        // T mode
-        if ( (r1 < 0) || (r1 > 31) )
-        {
-            DecodeT( d, dst, w );
-            return;
-        }
-
-        // H mode
-        if ((g1 < 0) || (g1 > 31))
-        {
-            DecodeH( d, dst, w );
-            return;
-        }
-
-        // P mode
-        if( (b1 < 0) || (b1 > 31) )
-        {
-            DecodePlanar( d, dst, w );
-            return;
-        }
-
-        br[0] = ( r0 << 3 ) | ( r0 >> 2 );
-        br[1] = ( r1 << 3 ) | ( r1 >> 2 );
-        bg[0] = ( g0 << 3 ) | ( g0 >> 2 );
-        bg[1] = ( g1 << 3 ) | ( g1 >> 2 );
-        bb[0] = ( b0 << 3 ) | ( b0 >> 2 );
-        bb[1] = ( b1 << 3 ) | ( b1 >> 2 );
-    }
-    else
-    {
-        br[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 );
-        br[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 );
-        bg[0] = ( ( d & 0x00F00000 ) >> 16 ) | ( ( d & 0x00F00000 ) >> 20 );
-        bg[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 );
-        bb[0] = ( ( d & 0x0000F000 ) >> 8  ) | ( ( d & 0x0000F000 ) >> 12 );
-        bb[1] = ( ( d & 0x00000F00 ) >> 4  ) | ( ( d & 0x00000F00 ) >> 8  );
-    }
-
-    unsigned int tcw[2];
-    tcw[0] = ( d & 0xE0 ) >> 5;
-    tcw[1] = ( d & 0x1C ) >> 2;
-
-    uint32_t b1 = ( d >> 32 ) & 0xFFFF;
-    uint32_t b2 = ( d >> 48 );
-
-    b1 = ( b1 | ( b1 << 8 ) ) & 0x00FF00FF;
-    b1 = ( b1 | ( b1 << 4 ) ) & 0x0F0F0F0F;
-    b1 = ( b1 | ( b1 << 2 ) ) & 0x33333333;
-    b1 = ( b1 | ( b1 << 1 ) ) & 0x55555555;
-
-    b2 = ( b2 | ( b2 << 8 ) ) & 0x00FF00FF;
-    b2 = ( b2 | ( b2 << 4 ) ) & 0x0F0F0F0F;
-    b2 = ( b2 | ( b2 << 2 ) ) & 0x33333333;
-    b2 = ( b2 | ( b2 << 1 ) ) & 0x55555555;
-
-    uint32_t idx = b1 | ( b2 << 1 );
-
-    if( d & 0x1 )
-    {
-        for( int i=0; i<4; i++ )
-        {
-            for( int j=0; j<4; j++ )
-            {
-                const auto mod = g_table[tcw[j/2]][idx & 0x3];
-                const auto r = br[j/2] + mod;
-                const auto g = bg[j/2] + mod;
-                const auto b = bb[j/2] + mod;
-                if( ( ( r | g | b ) & ~0xFF ) == 0 )
-                {
-                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
-                }
-                else
-                {
-                    const auto rc = clampu8( r );
-                    const auto gc = clampu8( g );
-                    const auto bc = clampu8( b );
-                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
-                }
-                idx >>= 2;
-            }
-        }
-    }
-    else
-    {
-        for( int i=0; i<4; i++ )
-        {
-            const auto tbl = g_table[tcw[i/2]];
-            const auto cr = br[i/2];
-            const auto cg = bg[i/2];
-            const auto cb = bb[i/2];
-
-            for( int j=0; j<4; j++ )
-            {
-                const auto mod = tbl[idx & 0x3];
-                const auto r = cr + mod;
-                const auto g = cg + mod;
-                const auto b = cb + mod;
-                if( ( ( r | g | b ) & ~0xFF ) == 0 )
-                {
-                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000;
-                }
-                else
-                {
-                    const auto rc = clampu8( r );
-                    const auto gc = clampu8( g );
-                    const auto bc = clampu8( b );
-                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | 0xFF000000;
-                }
-                idx >>= 2;
-            }
-        }
-    }
-}
-
-static etcpak_force_inline void DecodeRGBAPart( uint64_t d, uint64_t alpha, uint32_t* dst, uint32_t w )
-{
-    d = ConvertByteOrder( d );
-    alpha = _bswap64( alpha );
-
-    uint32_t br[2], bg[2], bb[2];
-
-    if( d & 0x2 )
-    {
-        int32_t dr, dg, db;
-
-        uint32_t r0 = ( d & 0xF8000000 ) >> 27;
-        uint32_t g0 = ( d & 0x00F80000 ) >> 19;
-        uint32_t b0 = ( d & 0x0000F800 ) >> 11;
-
-        dr = ( int32_t(d) << 5 ) >> 29;
-        dg = ( int32_t(d) << 13 ) >> 29;
-        db = ( int32_t(d) << 21 ) >> 29;
-
-        int32_t r1 = int32_t(r0) + dr;
-        int32_t g1 = int32_t(g0) + dg;
-        int32_t b1 = int32_t(b0) + db;
-
-        // T mode
-        if ( (r1 < 0) || (r1 > 31) )
-        {
-            DecodeTAlpha( d, alpha, dst, w );
-            return;
-        }
-
-        // H mode
-        if ( (g1 < 0) || (g1 > 31) )
-        {
-            DecodeHAlpha( d, alpha, dst, w );
-            return;
-        }
-
-        // P mode
-        if ( (b1 < 0) || (b1 > 31) )
-        {
-            DecodePlanarAlpha( d, alpha, dst, w );
-            return;
-        }
-
-        br[0] = ( r0 << 3 ) | ( r0 >> 2 );
-        br[1] = ( r1 << 3 ) | ( r1 >> 2 );
-        bg[0] = ( g0 << 3 ) | ( g0 >> 2 );
-        bg[1] = ( g1 << 3 ) | ( g1 >> 2 );
-        bb[0] = ( b0 << 3 ) | ( b0 >> 2 );
-        bb[1] = ( b1 << 3 ) | ( b1 >> 2 );
-    }
-    else
-    {
-        br[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 );
-        br[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 );
-        bg[0] = ( ( d & 0x00F00000 ) >> 16 ) | ( ( d & 0x00F00000 ) >> 20 );
-        bg[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 );
-        bb[0] = ( ( d & 0x0000F000 ) >> 8  ) | ( ( d & 0x0000F000 ) >> 12 );
-        bb[1] = ( ( d & 0x00000F00 ) >> 4  ) | ( ( d & 0x00000F00 ) >> 8  );
-    }
-
-    unsigned int tcw[2];
-    tcw[0] = ( d & 0xE0 ) >> 5;
-    tcw[1] = ( d & 0x1C ) >> 2;
-
-    uint32_t b1 = ( d >> 32 ) & 0xFFFF;
-    uint32_t b2 = ( d >> 48 );
-
-    b1 = ( b1 | ( b1 << 8 ) ) & 0x00FF00FF;
-    b1 = ( b1 | ( b1 << 4 ) ) & 0x0F0F0F0F;
-    b1 = ( b1 | ( b1 << 2 ) ) & 0x33333333;
-    b1 = ( b1 | ( b1 << 1 ) ) & 0x55555555;
-
-    b2 = ( b2 | ( b2 << 8 ) ) & 0x00FF00FF;
-    b2 = ( b2 | ( b2 << 4 ) ) & 0x0F0F0F0F;
-    b2 = ( b2 | ( b2 << 2 ) ) & 0x33333333;
-    b2 = ( b2 | ( b2 << 1 ) ) & 0x55555555;
-
-    uint32_t idx = b1 | ( b2 << 1 );
-
-    const int32_t base = alpha >> 56;
-    const int32_t mul = ( alpha >> 52 ) & 0xF;
-    const auto atbl = g_alpha[( alpha >> 48 ) & 0xF];
-
-    if( d & 0x1 )
-    {
-        for( int i=0; i<4; i++ )
-        {
-            for( int j=0; j<4; j++ )
-            {
-                const auto mod = g_table[tcw[j/2]][idx & 0x3];
-                const auto r = br[j/2] + mod;
-                const auto g = bg[j/2] + mod;
-                const auto b = bb[j/2] + mod;
-                const auto amod = atbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
-                const uint32_t a = clampu8( base + amod * mul );
-                if( ( ( r | g | b ) & ~0xFF ) == 0 )
-                {
-                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
-                }
-                else
-                {
-                    const auto rc = clampu8( r );
-                    const auto gc = clampu8( g );
-                    const auto bc = clampu8( b );
-                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
-                }
-                idx >>= 2;
-            }
-        }
-    }
-    else
-    {
-        for( int i=0; i<4; i++ )
-        {
-            const auto tbl = g_table[tcw[i/2]];
-            const auto cr = br[i/2];
-            const auto cg = bg[i/2];
-            const auto cb = bb[i/2];
-
-            for( int j=0; j<4; j++ )
-            {
-                const auto mod = tbl[idx & 0x3];
-                const auto r = cr + mod;
-                const auto g = cg + mod;
-                const auto b = cb + mod;
-                const auto amod = atbl[(alpha >> ( 45 - j*3 - i*12 )) & 0x7];
-                const uint32_t a = clampu8( base + amod * mul );
-                if( ( ( r | g | b ) & ~0xFF ) == 0 )
-                {
-                    dst[j*w+i] = r | ( g << 8 ) | ( b << 16 ) | ( a << 24 );
-                }
-                else
-                {
-                    const auto rc = clampu8( r );
-                    const auto gc = clampu8( g );
-                    const auto bc = clampu8( b );
-                    dst[j*w+i] = rc | ( gc << 8 ) | ( bc << 16 ) | ( a << 24 );
-                }
-                idx >>= 2;
-            }
-        }
-    }
-}
-
-BitmapPtr BlockData::DecodeRGB()
-{
-    auto ret = std::make_shared<Bitmap>( m_size );
-
-    const uint64_t* src = (const uint64_t*)( m_data + m_dataOffset );
-    uint32_t* dst = ret->Data();
-
-    for( int y=0; y<m_size.y/4; y++ )
-    {
-        for( int x=0; x<m_size.x/4; x++ )
-        {
-            uint64_t d = *src++;
-            DecodeRGBPart( d, dst, m_size.x );
-            dst += 4;
-        }
-        dst += m_size.x*3;
-    }
-
-    return ret;
-}
-
-BitmapPtr BlockData::DecodeRGBA()
-{
-    auto ret = std::make_shared<Bitmap>( m_size );
-
-    const uint64_t* src = (const uint64_t*)( m_data + m_dataOffset );
-    uint32_t* dst = ret->Data();
-
-    for( int y=0; y<m_size.y/4; y++ )
-    {
-        for( int x=0; x<m_size.x/4; x++ )
-        {
-            uint64_t a = *src++;
-            uint64_t d = *src++;
-            DecodeRGBAPart( d, a, dst, m_size.x );
-            dst += 4;
-        }
-        dst += m_size.x*3;
-    }
-
-    return ret;
-}
-
-static etcpak_force_inline void DecodeDxt1Part( uint64_t d, uint32_t* dst, uint32_t w )
-{
-    uint8_t* in = (uint8_t*)&d;
-    uint16_t c0, c1;
-    uint32_t idx;
-    memcpy( &c0, in, 2 );
-    memcpy( &c1, in+2, 2 );
-    memcpy( &idx, in+4, 4 );
-
-    uint8_t r0 = ( ( c0 & 0xF800 ) >> 8 ) | ( ( c0 & 0xF800 ) >> 13 );
-    uint8_t g0 = ( ( c0 & 0x07E0 ) >> 3 ) | ( ( c0 & 0x07E0 ) >> 9 );
-    uint8_t b0 = ( ( c0 & 0x001F ) << 3 ) | ( ( c0 & 0x001F ) >> 2 );
-
-    uint8_t r1 = ( ( c1 & 0xF800 ) >> 8 ) | ( ( c1 & 0xF800 ) >> 13 );
-    uint8_t g1 = ( ( c1 & 0x07E0 ) >> 3 ) | ( ( c1 & 0x07E0 ) >> 9 );
-    uint8_t b1 = ( ( c1 & 0x001F ) << 3 ) | ( ( c1 & 0x001F ) >> 2 );
-
-    uint32_t dict[4];
-
-    dict[0] = 0xFF000000 | ( b0 << 16 ) | ( g0 << 8 ) | r0;
-    dict[1] = 0xFF000000 | ( b1 << 16 ) | ( g1 << 8 ) | r1;
-
-    uint32_t r, g, b;
-    if( c0 > c1 )
-    {
-        r = (2*r0+r1)/3;
-        g = (2*g0+g1)/3;
-        b = (2*b0+b1)/3;
-        dict[2] = 0xFF000000 | ( b << 16 ) | ( g << 8 ) | r;
-        r = (2*r1+r0)/3;
-        g = (2*g1+g0)/3;
-        b = (2*b1+b0)/3;
-        dict[3] = 0xFF000000 | ( b << 16 ) | ( g << 8 ) | r;
-    }
-    else
-    {
-        r = (int(r0)+r1)/2;
-        g = (int(g0)+g1)/2;
-        b = (int(b0)+b1)/2;
-        dict[2] = 0xFF000000 | ( b << 16 ) | ( g << 8 ) | r;
-        dict[3] = 0xFF000000;
-    }
-
-    memcpy( dst+0, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+1, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+2, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+3, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    dst += w;
-
-    memcpy( dst+0, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+1, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+2, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+3, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    dst += w;
-
-    memcpy( dst+0, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+1, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+2, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+3, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    dst += w;
-
-    memcpy( dst+0, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+1, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+2, dict + (idx & 0x3), 4 );
-    idx >>= 2;
-    memcpy( dst+3, dict + (idx & 0x3), 4 );
-}
-
-static etcpak_force_inline void DecodeDxt5Part( uint64_t a, uint64_t d, uint32_t* dst, uint32_t w )
-{
-    uint8_t* ain = (uint8_t*)&a;
-    uint8_t a0, a1;
-    uint64_t aidx = 0;
-    memcpy( &a0, ain, 1 );
-    memcpy( &a1, ain+1, 1 );
-    memcpy( &aidx, ain+2, 6 );
-
-    uint8_t* in = (uint8_t*)&d;
-    uint16_t c0, c1;
-    uint32_t idx;
-    memcpy( &c0, in, 2 );
-    memcpy( &c1, in+2, 2 );
-    memcpy( &idx, in+4, 4 );
-
-    uint32_t adict[8];
-    adict[0] = a0 << 24;
-    adict[1] = a1 << 24;
-    if( a0 > a1 )
-    {
-        adict[2] = ( (6*a0+1*a1)/7 ) << 24;
-        adict[3] = ( (5*a0+2*a1)/7 ) << 24;
-        adict[4] = ( (4*a0+3*a1)/7 ) << 24;
-        adict[5] = ( (3*a0+4*a1)/7 ) << 24;
-        adict[6] = ( (2*a0+5*a1)/7 ) << 24;
-        adict[7] = ( (1*a0+6*a1)/7 ) << 24;
-    }
-    else
-    {
-        adict[2] = ( (4*a0+1*a1)/5 ) << 24;
-        adict[3] = ( (3*a0+2*a1)/5 ) << 24;
-        adict[4] = ( (2*a0+3*a1)/5 ) << 24;
-        adict[5] = ( (1*a0+4*a1)/5 ) << 24;
-        adict[6] = 0;
-        adict[7] = 0xFF000000;
-    }
-
-    uint8_t r0 = ( ( c0 & 0xF800 ) >> 8 ) | ( ( c0 & 0xF800 ) >> 13 );
-    uint8_t g0 = ( ( c0 & 0x07E0 ) >> 3 ) | ( ( c0 & 0x07E0 ) >> 9 );
-    uint8_t b0 = ( ( c0 & 0x001F ) << 3 ) | ( ( c0 & 0x001F ) >> 2 );
-
-    uint8_t r1 = ( ( c1 & 0xF800 ) >> 8 ) | ( ( c1 & 0xF800 ) >> 13 );
-    uint8_t g1 = ( ( c1 & 0x07E0 ) >> 3 ) | ( ( c1 & 0x07E0 ) >> 9 );
-    uint8_t b1 = ( ( c1 & 0x001F ) << 3 ) | ( ( c1 & 0x001F ) >> 2 );
-
-    uint32_t dict[4];
-
-    dict[0] = ( b0 << 16 ) | ( g0 << 8 ) | r0;
-    dict[1] = ( b1 << 16 ) | ( g1 << 8 ) | r1;
-
-    uint32_t r, g, b;
-    if( c0 > c1 )
-    {
-        r = (2*r0+r1)/3;
-        g = (2*g0+g1)/3;
-        b = (2*b0+b1)/3;
-        dict[2] = ( b << 16 ) | ( g << 8 ) | r;
-        r = (2*r1+r0)/3;
-        g = (2*g1+g0)/3;
-        b = (2*b1+b0)/3;
-        dict[3] = ( b << 16 ) | ( g << 8 ) | r;
-    }
-    else
-    {
-        r = (int(r0)+r1)/2;
-        g = (int(g0)+g1)/2;
-        b = (int(b0)+b1)/2;
-        dict[2] = ( b << 16 ) | ( g << 8 ) | r;
-        dict[3] = 0;
-    }
-
-    dst[0] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[1] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[2] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[3] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst += w;
-
-    dst[0] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[1] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[2] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[3] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst += w;
-
-    dst[0] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[1] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[2] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[3] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst += w;
-
-    dst[0] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[1] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[2] = dict[idx & 0x3] | adict[aidx & 0x7];
-    idx >>= 2;
-    aidx >>= 3;
-    dst[3] = dict[idx & 0x3] | adict[aidx & 0x7];
-}
-
-BitmapPtr BlockData::DecodeDxt1()
-{
-    auto ret = std::make_shared<Bitmap>( m_size );
-
-    const uint64_t* src = (const uint64_t*)( m_data + m_dataOffset );
-    uint32_t* dst = ret->Data();
-
-    for( int y=0; y<m_size.y/4; y++ )
-    {
-        for( int x=0; x<m_size.x/4; x++ )
-        {
-            uint64_t d = *src++;
-            DecodeDxt1Part( d, dst, m_size.x );
-            dst += 4;
-        }
-        dst += m_size.x*3;
-    }
-
-    return ret;
-}
-
-BitmapPtr BlockData::DecodeDxt5()
-{
-    auto ret = std::make_shared<Bitmap>( m_size );
-
-    const uint64_t* src = (const uint64_t*)( m_data + m_dataOffset );
-    uint32_t* dst = ret->Data();
-
-    for( int y=0; y<m_size.y/4; y++ )
-    {
-        for( int x=0; x<m_size.x/4; x++ )
-        {
-            uint64_t a = *src++;
-            uint64_t d = *src++;
-            DecodeDxt5Part( a, d, dst, m_size.x );
-            dst += 4;
-        }
-        dst += m_size.x*3;
-    }
-
-    return ret;
-}
diff --git a/thirdparty/etcpak/BlockData.hpp b/thirdparty/etcpak/BlockData.hpp
deleted file mode 100644
index 209e35b4e6..0000000000
--- a/thirdparty/etcpak/BlockData.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-#ifndef __BLOCKDATA_HPP__
-#define __BLOCKDATA_HPP__
-
-#include <condition_variable>
-#include <future>
-#include <memory>
-#include <mutex>
-#include <stdint.h>
-#include <stdio.h>
-#include <vector>
-
-#include "Bitmap.hpp"
-#include "ForceInline.hpp"
-#include "Vector.hpp"
-
-class BlockData
-{
-public:
-    enum Type
-    {
-        Etc1,
-        Etc2_RGB,
-        Etc2_RGBA,
-        Dxt1,
-        Dxt5
-    };
-
-    BlockData( const char* fn );
-    BlockData( const char* fn, const v2i& size, bool mipmap, Type type );
-    BlockData( const v2i& size, bool mipmap, Type type );
-    ~BlockData();
-
-    BitmapPtr Decode();
-
-    void Process( const uint32_t* src, uint32_t blocks, size_t offset, size_t width, Channels type, bool dither );
-    void ProcessRGBA( const uint32_t* src, uint32_t blocks, size_t offset, size_t width );
-
-    const v2i& Size() const { return m_size; }
-
-private:
-    etcpak_no_inline BitmapPtr DecodeRGB();
-    etcpak_no_inline BitmapPtr DecodeRGBA();
-    etcpak_no_inline BitmapPtr DecodeDxt1();
-    etcpak_no_inline BitmapPtr DecodeDxt5();
-
-    uint8_t* m_data;
-    v2i m_size;
-    size_t m_dataOffset;
-    FILE* m_file;
-    size_t m_maplen;
-    Type m_type;
-};
-
-typedef std::shared_ptr<BlockData> BlockDataPtr;
-
-#endif
diff --git a/thirdparty/etcpak/ColorSpace.cpp b/thirdparty/etcpak/ColorSpace.cpp
deleted file mode 100644
index 0411541066..0000000000
--- a/thirdparty/etcpak/ColorSpace.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-#include <math.h>
-#include <stdint.h>
-
-#include "Math.hpp"
-#include "ColorSpace.hpp"
-
-namespace Color
-{
-
-    static const XYZ white( v3b( 255, 255, 255 ) );
-    static const v3f rwhite( 1.f / white.x, 1.f / white.y, 1.f / white.z );
-
-
-    XYZ::XYZ( float _x, float _y, float _z )
-        : x( _x )
-        , y( _y )
-        , z( _z )
-    {
-    }
-
-    XYZ::XYZ( const v3b& rgb )
-    {
-        const float r = rgb.x / 255.f;
-        const float g = rgb.y / 255.f;
-        const float b = rgb.z / 255.f;
-
-        const float rl = sRGB2linear( r );
-        const float gl = sRGB2linear( g );
-        const float bl = sRGB2linear( b );
-
-        x = 0.4124f * rl + 0.3576f * gl + 0.1805f * bl;
-        y = 0.2126f * rl + 0.7152f * gl + 0.0722f * bl;
-        z = 0.0193f * rl + 0.1192f * gl + 0.9505f * bl;
-    }
-
-    static float revlab( float t )
-    {
-        const float p1 = 6.f/29.f;
-        const float p2 = 4.f/29.f;
-
-        if( t > p1 )
-        {
-            return t*t*t;
-        }
-        else
-        {
-            return 3 * sq( p1 ) * ( t - p2 );
-        }
-    }
-
-    XYZ::XYZ( const Lab& lab )
-    {
-        y = white.y * revlab( 1.f/116.f * ( lab.L + 16 ) );
-        x = white.x * revlab( 1.f/116.f * ( lab.L + 16 ) + 1.f/500.f * lab.a );
-        z = white.z * revlab( 1.f/116.f * ( lab.L + 16 ) - 1.f/200.f * lab.b );
-    }
-
-    v3i XYZ::RGB() const
-    {
-        const float rl =  3.2406f * x - 1.5372f * y - 0.4986f * z;
-        const float gl = -0.9689f * x + 1.8758f * y + 0.0415f * z;
-        const float bl =  0.0557f * x - 0.2040f * y + 1.0570f * z;
-
-        const float r = linear2sRGB( rl );
-        const float g = linear2sRGB( gl );
-        const float b = linear2sRGB( bl );
-
-        return v3i( clampu8( int32_t( r * 255 ) ), clampu8( int32_t( g * 255 ) ), clampu8( int32_t( b * 255 ) ) );
-    }
-
-
-    Lab::Lab()
-        : L( 0 )
-        , a( 0 )
-        , b( 0 )
-    {
-    }
-
-    Lab::Lab( float L, float a, float b )
-        : L( L )
-        , a( a )
-        , b( b )
-    {
-    }
-
-    static float labfunc( float t )
-    {
-        const float p1 = (6.f/29.f)*(6.f/29.f)*(6.f/29.f);
-        const float p2 = (1.f/3.f)*(29.f/6.f)*(29.f/6.f);
-        const float p3 = (4.f/29.f);
-
-        if( t > p1 )
-        {
-            return pow( t, 1.f/3.f );
-        }
-        else
-        {
-            return p2 * t + p3;
-        }
-    }
-
-    Lab::Lab( const XYZ& xyz )
-    {
-        L = 116 * labfunc( xyz.y * rwhite.y ) - 16;
-        a = 500 * ( labfunc( xyz.x * rwhite.x ) - labfunc( xyz.y * rwhite.y ) );
-        b = 200 * ( labfunc( xyz.y * rwhite.y ) - labfunc( xyz.z * rwhite.z ) );
-    }
-
-    Lab::Lab( const v3b& rgb )
-    {
-        new(this) Lab( XYZ( rgb ) );
-    }
-
-}
diff --git a/thirdparty/etcpak/ColorSpace.hpp b/thirdparty/etcpak/ColorSpace.hpp
deleted file mode 100644
index c9d0a9cf3f..0000000000
--- a/thirdparty/etcpak/ColorSpace.hpp
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef __DARKRL__COLORSPACE_HPP__
-#define __DARKRL__COLORSPACE_HPP__
-
-#include "Vector.hpp"
-
-namespace Color
-{
-
-    class Lab;
-
-    class XYZ
-    {
-    public:
-        XYZ( float x, float y, float z );
-        XYZ( const v3b& rgb );
-        XYZ( const Lab& lab );
-
-        v3i RGB() const;
-
-        float x, y, z;
-    };
-
-    class Lab
-    {
-    public:
-        Lab();
-        Lab( float L, float a, float b );
-        Lab( const XYZ& xyz );
-        Lab( const v3b& rgb );
-
-        float L, a, b;
-    };
-
-}
-
-#endif
diff --git a/thirdparty/etcpak/DataProvider.cpp b/thirdparty/etcpak/DataProvider.cpp
deleted file mode 100644
index 6bd4b105ed..0000000000
--- a/thirdparty/etcpak/DataProvider.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <assert.h>
-#include <utility>
-
-#include "BitmapDownsampled.hpp"
-#include "DataProvider.hpp"
-#include "MipMap.hpp"
-
-DataProvider::DataProvider( const char* fn, bool mipmap, bool bgr )
-    : m_offset( 0 )
-    , m_mipmap( mipmap )
-    , m_done( false )
-    , m_lines( 32 )
-{
-    m_bmp.emplace_back( new Bitmap( fn, m_lines, bgr ) );
-    m_current = m_bmp[0].get();
-}
-
-DataProvider::~DataProvider()
-{
-}
-
-unsigned int DataProvider::NumberOfParts() const
-{
-    unsigned int parts = ( ( m_bmp[0]->Size().y / 4 ) + m_lines - 1 ) / m_lines;
-
-    if( m_mipmap )
-    {
-        v2i current = m_bmp[0]->Size();
-        int levels = NumberOfMipLevels( current );
-        unsigned int lines = m_lines;
-        for( int i=1; i<levels; i++ )
-        {
-            assert( current.x != 1 || current.y != 1 );
-            current.x = std::max( 1, current.x / 2 );
-            current.y = std::max( 1, current.y / 2 );
-            lines *= 2;
-            parts += ( ( std::max( 4, current.y ) / 4 ) + lines - 1 ) / lines;
-        }
-        assert( current.x == 1 && current.y == 1 );
-    }
-
-    return parts;
-}
-
-DataPart DataProvider::NextPart()
-{
-    assert( !m_done );
-
-    unsigned int lines = m_lines;
-    bool done;
-
-    const auto ptr = m_current->NextBlock( lines, done );
-    DataPart ret = {
-        ptr,
-        std::max<unsigned int>( 4, m_current->Size().x ),
-        lines,
-        m_offset
-    };
-
-    m_offset += m_current->Size().x / 4 * lines;
-
-    if( done )
-    {
-        if( m_mipmap && ( m_current->Size().x != 1 || m_current->Size().y != 1 ) )
-        {
-            m_lines *= 2;
-            m_bmp.emplace_back( new BitmapDownsampled( *m_current, m_lines ) );
-            m_current = m_bmp[m_bmp.size()-1].get();
-        }
-        else
-        {
-            m_done = true;
-        }
-    }
-
-    return ret;
-}
diff --git a/thirdparty/etcpak/DataProvider.hpp b/thirdparty/etcpak/DataProvider.hpp
deleted file mode 100644
index e773801ed6..0000000000
--- a/thirdparty/etcpak/DataProvider.hpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef __DATAPROVIDER_HPP__
-#define __DATAPROVIDER_HPP__
-
-#include <memory>
-#include <stdint.h>
-#include <vector>
-
-#include "Bitmap.hpp"
-
-struct DataPart
-{
-    const uint32_t* src;
-    unsigned int width;
-    unsigned int lines;
-    unsigned int offset;
-};
-
-class DataProvider
-{
-public:
-    DataProvider( const char* fn, bool mipmap, bool bgr );
-    ~DataProvider();
-
-    unsigned int NumberOfParts() const;
-
-    DataPart NextPart();
-
-    bool Alpha() const { return m_bmp[0]->Alpha(); }
-    const v2i& Size() const { return m_bmp[0]->Size(); }
-    const Bitmap& ImageData() const { return *m_bmp[0]; }
-
-private:
-    std::vector<std::unique_ptr<Bitmap>> m_bmp;
-    Bitmap* m_current;
-    unsigned int m_offset;
-    unsigned int m_lines;
-    bool m_mipmap;
-    bool m_done;
-};
-
-#endif
diff --git a/thirdparty/etcpak/Debug.cpp b/thirdparty/etcpak/Debug.cpp
deleted file mode 100644
index 72dc4e0526..0000000000
--- a/thirdparty/etcpak/Debug.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <algorithm>
-#include <vector>
-#include "Debug.hpp"
-
-static std::vector<DebugLog::Callback*> s_callbacks;
-
-void DebugLog::Message( const char* msg )
-{
-    for( auto it = s_callbacks.begin(); it != s_callbacks.end(); ++it )
-    {
-        (*it)->OnDebugMessage( msg );
-    }
-}
-
-void DebugLog::AddCallback( Callback* c )
-{
-    const auto it = std::find( s_callbacks.begin(), s_callbacks.end(), c );
-    if( it == s_callbacks.end() )
-    {
-        s_callbacks.push_back( c );
-    }
-}
-
-void DebugLog::RemoveCallback( Callback* c )
-{
-    const auto it = std::find( s_callbacks.begin(), s_callbacks.end(), c );
-    if( it != s_callbacks.end() )
-    {
-        s_callbacks.erase( it );
-    }
-}
diff --git a/thirdparty/etcpak/Debug.hpp b/thirdparty/etcpak/Debug.hpp
deleted file mode 100644
index 524eaa7baf..0000000000
--- a/thirdparty/etcpak/Debug.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef __DARKRL__DEBUG_HPP__
-#define __DARKRL__DEBUG_HPP__
-
-#ifdef DEBUG
-#  include <sstream>
-#  define DBGPRINT(msg) { std::stringstream __buf; __buf << msg; DebugLog::Message( __buf.str().c_str() ); }
-#else
-#  define DBGPRINT(msg) ((void)0)
-#endif
-
-class DebugLog
-{
-public:
-    struct Callback
-    {
-        virtual void OnDebugMessage( const char* msg ) = 0;
-    };
-
-    static void Message( const char* msg );
-    static void AddCallback( Callback* c );
-    static void RemoveCallback( Callback* c );
-
-private:
-    DebugLog() {}
-};
-
-#endif
diff --git a/thirdparty/etcpak/Error.cpp b/thirdparty/etcpak/Error.cpp
deleted file mode 100644
index 014ecdab66..0000000000
--- a/thirdparty/etcpak/Error.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#include <stdint.h>
-
-#include "Error.hpp"
-#include "Math.hpp"
-
-float CalcMSE3( const Bitmap& bmp, const Bitmap& out )
-{
-    float err = 0;
-
-    const uint32_t* p1 = bmp.Data();
-    const uint32_t* p2 = out.Data();
-    size_t cnt = bmp.Size().x * bmp.Size().y;
-
-    for( size_t i=0; i<cnt; i++ )
-    {
-        uint32_t c1 = *p1++;
-        uint32_t c2 = *p2++;
-
-        err += sq( ( c1 & 0x000000FF ) - ( c2 & 0x000000FF ) );
-        err += sq( ( ( c1 & 0x0000FF00 ) >> 8 ) - ( ( c2 & 0x0000FF00 ) >> 8 ) );
-        err += sq( ( ( c1 & 0x00FF0000 ) >> 16 ) - ( ( c2 & 0x00FF0000 ) >> 16 ) );
-    }
-
-    err /= cnt * 3;
-
-    return err;
-}
-
-float CalcMSE1( const Bitmap& bmp, const Bitmap& out )
-{
-    float err = 0;
-
-    const uint32_t* p1 = bmp.Data();
-    const uint32_t* p2 = out.Data();
-    size_t cnt = bmp.Size().x * bmp.Size().y;
-
-    for( size_t i=0; i<cnt; i++ )
-    {
-        uint32_t c1 = *p1++;
-        uint32_t c2 = *p2++;
-
-        err += sq( ( c1 >> 24 ) - ( c2 & 0xFF ) );
-    }
-
-    err /= cnt;
-
-    return err;
-}
diff --git a/thirdparty/etcpak/Error.hpp b/thirdparty/etcpak/Error.hpp
deleted file mode 100644
index 9817754b74..0000000000
--- a/thirdparty/etcpak/Error.hpp
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ERROR_HPP__
-#define __ERROR_HPP__
-
-#include "Bitmap.hpp"
-
-float CalcMSE3( const Bitmap& bmp, const Bitmap& out );
-float CalcMSE1( const Bitmap& bmp, const Bitmap& out );
-
-#endif
diff --git a/thirdparty/etcpak/MipMap.hpp b/thirdparty/etcpak/MipMap.hpp
deleted file mode 100644
index d3b4bc9e7c..0000000000
--- a/thirdparty/etcpak/MipMap.hpp
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __MIPMAP_HPP__
-#define __MIPMAP_HPP__
-
-#include "Vector.hpp"
-
-inline int NumberOfMipLevels( const v2i& size )
-{
-    return (int)floor( log2( std::max( size.x, size.y ) ) ) + 1;
-}
-
-#endif
diff --git a/thirdparty/etcpak/Semaphore.hpp b/thirdparty/etcpak/Semaphore.hpp
deleted file mode 100644
index 9e42dbb9e0..0000000000
--- a/thirdparty/etcpak/Semaphore.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef __DARKRL__SEMAPHORE_HPP__
-#define __DARKRL__SEMAPHORE_HPP__
-
-#include <condition_variable>
-#include <mutex>
-
-class Semaphore
-{
-public:
-    Semaphore( int count ) : m_count( count ) {}
-
-    void lock()
-    {
-        std::unique_lock<std::mutex> lock( m_mutex );
-        m_cv.wait( lock, [this](){ return m_count != 0; } );
-        m_count--;
-    }
-
-    void unlock()
-    {
-        std::lock_guard<std::mutex> lock( m_mutex );
-        m_count++;
-        m_cv.notify_one();
-    }
-
-    bool try_lock()
-    {
-        std::lock_guard<std::mutex> lock( m_mutex );
-        if( m_count == 0 )
-        {
-            return false;
-        }
-        else
-        {
-            m_count--;
-            return true;
-        }
-    }
-
-private:
-    std::mutex m_mutex;
-    std::condition_variable m_cv;
-    unsigned int m_count;
-};
-
-#endif
diff --git a/thirdparty/etcpak/System.cpp b/thirdparty/etcpak/System.cpp
deleted file mode 100644
index 041f2676e8..0000000000
--- a/thirdparty/etcpak/System.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-#include <algorithm>
-#ifdef _WIN32
-#  include <windows.h>
-#else
-#  include <unistd.h>
-#endif
-
-#include "System.hpp"
-
-unsigned int System::CPUCores()
-{
-    static unsigned int cores = 0;
-    if( cores == 0 )
-    {
-        int tmp;
-#ifdef _WIN32
-        SYSTEM_INFO info;
-        GetSystemInfo( &info );
-        tmp = (int)info.dwNumberOfProcessors;
-#else
-#  ifndef _SC_NPROCESSORS_ONLN
-#    ifdef _SC_NPROC_ONLN
-#      define _SC_NPROCESSORS_ONLN _SC_NPROC_ONLN
-#    elif defined _SC_CRAY_NCPU
-#      define _SC_NPROCESSORS_ONLN _SC_CRAY_NCPU
-#    endif
-#  endif
-        tmp = (int)(long)sysconf( _SC_NPROCESSORS_ONLN );
-#endif
-        cores = (unsigned int)std::max( tmp, 1 );
-    }
-    return cores;
-}
-
-void System::SetThreadName( std::thread& thread, const char* name )
-{
-#ifdef _MSC_VER
-    const DWORD MS_VC_EXCEPTION=0x406D1388;
-
-#  pragma pack( push, 8 )
-    struct THREADNAME_INFO
-    {
-       DWORD dwType;
-       LPCSTR szName;
-       DWORD dwThreadID;
-       DWORD dwFlags;
-    };
-#  pragma pack(pop)
-
-    DWORD ThreadId = GetThreadId( static_cast<HANDLE>( thread.native_handle() ) );
-    THREADNAME_INFO info;
-    info.dwType = 0x1000;
-    info.szName = name;
-    info.dwThreadID = ThreadId;
-    info.dwFlags = 0;
-
-    __try
-    {
-       RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info );
-    }
-    __except(EXCEPTION_EXECUTE_HANDLER)
-    {
-    }
-#endif
-}
diff --git a/thirdparty/etcpak/System.hpp b/thirdparty/etcpak/System.hpp
deleted file mode 100644
index 1a09bb15e1..0000000000
--- a/thirdparty/etcpak/System.hpp
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef __DARKRL__SYSTEM_HPP__
-#define __DARKRL__SYSTEM_HPP__
-
-#include <thread>
-
-class System
-{
-public:
-    System() = delete;
-
-    static unsigned int CPUCores();
-    static void SetThreadName( std::thread& thread, const char* name );
-};
-
-#endif
diff --git a/thirdparty/etcpak/TaskDispatch.cpp b/thirdparty/etcpak/TaskDispatch.cpp
deleted file mode 100644
index b1ba17953b..0000000000
--- a/thirdparty/etcpak/TaskDispatch.cpp
+++ /dev/null
@@ -1,122 +0,0 @@
-#include <assert.h>
-#include <stdio.h>
-#ifndef _MSC_VER
-#include <pthread.h>
-#endif
-
-#include "Debug.hpp"
-#include "System.hpp"
-#include "TaskDispatch.hpp"
-
-static TaskDispatch* s_instance = nullptr;
-
-TaskDispatch::TaskDispatch( size_t workers )
-    : m_exit( false )
-    , m_jobs( 0 )
-{
-    assert( !s_instance );
-    s_instance = this;
-
-    assert( workers >= 1 );
-    workers--;
-
-    m_workers.reserve( workers );
-    for( size_t i=0; i<workers; i++ )
-    {
-        char tmp[16];
-        sprintf( tmp, "Worker %zu", i );
-#ifdef _MSC_VER
-        auto worker = std::thread( [this]{ Worker(); } );
-        System::SetThreadName( worker, tmp );
-#else // Using pthread.
-        auto worker = std::thread( [this, tmp]{
-#ifdef __APPLE__
-            pthread_setname_np( tmp );
-#else // Linux or MinGW.
-            pthread_setname_np( pthread_self(), tmp );
-#endif
-            Worker();
-        } );
-#endif
-        m_workers.emplace_back( std::move( worker ) );
-    }
-
-    DBGPRINT( "Task dispatcher with " << m_workers.size() + 1 << " workers" );
-}
-
-TaskDispatch::~TaskDispatch()
-{
-    m_exit = true;
-    m_queueLock.lock();
-    m_cvWork.notify_all();
-    m_queueLock.unlock();
-
-    for( auto& worker : m_workers )
-    {
-        worker.join();
-    }
-
-    assert( s_instance );
-    s_instance = nullptr;
-}
-
-void TaskDispatch::Queue( const std::function<void(void)>& f )
-{
-    std::unique_lock<std::mutex> lock( s_instance->m_queueLock );
-    s_instance->m_queue.emplace_back( f );
-    const auto size = s_instance->m_queue.size();
-    lock.unlock();
-    if( size > 1 )
-    {
-        s_instance->m_cvWork.notify_one();
-    }
-}
-
-void TaskDispatch::Queue( std::function<void(void)>&& f )
-{
-    std::unique_lock<std::mutex> lock( s_instance->m_queueLock );
-    s_instance->m_queue.emplace_back( std::move( f ) );
-    const auto size = s_instance->m_queue.size();
-    lock.unlock();
-    if( size > 1 )
-    {
-        s_instance->m_cvWork.notify_one();
-    }
-}
-
-void TaskDispatch::Sync()
-{
-    std::unique_lock<std::mutex> lock( s_instance->m_queueLock );
-    while( !s_instance->m_queue.empty() )
-    {
-        auto f = s_instance->m_queue.back();
-        s_instance->m_queue.pop_back();
-        lock.unlock();
-        f();
-        lock.lock();
-    }
-    s_instance->m_cvJobs.wait( lock, []{ return s_instance->m_jobs == 0; } );
-}
-
-void TaskDispatch::Worker()
-{
-    for(;;)
-    {
-        std::unique_lock<std::mutex> lock( m_queueLock );
-        m_cvWork.wait( lock, [this]{ return !m_queue.empty() || m_exit; } );
-        if( m_exit ) return;
-        auto f = m_queue.back();
-        m_queue.pop_back();
-        m_jobs++;
-        lock.unlock();
-        f();
-        lock.lock();
-        m_jobs--;
-        bool notify = m_jobs == 0 && m_queue.empty();
-        lock.unlock();
-        if( notify )
-        {
-            m_cvJobs.notify_all();
-        }
-    }
-}
diff --git a/thirdparty/etcpak/TaskDispatch.hpp b/thirdparty/etcpak/TaskDispatch.hpp
deleted file mode 100644
index b513de4c0c..0000000000
--- a/thirdparty/etcpak/TaskDispatch.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __DARKRL__TASKDISPATCH_HPP__
-#define __DARKRL__TASKDISPATCH_HPP__
-
-#include <atomic>
-#include <condition_variable>
-#include <functional>
-#include <mutex>
-#include <thread>
-#include <vector>
-
-class TaskDispatch
-{
-public:
-    TaskDispatch( size_t workers );
-    ~TaskDispatch();
-
-    static void Queue( const std::function<void(void)>& f );
-    static void Queue( std::function<void(void)>&& f );
-
-    static void Sync();
-
-private:
-    void Worker();
-
-    std::vector<std::function<void(void)>> m_queue;
-    std::mutex m_queueLock;
-    std::condition_variable m_cvWork, m_cvJobs;
-    std::atomic<bool> m_exit;
-    size_t m_jobs;
-
-    std::vector<std::thread> m_workers;
-};
-
-#endif
diff --git a/thirdparty/etcpak/Timing.cpp b/thirdparty/etcpak/Timing.cpp
deleted file mode 100644
index 2af851f9a9..0000000000
--- a/thirdparty/etcpak/Timing.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include <chrono>
-
-#include "Timing.hpp"
-
-uint64_t GetTime()
-{
-    return std::chrono::time_point_cast<std::chrono::microseconds>( std::chrono::high_resolution_clock::now() ).time_since_epoch().count();
-}
diff --git a/thirdparty/etcpak/Timing.hpp b/thirdparty/etcpak/Timing.hpp
deleted file mode 100644
index 3767e20f24..0000000000
--- a/thirdparty/etcpak/Timing.hpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __DARKRL__TIMING_HPP__
-#define __DARKRL__TIMING_HPP__
-
-#include <stdint.h>
-
-uint64_t GetTime();
-
-#endif
diff --git a/thirdparty/etcpak/lz4/lz4.c b/thirdparty/etcpak/lz4/lz4.c
deleted file mode 100644
index 08cf6b5cd7..0000000000
--- a/thirdparty/etcpak/lz4/lz4.c
+++ /dev/null
@@ -1,1516 +0,0 @@
-/*
-   LZ4 - Fast LZ compression algorithm
-   Copyright (C) 2011-2015, Yann Collet.
-
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   You can contact the author at :
-   - LZ4 source repository : https://github.com/Cyan4973/lz4
-   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
-*/
-
-
-/**************************************
-*  Tuning parameters
-**************************************/
-/*
- * HEAPMODE :
- * Select how default compression functions will allocate memory for their hash table,
- * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()).
- */
-#define HEAPMODE 0
-
-/*
- * ACCELERATION_DEFAULT :
- * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0
- */
-#define ACCELERATION_DEFAULT 1
-
-
-/**************************************
-*  CPU Feature Detection
-**************************************/
-/*
- * LZ4_FORCE_SW_BITCOUNT
- * Define this parameter if your target system or compiler does not support hardware bit count
- */
-#if defined(_MSC_VER) && defined(_WIN32_WCE)   /* Visual Studio for Windows CE does not support Hardware bit count */
-#  define LZ4_FORCE_SW_BITCOUNT
-#endif
-
-
-/**************************************
-*  Includes
-**************************************/
-#include "lz4.h"
-
-
-/**************************************
-*  Compiler Options
-**************************************/
-#ifdef _MSC_VER    /* Visual Studio */
-#  define FORCE_INLINE static __forceinline
-#  include <intrin.h>
-#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
-#  pragma warning(disable : 4293)        /* disable: C4293: too large shift (32-bits) */
-#else
-#  if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
-#    if defined(__GNUC__) || defined(__clang__)
-#      define FORCE_INLINE static inline __attribute__((always_inline))
-#    else
-#      define FORCE_INLINE static inline
-#    endif
-#  else
-#    define FORCE_INLINE static
-#  endif   /* __STDC_VERSION__ */
-#endif  /* _MSC_VER */
-
-/* LZ4_GCC_VERSION is defined into lz4.h */
-#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
-#  define expect(expr,value)    (__builtin_expect ((expr),(value)) )
-#else
-#  define expect(expr,value)    (expr)
-#endif
-
-#define likely(expr)     expect((expr) != 0, 1)
-#define unlikely(expr)   expect((expr) != 0, 0)
-
-
-/**************************************
-*  Memory routines
-**************************************/
-#include <stdlib.h>   /* malloc, calloc, free */
-#define ALLOCATOR(n,s) calloc(n,s)
-#define FREEMEM        free
-#include <string.h>   /* memset, memcpy */
-#define MEM_INIT       memset
-
-
-/**************************************
-*  Basic Types
-**************************************/
-#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */
-# include <stdint.h>
-  typedef  uint8_t BYTE;
-  typedef uint16_t U16;
-  typedef uint32_t U32;
-  typedef  int32_t S32;
-  typedef uint64_t U64;
-#else
-  typedef unsigned char       BYTE;
-  typedef unsigned short      U16;
-  typedef unsigned int        U32;
-  typedef   signed int        S32;
-  typedef unsigned long long  U64;
-#endif
-
-
-/**************************************
-*  Reading and writing into memory
-**************************************/
-#define STEPSIZE sizeof(size_t)
-
-static unsigned LZ4_64bits(void) { return sizeof(void*)==8; }
-
-static unsigned LZ4_isLittleEndian(void)
-{
-    const union { U32 i; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental  */
-    return one.c[0];
-}
-
-
-static U16 LZ4_read16(const void* memPtr)
-{
-    U16 val16;
-    memcpy(&val16, memPtr, 2);
-    return val16;
-}
-
-static U16 LZ4_readLE16(const void* memPtr)
-{
-    if (LZ4_isLittleEndian())
-    {
-        return LZ4_read16(memPtr);
-    }
-    else
-    {
-        const BYTE* p = (const BYTE*)memPtr;
-        return (U16)((U16)p[0] + (p[1]<<8));
-    }
-}
-
-static void LZ4_writeLE16(void* memPtr, U16 value)
-{
-    if (LZ4_isLittleEndian())
-    {
-        memcpy(memPtr, &value, 2);
-    }
-    else
-    {
-        BYTE* p = (BYTE*)memPtr;
-        p[0] = (BYTE) value;
-        p[1] = (BYTE)(value>>8);
-    }
-}
-
-static U32 LZ4_read32(const void* memPtr)
-{
-    U32 val32;
-    memcpy(&val32, memPtr, 4);
-    return val32;
-}
-
-static U64 LZ4_read64(const void* memPtr)
-{
-    U64 val64;
-    memcpy(&val64, memPtr, 8);
-    return val64;
-}
-
-static size_t LZ4_read_ARCH(const void* p)
-{
-    if (LZ4_64bits())
-        return (size_t)LZ4_read64(p);
-    else
-        return (size_t)LZ4_read32(p);
-}
-
-
-static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); }
-
-static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); }
-
-/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */
-static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd)
-{
-    BYTE* d = (BYTE*)dstPtr;
-    const BYTE* s = (const BYTE*)srcPtr;
-    BYTE* e = (BYTE*)dstEnd;
-    do { LZ4_copy8(d,s); d+=8; s+=8; } while (d<e);
-}
-
-
-/**************************************
-*  Common Constants
-**************************************/
-#define MINMATCH 4
-
-#define COPYLENGTH 8
-#define LASTLITERALS 5
-#define MFLIMIT (COPYLENGTH+MINMATCH)
-static const int LZ4_minLength = (MFLIMIT+1);
-
-#define KB *(1 <<10)
-#define MB *(1 <<20)
-#define GB *(1U<<30)
-
-#define MAXD_LOG 16
-#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
-
-#define ML_BITS  4
-#define ML_MASK  ((1U<<ML_BITS)-1)
-#define RUN_BITS (8-ML_BITS)
-#define RUN_MASK ((1U<<RUN_BITS)-1)
-
-
-/**************************************
-*  Common Utils
-**************************************/
-#define LZ4_STATIC_ASSERT(c)    { enum { LZ4_static_assert = 1/(int)(!!(c)) }; }   /* use only *after* variable declarations */
-
-
-/**************************************
-*  Common functions
-**************************************/
-static unsigned LZ4_NbCommonBytes (register size_t val)
-{
-    if (LZ4_isLittleEndian())
-    {
-        if (LZ4_64bits())
-        {
-#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanForward64( &r, (U64)val );
-            return (int)(r>>3);
-#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_ctzll((U64)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
-            return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
-#       endif
-        }
-        else /* 32 bits */
-        {
-#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r;
-            _BitScanForward( &r, (U32)val );
-            return (int)(r>>3);
-#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_ctz((U32)val) >> 3);
-#       else
-            static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
-            return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
-#       endif
-        }
-    }
-    else   /* Big Endian CPU */
-    {
-        if (LZ4_64bits())
-        {
-#       if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanReverse64( &r, val );
-            return (unsigned)(r>>3);
-#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_clzll((U64)val) >> 3);
-#       else
-            unsigned r;
-            if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
-            if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        }
-        else /* 32 bits */
-        {
-#       if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            unsigned long r = 0;
-            _BitScanReverse( &r, (unsigned long)val );
-            return (unsigned)(r>>3);
-#       elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT)
-            return (__builtin_clz((U32)val) >> 3);
-#       else
-            unsigned r;
-            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
-            r += (!val);
-            return r;
-#       endif
-        }
-    }
-}
-
-static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit)
-{
-    const BYTE* const pStart = pIn;
-
-    while (likely(pIn<pInLimit-(STEPSIZE-1)))
-    {
-        size_t diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn);
-        if (!diff) { pIn+=STEPSIZE; pMatch+=STEPSIZE; continue; }
-        pIn += LZ4_NbCommonBytes(diff);
-        return (unsigned)(pIn - pStart);
-    }
-
-    if (LZ4_64bits()) if ((pIn<(pInLimit-3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn+=4; pMatch+=4; }
-    if ((pIn<(pInLimit-1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn+=2; pMatch+=2; }
-    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
-    return (unsigned)(pIn - pStart);
-}
-
-
-#ifndef LZ4_COMMONDEFS_ONLY
-/**************************************
-*  Local Constants
-**************************************/
-#define LZ4_HASHLOG   (LZ4_MEMORY_USAGE-2)
-#define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE)
-#define HASH_SIZE_U32 (1 << LZ4_HASHLOG)       /* required as macro for static allocation */
-
-static const int LZ4_64Klimit = ((64 KB) + (MFLIMIT-1));
-static const U32 LZ4_skipTrigger = 6;  /* Increase this value ==> compression run slower on incompressible data */
-
-
-/**************************************
-*  Local Structures and types
-**************************************/
-typedef struct {
-    U32 hashTable[HASH_SIZE_U32];
-    U32 currentOffset;
-    U32 initCheck;
-    const BYTE* dictionary;
-    BYTE* bufferStart;   /* obsolete, used for slideInputBuffer */
-    U32 dictSize;
-} LZ4_stream_t_internal;
-
-typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive;
-typedef enum { byPtr, byU32, byU16 } tableType_t;
-
-typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
-typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;
-
-typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
-typedef enum { full = 0, partial = 1 } earlyEnd_directive;
-
-
-/**************************************
-*  Local Utils
-**************************************/
-int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
-int LZ4_compressBound(int isize)  { return LZ4_COMPRESSBOUND(isize); }
-int LZ4_sizeofState() { return LZ4_STREAMSIZE; }
-
-
-
-/********************************
-*  Compression functions
-********************************/
-
-static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType)
-{
-    if (tableType == byU16)
-        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1)));
-    else
-        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG));
-}
-
-static const U64 prime5bytes = 889523592379ULL;
-static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType)
-{
-    const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG;
-    const U32 hashMask = (1<<hashLog) - 1;
-    return ((sequence * prime5bytes) >> (40 - hashLog)) & hashMask;
-}
-
-static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType)
-{
-    if (LZ4_64bits())
-        return LZ4_hashSequence64(sequence, tableType);
-    return LZ4_hashSequence((U32)sequence, tableType);
-}
-
-static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); }
-
-static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase)
-{
-    switch (tableType)
-    {
-    case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; }
-    case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; }
-    case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; }
-    }
-}
-
-static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
-{
-    U32 h = LZ4_hashPosition(p, tableType);
-    LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
-}
-
-static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
-{
-    if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; }
-    if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; }
-    { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; }   /* default, to ensure a return */
-}
-
-static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
-{
-    U32 h = LZ4_hashPosition(p, tableType);
-    return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
-}
-
-FORCE_INLINE int LZ4_compress_generic(
-                 void* const ctx,
-                 const char* const source,
-                 char* const dest,
-                 const int inputSize,
-                 const int maxOutputSize,
-                 const limitedOutput_directive outputLimited,
-                 const tableType_t tableType,
-                 const dict_directive dict,
-                 const dictIssue_directive dictIssue,
-                 const U32 acceleration)
-{
-    LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx;
-
-    const BYTE* ip = (const BYTE*) source;
-    const BYTE* base;
-    const BYTE* lowLimit;
-    const BYTE* const lowRefLimit = ip - dictPtr->dictSize;
-    const BYTE* const dictionary = dictPtr->dictionary;
-    const BYTE* const dictEnd = dictionary + dictPtr->dictSize;
-    const size_t dictDelta = dictEnd - (const BYTE*)source;
-    const BYTE* anchor = (const BYTE*) source;
-    const BYTE* const iend = ip + inputSize;
-    const BYTE* const mflimit = iend - MFLIMIT;
-    const BYTE* const matchlimit = iend - LASTLITERALS;
-
-    BYTE* op = (BYTE*) dest;
-    BYTE* const olimit = op + maxOutputSize;
-
-    U32 forwardH;
-    size_t refDelta=0;
-
-    /* Init conditions */
-    if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0;   /* Unsupported input size, too large (or negative) */
-    switch(dict)
-    {
-    case noDict:
-    default:
-        base = (const BYTE*)source;
-        lowLimit = (const BYTE*)source;
-        break;
-    case withPrefix64k:
-        base = (const BYTE*)source - dictPtr->currentOffset;
-        lowLimit = (const BYTE*)source - dictPtr->dictSize;
-        break;
-    case usingExtDict:
-        base = (const BYTE*)source - dictPtr->currentOffset;
-        lowLimit = (const BYTE*)source;
-        break;
-    }
-    if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0;   /* Size too large (not within 64K limit) */
-    if (inputSize<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */
-
-    /* First Byte */
-    LZ4_putPosition(ip, ctx, tableType, base);
-    ip++; forwardH = LZ4_hashPosition(ip, tableType);
-
-    /* Main Loop */
-    for ( ; ; )
-    {
-        const BYTE* match;
-        BYTE* token;
-        {
-            const BYTE* forwardIp = ip;
-            unsigned step = 1;
-            unsigned searchMatchNb = acceleration << LZ4_skipTrigger;
-
-            /* Find a match */
-            do {
-                U32 h = forwardH;
-                ip = forwardIp;
-                forwardIp += step;
-                step = (searchMatchNb++ >> LZ4_skipTrigger);
-
-                if (unlikely(forwardIp > mflimit)) goto _last_literals;
-
-                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
-                if (dict==usingExtDict)
-                {
-                    if (match<(const BYTE*)source)
-                    {
-                        refDelta = dictDelta;
-                        lowLimit = dictionary;
-                    }
-                    else
-                    {
-                        refDelta = 0;
-                        lowLimit = (const BYTE*)source;
-                    }
-                }
-                forwardH = LZ4_hashPosition(forwardIp, tableType);
-                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
-
-            } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0)
-                || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
-                || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) );
-        }
-
-        /* Catch up */
-        while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; }
-
-        {
-            /* Encode Literal length */
-            unsigned litLength = (unsigned)(ip - anchor);
-            token = op++;
-            if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)))
-                return 0;   /* Check output limit */
-            if (litLength>=RUN_MASK)
-            {
-                int len = (int)litLength-RUN_MASK;
-                *token=(RUN_MASK<<ML_BITS);
-                for(; len >= 255 ; len-=255) *op++ = 255;
-                *op++ = (BYTE)len;
-            }
-            else *token = (BYTE)(litLength<<ML_BITS);
-
-            /* Copy Literals */
-            LZ4_wildCopy(op, anchor, op+litLength);
-            op+=litLength;
-        }
-
-_next_match:
-        /* Encode Offset */
-        LZ4_writeLE16(op, (U16)(ip-match)); op+=2;
-
-        /* Encode MatchLength */
-        {
-            unsigned matchLength;
-
-            if ((dict==usingExtDict) && (lowLimit==dictionary))
-            {
-                const BYTE* limit;
-                match += refDelta;
-                limit = ip + (dictEnd-match);
-                if (limit > matchlimit) limit = matchlimit;
-                matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit);
-                ip += MINMATCH + matchLength;
-                if (ip==limit)
-                {
-                    unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit);
-                    matchLength += more;
-                    ip += more;
-                }
-            }
-            else
-            {
-                matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
-                ip += MINMATCH + matchLength;
-            }
-
-            if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit)))
-                return 0;    /* Check output limit */
-            if (matchLength>=ML_MASK)
-            {
-                *token += ML_MASK;
-                matchLength -= ML_MASK;
-                for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; }
-                if (matchLength >= 255) { matchLength-=255; *op++ = 255; }
-                *op++ = (BYTE)matchLength;
-            }
-            else *token += (BYTE)(matchLength);
-        }
-
-        anchor = ip;
-
-        /* Test end of chunk */
-        if (ip > mflimit) break;
-
-        /* Fill table */
-        LZ4_putPosition(ip-2, ctx, tableType, base);
-
-        /* Test next position */
-        match = LZ4_getPosition(ip, ctx, tableType, base);
-        if (dict==usingExtDict)
-        {
-            if (match<(const BYTE*)source)
-            {
-                refDelta = dictDelta;
-                lowLimit = dictionary;
-            }
-            else
-            {
-                refDelta = 0;
-                lowLimit = (const BYTE*)source;
-            }
-        }
-        LZ4_putPosition(ip, ctx, tableType, base);
-        if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1)
-            && (match+MAX_DISTANCE>=ip)
-            && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) )
-        { token=op++; *token=0; goto _next_match; }
-
-        /* Prepare next loop */
-        forwardH = LZ4_hashPosition(++ip, tableType);
-    }
-
-_last_literals:
-    /* Encode Last Literals */
-    {
-        const size_t lastRun = (size_t)(iend - anchor);
-        if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize))
-            return 0;   /* Check output limit */
-        if (lastRun >= RUN_MASK)
-        {
-            size_t accumulator = lastRun - RUN_MASK;
-            *op++ = RUN_MASK << ML_BITS;
-            for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
-            *op++ = (BYTE) accumulator;
-        }
-        else
-        {
-            *op++ = (BYTE)(lastRun<<ML_BITS);
-        }
-        memcpy(op, anchor, lastRun);
-        op += lastRun;
-    }
-
-    /* End */
-    return (int) (((char*)op)-dest);
-}
-
-
-int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
-{
-    LZ4_resetStream((LZ4_stream_t*)state);
-    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
-
-    if (maxOutputSize >= LZ4_compressBound(inputSize))
-    {
-        if (inputSize < LZ4_64Klimit)
-            return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16,                        noDict, noDictIssue, acceleration);
-        else
-            return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
-    }
-    else
-    {
-        if (inputSize < LZ4_64Klimit)
-            return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16,                        noDict, noDictIssue, acceleration);
-        else
-            return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
-    }
-}
-
-
-int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
-{
-#if (HEAPMODE)
-    void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
-#else
-    LZ4_stream_t ctx;
-    void* ctxPtr = &ctx;
-#endif
-
-    int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration);
-
-#if (HEAPMODE)
-    FREEMEM(ctxPtr);
-#endif
-    return result;
-}
-
-
-int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize)
-{
-    return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1);
-}
-
-
-/* hidden debug function */
-/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */
-int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
-{
-    LZ4_stream_t ctx;
-
-    LZ4_resetStream(&ctx);
-
-    if (inputSize < LZ4_64Klimit)
-        return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16,                        noDict, noDictIssue, acceleration);
-    else
-        return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
-}
-
-
-/********************************
-*  destSize variant
-********************************/
-
-static int LZ4_compress_destSize_generic(
-                       void* const ctx,
-                 const char* const src,
-                       char* const dst,
-                       int*  const srcSizePtr,
-                 const int targetDstSize,
-                 const tableType_t tableType)
-{
-    const BYTE* ip = (const BYTE*) src;
-    const BYTE* base = (const BYTE*) src;
-    const BYTE* lowLimit = (const BYTE*) src;
-    const BYTE* anchor = ip;
-    const BYTE* const iend = ip + *srcSizePtr;
-    const BYTE* const mflimit = iend - MFLIMIT;
-    const BYTE* const matchlimit = iend - LASTLITERALS;
-
-    BYTE* op = (BYTE*) dst;
-    BYTE* const oend = op + targetDstSize;
-    BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */;
-    BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */);
-    BYTE* const oMaxSeq = oMaxLit - 1 /* token */;
-
-    U32 forwardH;
-
-
-    /* Init conditions */
-    if (targetDstSize < 1) return 0;                                     /* Impossible to store anything */
-    if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0;            /* Unsupported input size, too large (or negative) */
-    if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0;   /* Size too large (not within 64K limit) */
-    if (*srcSizePtr<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */
-
-    /* First Byte */
-    *srcSizePtr = 0;
-    LZ4_putPosition(ip, ctx, tableType, base);
-    ip++; forwardH = LZ4_hashPosition(ip, tableType);
-
-    /* Main Loop */
-    for ( ; ; )
-    {
-        const BYTE* match;
-        BYTE* token;
-        {
-            const BYTE* forwardIp = ip;
-            unsigned step = 1;
-            unsigned searchMatchNb = 1 << LZ4_skipTrigger;
-
-            /* Find a match */
-            do {
-                U32 h = forwardH;
-                ip = forwardIp;
-                forwardIp += step;
-                step = (searchMatchNb++ >> LZ4_skipTrigger);
-
-                if (unlikely(forwardIp > mflimit))
-                    goto _last_literals;
-
-                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
-                forwardH = LZ4_hashPosition(forwardIp, tableType);
-                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
-
-            } while ( ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
-                || (LZ4_read32(match) != LZ4_read32(ip)) );
-        }
-
-        /* Catch up */
-        while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; }
-
-        {
-            /* Encode Literal length */
-            unsigned litLength = (unsigned)(ip - anchor);
-            token = op++;
-            if (op + ((litLength+240)/255) + litLength > oMaxLit)
-            {
-                /* Not enough space for a last match */
-                op--;
-                goto _last_literals;
-            }
-            if (litLength>=RUN_MASK)
-            {
-                unsigned len = litLength - RUN_MASK;
-                *token=(RUN_MASK<<ML_BITS);
-                for(; len >= 255 ; len-=255) *op++ = 255;
-                *op++ = (BYTE)len;
-            }
-            else *token = (BYTE)(litLength<<ML_BITS);
-
-            /* Copy Literals */
-            LZ4_wildCopy(op, anchor, op+litLength);
-            op += litLength;
-        }
-
-_next_match:
-        /* Encode Offset */
-        LZ4_writeLE16(op, (U16)(ip-match)); op+=2;
-
-        /* Encode MatchLength */
-        {
-            size_t matchLength;
-
-            matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
-
-            if (op + ((matchLength+240)/255) > oMaxMatch)
-            {
-                /* Match description too long : reduce it */
-                matchLength = (15-1) + (oMaxMatch-op) * 255;
-            }
-            //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH);
-            ip += MINMATCH + matchLength;
-
-            if (matchLength>=ML_MASK)
-            {
-                *token += ML_MASK;
-                matchLength -= ML_MASK;
-                while (matchLength >= 255) { matchLength-=255; *op++ = 255; }
-                *op++ = (BYTE)matchLength;
-            }
-            else *token += (BYTE)(matchLength);
-        }
-
-        anchor = ip;
-
-        /* Test end of block */
-        if (ip > mflimit) break;
-        if (op > oMaxSeq) break;
-
-        /* Fill table */
-        LZ4_putPosition(ip-2, ctx, tableType, base);
-
-        /* Test next position */
-        match = LZ4_getPosition(ip, ctx, tableType, base);
-        LZ4_putPosition(ip, ctx, tableType, base);
-        if ( (match+MAX_DISTANCE>=ip)
-            && (LZ4_read32(match)==LZ4_read32(ip)) )
-        { token=op++; *token=0; goto _next_match; }
-
-        /* Prepare next loop */
-        forwardH = LZ4_hashPosition(++ip, tableType);
-    }
-
-_last_literals:
-    /* Encode Last Literals */
-    {
-        size_t lastRunSize = (size_t)(iend - anchor);
-        if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend)
-        {
-            /* adapt lastRunSize to fill 'dst' */
-            lastRunSize  = (oend-op) - 1;
-            lastRunSize -= (lastRunSize+240)/255;
-        }
-        ip = anchor + lastRunSize;
-
-        if (lastRunSize >= RUN_MASK)
-        {
-            size_t accumulator = lastRunSize - RUN_MASK;
-            *op++ = RUN_MASK << ML_BITS;
-            for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
-            *op++ = (BYTE) accumulator;
-        }
-        else
-        {
-            *op++ = (BYTE)(lastRunSize<<ML_BITS);
-        }
-        memcpy(op, anchor, lastRunSize);
-        op += lastRunSize;
-    }
-
-    /* End */
-    *srcSizePtr = (int) (((const char*)ip)-src);
-    return (int) (((char*)op)-dst);
-}
-
-
-static int LZ4_compress_destSize_extState (void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize)
-{
-    LZ4_resetStream((LZ4_stream_t*)state);
-
-    if (targetDstSize >= LZ4_compressBound(*srcSizePtr))   /* compression success is guaranteed */
-    {
-        return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1);
-    }
-    else
-    {
-        if (*srcSizePtr < LZ4_64Klimit)
-            return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16);
-        else
-            return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr);
-    }
-}
-
-
-int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize)
-{
-#if (HEAPMODE)
-    void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
-#else
-    LZ4_stream_t ctxBody;
-    void* ctx = &ctxBody;
-#endif
-
-    int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize);
-
-#if (HEAPMODE)
-    FREEMEM(ctx);
-#endif
-    return result;
-}
-
-
-
-/********************************
-*  Streaming functions
-********************************/
-
-LZ4_stream_t* LZ4_createStream(void)
-{
-    LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64);
-    LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal));    /* A compilation error here means LZ4_STREAMSIZE is not large enough */
-    LZ4_resetStream(lz4s);
-    return lz4s;
-}
-
-void LZ4_resetStream (LZ4_stream_t* LZ4_stream)
-{
-    MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t));
-}
-
-int LZ4_freeStream (LZ4_stream_t* LZ4_stream)
-{
-    FREEMEM(LZ4_stream);
-    return (0);
-}
-
-
-#define HASH_UNIT sizeof(size_t)
-int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize)
-{
-    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
-    const BYTE* p = (const BYTE*)dictionary;
-    const BYTE* const dictEnd = p + dictSize;
-    const BYTE* base;
-
-    if ((dict->initCheck) || (dict->currentOffset > 1 GB))  /* Uninitialized structure, or reuse overflow */
-        LZ4_resetStream(LZ4_dict);
-
-    if (dictSize < (int)HASH_UNIT)
-    {
-        dict->dictionary = NULL;
-        dict->dictSize = 0;
-        return 0;
-    }
-
-    if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB;
-    dict->currentOffset += 64 KB;
-    base = p - dict->currentOffset;
-    dict->dictionary = p;
-    dict->dictSize = (U32)(dictEnd - p);
-    dict->currentOffset += dict->dictSize;
-
-    while (p <= dictEnd-HASH_UNIT)
-    {
-        LZ4_putPosition(p, dict->hashTable, byU32, base);
-        p+=3;
-    }
-
-    return dict->dictSize;
-}
-
-
-static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src)
-{
-    if ((LZ4_dict->currentOffset > 0x80000000) ||
-        ((size_t)LZ4_dict->currentOffset > (size_t)src))   /* address space overflow */
-    {
-        /* rescale hash table */
-        U32 delta = LZ4_dict->currentOffset - 64 KB;
-        const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
-        int i;
-        for (i=0; i<HASH_SIZE_U32; i++)
-        {
-            if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0;
-            else LZ4_dict->hashTable[i] -= delta;
-        }
-        LZ4_dict->currentOffset = 64 KB;
-        if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB;
-        LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
-    }
-}
-
-
-int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
-{
-    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream;
-    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
-
-    const BYTE* smallest = (const BYTE*) source;
-    if (streamPtr->initCheck) return 0;   /* Uninitialized structure detected */
-    if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd;
-    LZ4_renormDictT(streamPtr, smallest);
-    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
-
-    /* Check overlapping input/dictionary space */
-    {
-        const BYTE* sourceEnd = (const BYTE*) source + inputSize;
-        if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd))
-        {
-            streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
-            if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
-            if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
-            streamPtr->dictionary = dictEnd - streamPtr->dictSize;
-        }
-    }
-
-    /* prefix mode : source data follows dictionary */
-    if (dictEnd == (const BYTE*)source)
-    {
-        int result;
-        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
-            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration);
-        else
-            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration);
-        streamPtr->dictSize += (U32)inputSize;
-        streamPtr->currentOffset += (U32)inputSize;
-        return result;
-    }
-
-    /* external dictionary mode */
-    {
-        int result;
-        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
-            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration);
-        else
-            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration);
-        streamPtr->dictionary = (const BYTE*)source;
-        streamPtr->dictSize = (U32)inputSize;
-        streamPtr->currentOffset += (U32)inputSize;
-        return result;
-    }
-}
-
-
-/* Hidden debug function, to force external dictionary mode */
-int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize)
-{
-    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict;
-    int result;
-    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
-
-    const BYTE* smallest = dictEnd;
-    if (smallest > (const BYTE*) source) smallest = (const BYTE*) source;
-    LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest);
-
-    result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1);
-
-    streamPtr->dictionary = (const BYTE*)source;
-    streamPtr->dictSize = (U32)inputSize;
-    streamPtr->currentOffset += (U32)inputSize;
-
-    return result;
-}
-
-
-int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
-{
-    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
-    const BYTE* previousDictEnd = dict->dictionary + dict->dictSize;
-
-    if ((U32)dictSize > 64 KB) dictSize = 64 KB;   /* useless to define a dictionary > 64 KB */
-    if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize;
-
-    memmove(safeBuffer, previousDictEnd - dictSize, dictSize);
-
-    dict->dictionary = (const BYTE*)safeBuffer;
-    dict->dictSize = (U32)dictSize;
-
-    return dictSize;
-}
-
-
-
-/*******************************
-*  Decompression functions
-*******************************/
-/*
- * This generic decompression function cover all use cases.
- * It shall be instantiated several times, using different sets of directives
- * Note that it is essential this generic function is really inlined,
- * in order to remove useless branches during compilation optimization.
- */
-FORCE_INLINE int LZ4_decompress_generic(
-                 const char* const source,
-                 char* const dest,
-                 int inputSize,
-                 int outputSize,         /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. */
-
-                 int endOnInput,         /* endOnOutputSize, endOnInputSize */
-                 int partialDecoding,    /* full, partial */
-                 int targetOutputSize,   /* only used if partialDecoding==partial */
-                 int dict,               /* noDict, withPrefix64k, usingExtDict */
-                 const BYTE* const lowPrefix,  /* == dest if dict == noDict */
-                 const BYTE* const dictStart,  /* only if dict==usingExtDict */
-                 const size_t dictSize         /* note : = 0 if noDict */
-                 )
-{
-    /* Local Variables */
-    const BYTE* ip = (const BYTE*) source;
-    const BYTE* const iend = ip + inputSize;
-
-    BYTE* op = (BYTE*) dest;
-    BYTE* const oend = op + outputSize;
-    BYTE* cpy;
-    BYTE* oexit = op + targetOutputSize;
-    const BYTE* const lowLimit = lowPrefix - dictSize;
-
-    const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
-    const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
-    const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
-
-    const int safeDecode = (endOnInput==endOnInputSize);
-    const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
-
-
-    /* Special cases */
-    if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT;                         /* targetOutputSize too high => decode everything */
-    if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1;  /* Empty output buffer */
-    if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1);
-
-
-    /* Main Loop */
-    while (1)
-    {
-        unsigned token;
-        size_t length;
-        const BYTE* match;
-
-        /* get literal length */
-        token = *ip++;
-        if ((length=(token>>ML_BITS)) == RUN_MASK)
-        {
-            unsigned s;
-            do
-            {
-                s = *ip++;
-                length += s;
-            }
-            while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255));
-            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error;   /* overflow detection */
-            if ((safeDecode) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error;   /* overflow detection */
-        }
-
-        /* copy literals */
-        cpy = op+length;
-        if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
-            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
-        {
-            if (partialDecoding)
-            {
-                if (cpy > oend) goto _output_error;                           /* Error : write attempt beyond end of output buffer */
-                if ((endOnInput) && (ip+length > iend)) goto _output_error;   /* Error : read attempt beyond end of input buffer */
-            }
-            else
-            {
-                if ((!endOnInput) && (cpy != oend)) goto _output_error;       /* Error : block decoding must stop exactly there */
-                if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   /* Error : input must be consumed */
-            }
-            memcpy(op, ip, length);
-            ip += length;
-            op += length;
-            break;     /* Necessarily EOF, due to parsing restrictions */
-        }
-        LZ4_wildCopy(op, ip, cpy);
-        ip += length; op = cpy;
-
-        /* get offset */
-        match = cpy - LZ4_readLE16(ip); ip+=2;
-        if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error;   /* Error : offset outside destination buffer */
-
-        /* get matchlength */
-        length = token & ML_MASK;
-        if (length == ML_MASK)
-        {
-            unsigned s;
-            do
-            {
-                if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error;
-                s = *ip++;
-                length += s;
-            } while (s==255);
-            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error;   /* overflow detection */
-        }
-        length += MINMATCH;
-
-        /* check external dictionary */
-        if ((dict==usingExtDict) && (match < lowPrefix))
-        {
-            if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error;   /* doesn't respect parsing restriction */
-
-            if (length <= (size_t)(lowPrefix-match))
-            {
-                /* match can be copied as a single segment from external dictionary */
-                match = dictEnd - (lowPrefix-match);
-                memmove(op, match, length); op += length;
-            }
-            else
-            {
-                /* match encompass external dictionary and current segment */
-                size_t copySize = (size_t)(lowPrefix-match);
-                memcpy(op, dictEnd - copySize, copySize);
-                op += copySize;
-                copySize = length - copySize;
-                if (copySize > (size_t)(op-lowPrefix))   /* overlap within current segment */
-                {
-                    BYTE* const endOfMatch = op + copySize;
-                    const BYTE* copyFrom = lowPrefix;
-                    while (op < endOfMatch) *op++ = *copyFrom++;
-                }
-                else
-                {
-                    memcpy(op, lowPrefix, copySize);
-                    op += copySize;
-                }
-            }
-            continue;
-        }
-
-        /* copy repeated sequence */
-        cpy = op + length;
-        if (unlikely((op-match)<8))
-        {
-            const size_t dec64 = dec64table[op-match];
-            op[0] = match[0];
-            op[1] = match[1];
-            op[2] = match[2];
-            op[3] = match[3];
-            match += dec32table[op-match];
-            LZ4_copy4(op+4, match);
-            op += 8; match -= dec64;
-        } else { LZ4_copy8(op, match); op+=8; match+=8; }
-
-        if (unlikely(cpy>oend-12))
-        {
-            if (cpy > oend-LASTLITERALS) goto _output_error;    /* Error : last LASTLITERALS bytes must be literals */
-            if (op < oend-8)
-            {
-                LZ4_wildCopy(op, match, oend-8);
-                match += (oend-8) - op;
-                op = oend-8;
-            }
-            while (op<cpy) *op++ = *match++;
-        }
-        else
-            LZ4_wildCopy(op, match, cpy);
-        op=cpy;   /* correction */
-    }
-
-    /* end of decoding */
-    if (endOnInput)
-       return (int) (((char*)op)-dest);     /* Nb of output bytes decoded */
-    else
-       return (int) (((const char*)ip)-source);   /* Nb of input bytes read */
-
-    /* Overflow error detected */
-_output_error:
-    return (int) (-(((const char*)ip)-source))-1;
-}
-
-
-int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
-{
-    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0);
-}
-
-int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
-{
-    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0);
-}
-
-int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
-{
-    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB);
-}
-
-
-/* streaming decompression functions */
-
-typedef struct
-{
-    const BYTE* externalDict;
-    size_t extDictSize;
-    const BYTE* prefixEnd;
-    size_t prefixSize;
-} LZ4_streamDecode_t_internal;
-
-/*
- * If you prefer dynamic allocation methods,
- * LZ4_createStreamDecode()
- * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
- */
-LZ4_streamDecode_t* LZ4_createStreamDecode(void)
-{
-    LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(1, sizeof(LZ4_streamDecode_t));
-    return lz4s;
-}
-
-int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
-{
-    FREEMEM(LZ4_stream);
-    return 0;
-}
-
-/*
- * LZ4_setStreamDecode
- * Use this function to instruct where to find the dictionary
- * This function is not necessary if previous data is still available where it was decoded.
- * Loading a size of 0 is allowed (same effect as no dictionary).
- * Return : 1 if OK, 0 if error
- */
-int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
-{
-    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
-    lz4sd->prefixSize = (size_t) dictSize;
-    lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize;
-    lz4sd->externalDict = NULL;
-    lz4sd->extDictSize  = 0;
-    return 1;
-}
-
-/*
-*_continue() :
-    These decoding functions allow decompression of multiple blocks in "streaming" mode.
-    Previously decoded blocks must still be available at the memory position where they were decoded.
-    If it's not possible, save the relevant part of decoded data into a safe buffer,
-    and indicate where it stands using LZ4_setStreamDecode()
-*/
-int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize)
-{
-    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
-    int result;
-
-    if (lz4sd->prefixEnd == (BYTE*)dest)
-    {
-        result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
-                                        endOnInputSize, full, 0,
-                                        usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
-        if (result <= 0) return result;
-        lz4sd->prefixSize += result;
-        lz4sd->prefixEnd  += result;
-    }
-    else
-    {
-        lz4sd->extDictSize = lz4sd->prefixSize;
-        lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
-        result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
-                                        endOnInputSize, full, 0,
-                                        usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
-        if (result <= 0) return result;
-        lz4sd->prefixSize = result;
-        lz4sd->prefixEnd  = (BYTE*)dest + result;
-    }
-
-    return result;
-}
-
-int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
-{
-    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
-    int result;
-
-    if (lz4sd->prefixEnd == (BYTE*)dest)
-    {
-        result = LZ4_decompress_generic(source, dest, 0, originalSize,
-                                        endOnOutputSize, full, 0,
-                                        usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
-        if (result <= 0) return result;
-        lz4sd->prefixSize += originalSize;
-        lz4sd->prefixEnd  += originalSize;
-    }
-    else
-    {
-        lz4sd->extDictSize = lz4sd->prefixSize;
-        lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize;
-        result = LZ4_decompress_generic(source, dest, 0, originalSize,
-                                        endOnOutputSize, full, 0,
-                                        usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
-        if (result <= 0) return result;
-        lz4sd->prefixSize = originalSize;
-        lz4sd->prefixEnd  = (BYTE*)dest + originalSize;
-    }
-
-    return result;
-}
-
-
-/*
-Advanced decoding functions :
-*_usingDict() :
-    These decoding functions work the same as "_continue" ones,
-    the dictionary must be explicitly provided within parameters
-*/
-
-FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize)
-{
-    if (dictSize==0)
-        return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0);
-    if (dictStart+dictSize == dest)
-    {
-        if (dictSize >= (int)(64 KB - 1))
-            return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0);
-        return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0);
-    }
-    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
-}
-
-int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
-{
-    return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize);
-}
-
-int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
-{
-    return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize);
-}
-
-/* debug function */
-int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
-{
-    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
-}
-
-
-/***************************************************
-*  Obsolete Functions
-***************************************************/
-/* obsolete compression functions */
-int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); }
-int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); }
-int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); }
-int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); }
-int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); }
-int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); }
-
-/*
-These function names are deprecated and should no longer be used.
-They are only provided here for compatibility with older user programs.
-- LZ4_uncompress is totally equivalent to LZ4_decompress_fast
-- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe
-*/
-int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
-int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
-
-
-/* Obsolete Streaming functions */
-
-int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; }
-
-static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base)
-{
-    MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE);
-    lz4ds->bufferStart = base;
-}
-
-int LZ4_resetStreamState(void* state, char* inputBuffer)
-{
-    if ((((size_t)state) & 3) != 0) return 1;   /* Error : pointer is not aligned on 4-bytes boundary */
-    LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer);
-    return 0;
-}
-
-void* LZ4_create (char* inputBuffer)
-{
-    void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64);
-    LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer);
-    return lz4ds;
-}
-
-char* LZ4_slideInputBuffer (void* LZ4_Data)
-{
-    LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data;
-    int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB);
-    return (char*)(ctx->bufferStart + dictSize);
-}
-
-/* Obsolete streaming decompression functions */
-
-int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize)
-{
-    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
-}
-
-int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize)
-{
-    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
-}
-
-#endif   /* LZ4_COMMONDEFS_ONLY */
-
diff --git a/thirdparty/etcpak/lz4/lz4.h b/thirdparty/etcpak/lz4/lz4.h
deleted file mode 100644
index 3e74002256..0000000000
--- a/thirdparty/etcpak/lz4/lz4.h
+++ /dev/null
@@ -1,360 +0,0 @@
-/*
-   LZ4 - Fast LZ compression algorithm
-   Header File
-   Copyright (C) 2011-2015, Yann Collet.
-
-   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-   Redistribution and use in source and binary forms, with or without
-   modification, are permitted provided that the following conditions are
-   met:
-
-       * Redistributions of source code must retain the above copyright
-   notice, this list of conditions and the following disclaimer.
-       * Redistributions in binary form must reproduce the above
-   copyright notice, this list of conditions and the following disclaimer
-   in the documentation and/or other materials provided with the
-   distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-   You can contact the author at :
-   - LZ4 source repository : https://github.com/Cyan4973/lz4
-   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
-*/
-#pragma once
-
-#if defined (__cplusplus)
-extern "C" {
-#endif
-
-/*
- * lz4.h provides block compression functions, and gives full buffer control to programmer.
- * If you need to generate inter-operable compressed data (respecting LZ4 frame specification),
- * and can let the library handle its own memory, please use lz4frame.h instead.
-*/
-
-/**************************************
-*  Version
-**************************************/
-#define LZ4_VERSION_MAJOR    1    /* for breaking interface changes  */
-#define LZ4_VERSION_MINOR    7    /* for new (non-breaking) interface capabilities */
-#define LZ4_VERSION_RELEASE  1    /* for tweaks, bug-fixes, or development */
-#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
-int LZ4_versionNumber (void);
-
-/**************************************
-*  Tuning parameter
-**************************************/
-/*
- * LZ4_MEMORY_USAGE :
- * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
- * Increasing memory usage improves compression ratio
- * Reduced memory usage can improve speed, due to cache effect
- * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
- */
-#define LZ4_MEMORY_USAGE 14
-
-
-/**************************************
-*  Simple Functions
-**************************************/
-
-int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize);
-int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize);
-
-/*
-LZ4_compress_default() :
-    Compresses 'sourceSize' bytes from buffer 'source'
-    into already allocated 'dest' buffer of size 'maxDestSize'.
-    Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize).
-    It also runs faster, so it's a recommended setting.
-    If the function cannot compress 'source' into a more limited 'dest' budget,
-    compression stops *immediately*, and the function result is zero.
-    As a consequence, 'dest' content is not valid.
-    This function never writes outside 'dest' buffer, nor read outside 'source' buffer.
-        sourceSize  : Max supported value is LZ4_MAX_INPUT_VALUE
-        maxDestSize : full or partial size of buffer 'dest' (which must be already allocated)
-        return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize)
-              or 0 if compression fails
-
-LZ4_decompress_safe() :
-    compressedSize : is the precise full size of the compressed block.
-    maxDecompressedSize : is the size of destination buffer, which must be already allocated.
-    return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize)
-             If destination buffer is not large enough, decoding will stop and output an error code (<0).
-             If the source stream is detected malformed, the function will stop decoding and return a negative result.
-             This function is protected against buffer overflow exploits, including malicious data packets.
-             It never writes outside output buffer, nor reads outside input buffer.
-*/
-
-
-/**************************************
-*  Advanced Functions
-**************************************/
-#define LZ4_MAX_INPUT_SIZE        0x7E000000   /* 2 113 929 216 bytes */
-#define LZ4_COMPRESSBOUND(isize)  ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
-
-/*
-LZ4_compressBound() :
-    Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
-    This function is primarily useful for memory allocation purposes (destination buffer size).
-    Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
-    Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize)
-        inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
-        return : maximum output size in a "worst case" scenario
-              or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
-*/
-int LZ4_compressBound(int inputSize);
-
-/*
-LZ4_compress_fast() :
-    Same as LZ4_compress_default(), but allows to select an "acceleration" factor.
-    The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
-    It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed.
-    An acceleration value of "1" is the same as regular LZ4_compress_default()
-    Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1.
-*/
-int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration);
-
-
-/*
-LZ4_compress_fast_extState() :
-    Same compression function, just using an externally allocated memory space to store compression state.
-    Use LZ4_sizeofState() to know how much memory must be allocated,
-    and allocate it on 8-bytes boundaries (using malloc() typically).
-    Then, provide it as 'void* state' to compression function.
-*/
-int LZ4_sizeofState(void);
-int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration);
-
-
-/*
-LZ4_compress_destSize() :
-    Reverse the logic, by compressing as much data as possible from 'source' buffer
-    into already allocated buffer 'dest' of size 'targetDestSize'.
-    This function either compresses the entire 'source' content into 'dest' if it's large enough,
-    or fill 'dest' buffer completely with as much data as possible from 'source'.
-        *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'.
-                         New value is necessarily <= old value.
-        return : Nb bytes written into 'dest' (necessarily <= targetDestSize)
-              or 0 if compression fails
-*/
-int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize);
-
-
-/*
-LZ4_decompress_fast() :
-    originalSize : is the original and therefore uncompressed size
-    return : the number of bytes read from the source buffer (in other words, the compressed size)
-             If the source stream is detected malformed, the function will stop decoding and return a negative result.
-             Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes.
-    note : This function fully respect memory boundaries for properly formed compressed data.
-           It is a bit faster than LZ4_decompress_safe().
-           However, it does not provide any protection against intentionally modified data stream (malicious input).
-           Use this function in trusted environment only (data to decode comes from a trusted source).
-*/
-int LZ4_decompress_fast (const char* source, char* dest, int originalSize);
-
-/*
-LZ4_decompress_safe_partial() :
-    This function decompress a compressed block of size 'compressedSize' at position 'source'
-    into destination buffer 'dest' of size 'maxDecompressedSize'.
-    The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
-    reducing decompression time.
-    return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize)
-       Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
-             Always control how many bytes were decoded.
-             If the source stream is detected malformed, the function will stop decoding and return a negative result.
-             This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
-*/
-int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize);
-
-
-/***********************************************
-*  Streaming Compression Functions
-***********************************************/
-#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
-#define LZ4_STREAMSIZE     (LZ4_STREAMSIZE_U64 * sizeof(long long))
-/*
- * LZ4_stream_t
- * information structure to track an LZ4 stream.
- * important : init this structure content before first use !
- * note : only allocated directly the structure if you are statically linking LZ4
- *        If you are using liblz4 as a DLL, please use below construction methods instead.
- */
-typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t;
-
-/*
- * LZ4_resetStream
- * Use this function to init an allocated LZ4_stream_t structure
- */
-void LZ4_resetStream (LZ4_stream_t* streamPtr);
-
-/*
- * LZ4_createStream will allocate and initialize an LZ4_stream_t structure
- * LZ4_freeStream releases its memory.
- * In the context of a DLL (liblz4), please use these methods rather than the static struct.
- * They are more future proof, in case of a change of LZ4_stream_t size.
- */
-LZ4_stream_t* LZ4_createStream(void);
-int           LZ4_freeStream (LZ4_stream_t* streamPtr);
-
-/*
- * LZ4_loadDict
- * Use this function to load a static dictionary into LZ4_stream.
- * Any previous data will be forgotten, only 'dictionary' will remain in memory.
- * Loading a size of 0 is allowed.
- * Return : dictionary size, in bytes (necessarily <= 64 KB)
- */
-int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
-
-/*
- * LZ4_compress_fast_continue
- * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio.
- * Important : Previous data blocks are assumed to still be present and unmodified !
- * 'dst' buffer must be already allocated.
- * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
- * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero.
- */
-int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration);
-
-/*
- * LZ4_saveDict
- * If previously compressed data block is not guaranteed to remain available at its memory location
- * save it into a safer place (char* safeBuffer)
- * Note : you don't need to call LZ4_loadDict() afterwards,
- *        dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue()
- * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error
- */
-int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize);
-
-
-/************************************************
-*  Streaming Decompression Functions
-************************************************/
-
-#define LZ4_STREAMDECODESIZE_U64  4
-#define LZ4_STREAMDECODESIZE     (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
-typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t;
-/*
- * LZ4_streamDecode_t
- * information structure to track an LZ4 stream.
- * init this structure content using LZ4_setStreamDecode or memset() before first use !
- *
- * In the context of a DLL (liblz4) please prefer usage of construction methods below.
- * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future.
- * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure
- * LZ4_freeStreamDecode releases its memory.
- */
-LZ4_streamDecode_t* LZ4_createStreamDecode(void);
-int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
-
-/*
- * LZ4_setStreamDecode
- * Use this function to instruct where to find the dictionary.
- * Setting a size of 0 is allowed (same effect as reset).
- * Return : 1 if OK, 0 if error
- */
-int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
-
-/*
-*_continue() :
-    These decoding functions allow decompression of multiple blocks in "streaming" mode.
-    Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB)
-    In the case of a ring buffers, decoding buffer must be either :
-    - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions)
-      In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
-    - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
-      maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block.
-      In which case, encoding and decoding buffers do not need to be synchronized,
-      and encoding ring buffer can have any size, including small ones ( < 64 KB).
-    - _At least_ 64 KB + 8 bytes + maxBlockSize.
-      In which case, encoding and decoding buffers do not need to be synchronized,
-      and encoding ring buffer can have any size, including larger than decoding buffer.
-    Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
-    and indicate where it is saved using LZ4_setStreamDecode()
-*/
-int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize);
-int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);
-
-
-/*
-Advanced decoding functions :
-*_usingDict() :
-    These decoding functions work the same as
-    a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue()
-    They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure.
-*/
-int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize);
-int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
-
-
-
-/**************************************
-*  Obsolete Functions
-**************************************/
-/* Deprecate Warnings */
-/* Should these warnings messages be a problem,
-   it is generally possible to disable them,
-   with -Wno-deprecated-declarations for gcc
-   or _CRT_SECURE_NO_WARNINGS in Visual for example.
-   You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */
-#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK
-#  define LZ4_DEPRECATE_WARNING_DEFBLOCK
-#  define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-#  if (LZ4_GCC_VERSION >= 405) || defined(__clang__)
-#    define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
-#  elif (LZ4_GCC_VERSION >= 301)
-#    define LZ4_DEPRECATED(message) __attribute__((deprecated))
-#  elif defined(_MSC_VER)
-#    define LZ4_DEPRECATED(message) __declspec(deprecated(message))
-#  else
-#    pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler")
-#    define LZ4_DEPRECATED(message)
-#  endif
-#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */
-
-/* Obsolete compression functions */
-/* These functions are planned to start generate warnings by r131 approximately */
-int LZ4_compress               (const char* source, char* dest, int sourceSize);
-int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize);
-int LZ4_compress_withState               (void* state, const char* source, char* dest, int inputSize);
-int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
-int LZ4_compress_continue                (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
-int LZ4_compress_limitedOutput_continue  (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
-
-/* Obsolete decompression functions */
-/* These function names are completely deprecated and must no longer be used.
-   They are only provided here for compatibility with older programs.
-    - LZ4_uncompress is the same as LZ4_decompress_fast
-    - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
-   These function prototypes are now disabled; uncomment them only if you really need them.
-   It is highly recommended to stop using these prototypes and migrate to maintained ones */
-/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */
-/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */
-
-/* Obsolete streaming functions; use new streaming interface whenever possible */
-LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer);
-LZ4_DEPRECATED("use LZ4_createStream() instead") int   LZ4_sizeofStreamState(void);
-LZ4_DEPRECATED("use LZ4_resetStream() instead")  int   LZ4_resetStreamState(void* state, char* inputBuffer);
-LZ4_DEPRECATED("use LZ4_saveDict() instead")     char* LZ4_slideInputBuffer (void* state);
-
-/* Obsolete streaming decoding functions */
-LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize);
-LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize);
-
-
-#if defined (__cplusplus)
-}
-#endif
diff --git a/thirdparty/etcpak/mmap.cpp b/thirdparty/etcpak/mmap.cpp
deleted file mode 100644
index c2460ee9e4..0000000000
--- a/thirdparty/etcpak/mmap.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-#include "mmap.hpp"
-
-#ifdef _WIN32
-#  include <io.h>
-#  include <windows.h>
-
-void* mmap( void* addr, size_t length, int prot, int flags, int fd, off_t offset )
-{
-    HANDLE hnd;
-    void* map = nullptr;
-
-    switch( prot )
-    {
-    case PROT_READ:
-        if( hnd = CreateFileMapping( HANDLE( _get_osfhandle( fd ) ), nullptr, PAGE_READONLY, 0, DWORD( length ), nullptr ) )
-        {
-            map = MapViewOfFile( hnd, FILE_MAP_READ, 0, 0, length );
-            CloseHandle( hnd );
-        }
-        break;
-    case PROT_WRITE:
-        if( hnd = CreateFileMapping( HANDLE( _get_osfhandle( fd ) ), nullptr, PAGE_READWRITE, 0, DWORD( length ), nullptr ) )
-        {
-            map = MapViewOfFile( hnd, FILE_MAP_WRITE, 0, 0, length );
-            CloseHandle( hnd );
-        }
-        break;
-    }
-
-    return map ? (char*)map + offset : (void*)-1;
-}
-
-int munmap( void* addr, size_t length )
-{
-    return UnmapViewOfFile( addr ) != 0 ? 0 : -1;
-}
-
-#endif
diff --git a/thirdparty/etcpak/mmap.hpp b/thirdparty/etcpak/mmap.hpp
deleted file mode 100644
index e4cfe7759c..0000000000
--- a/thirdparty/etcpak/mmap.hpp
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __MMAP_HPP__
-#define __MMAP_HPP__
-
-#ifndef _WIN32
-#  include <sys/mman.h>
-#else
-#  include <string.h>
-#  include <sys/types.h>
-
-#  define PROT_READ 1
-#  define PROT_WRITE 2
-#  define MAP_SHARED 0
-
-void* mmap( void* addr, size_t length, int prot, int flags, int fd, off_t offset );
-int munmap( void* addr, size_t length );
-
-#endif
-
-#endif
diff --git a/thirdparty/etcpak/patches/libpng-unbundle.patch b/thirdparty/etcpak/patches/libpng-unbundle.patch
deleted file mode 100644
index e3c07412c6..0000000000
--- a/thirdparty/etcpak/patches/libpng-unbundle.patch
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/thirdparty/etcpak/Bitmap.cpp b/thirdparty/etcpak/Bitmap.cpp
-index 6aa36f5caa..ef318318ac 100644
---- a/thirdparty/etcpak/Bitmap.cpp
-+++ b/thirdparty/etcpak/Bitmap.cpp
-@@ -3,7 +3,7 @@
- #include <string.h>
- #include <assert.h>
- 
--#include "libpng/png.h"
-+#include <png.h>
- #include "lz4/lz4.h"
- 
- #include "Bitmap.hpp"
diff --git a/thirdparty/etcpak/patches/llvm-c++11-narrowing-errors.patch b/thirdparty/etcpak/patches/llvm-c++11-narrowing-errors.patch
deleted file mode 100644
index ab0d1e63a2..0000000000
--- a/thirdparty/etcpak/patches/llvm-c++11-narrowing-errors.patch
+++ /dev/null
@@ -1,64 +0,0 @@
-diff --git a/thirdparty/etcpak/BlockData.cpp b/thirdparty/etcpak/BlockData.cpp
-index bd738085f3..395b55246b 100644
---- a/thirdparty/etcpak/BlockData.cpp
-+++ b/thirdparty/etcpak/BlockData.cpp
-@@ -334,10 +334,10 @@ static etcpak_force_inline void DecodeT( uint64_t block, uint32_t* dst, uint32_t
-     const auto c3b = clampu8( cb1 - table59T58H[codeword] );
- 
-     const uint32_t col_tab[4] = {
--        cr0 | ( cg0 << 8 ) | ( cb0 << 16 ) | 0xFF000000,
--        c2r | ( c2g << 8 ) | ( c2b << 16 ) | 0xFF000000,
--        cr1 | ( cg1 << 8 ) | ( cb1 << 16 ) | 0xFF000000,
--        c3r | ( c3g << 8 ) | ( c3b << 16 ) | 0xFF000000
-+        uint32_t(cr0 | ( cg0 << 8 ) | ( cb0 << 16 ) | 0xFF000000),
-+        uint32_t(c2r | ( c2g << 8 ) | ( c2b << 16 ) | 0xFF000000),
-+        uint32_t(cr1 | ( cg1 << 8 ) | ( cb1 << 16 ) | 0xFF000000),
-+        uint32_t(c3r | ( c3g << 8 ) | ( c3b << 16 ) | 0xFF000000)
-     };
- 
-     const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
-@@ -389,10 +389,10 @@ static etcpak_force_inline void DecodeTAlpha( uint64_t block, uint64_t alpha, ui
-     const auto c3b = clampu8( cb1 - table59T58H[codeword] );
- 
-     const uint32_t col_tab[4] = {
--        cr0 | ( cg0 << 8 ) | ( cb0 << 16 ),
--        c2r | ( c2g << 8 ) | ( c2b << 16 ),
--        cr1 | ( cg1 << 8 ) | ( cb1 << 16 ),
--        c3r | ( c3g << 8 ) | ( c3b << 16 )
-+        uint32_t(cr0 | ( cg0 << 8 ) | ( cb0 << 16 )),
-+        uint32_t(c2r | ( c2g << 8 ) | ( c2b << 16 )),
-+        uint32_t(cr1 | ( cg1 << 8 ) | ( cb1 << 16 )),
-+        uint32_t(c3r | ( c3g << 8 ) | ( c3b << 16 ))
-     };
- 
-     const uint32_t indexes = ( block >> 32 ) & 0xFFFFFFFF;
-@@ -436,10 +436,10 @@ static etcpak_force_inline void DecodeH( uint64_t block, uint32_t* dst, uint32_t
-     const auto codeword = codeword_hi | codeword_lo;
- 
-     const uint32_t col_tab[] = {
--        clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 ),
--        clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 ),
--        clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 ),
--        clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 )
-+        uint32_t(clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 )),
-+        uint32_t(clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 )),
-+        uint32_t(clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 )),
-+        uint32_t(clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 ))
-     };
- 
-     for( uint8_t j = 0; j < 4; j++ )
-@@ -483,10 +483,10 @@ static etcpak_force_inline void DecodeHAlpha( uint64_t block, uint64_t alpha, ui
-     const auto tbl = g_alpha[(alpha >> 48) & 0xF];
- 
-     const uint32_t col_tab[] = {
--        clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 ),
--        clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 ),
--        clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 ),
--        clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 )
-+        uint32_t(clampu8( r0 + table59T58H[codeword] ) | ( clampu8( g0 + table59T58H[codeword] ) << 8 ) | ( clampu8( b0 + table59T58H[codeword] ) << 16 )),
-+        uint32_t(clampu8( r0 - table59T58H[codeword] ) | ( clampu8( g0 - table59T58H[codeword] ) << 8 ) | ( clampu8( b0 - table59T58H[codeword] ) << 16 )),
-+        uint32_t(clampu8( r1 + table59T58H[codeword] ) | ( clampu8( g1 + table59T58H[codeword] ) << 8 ) | ( clampu8( b1 + table59T58H[codeword] ) << 16 )),
-+        uint32_t(clampu8( r1 - table59T58H[codeword] ) | ( clampu8( g1 - table59T58H[codeword] ) << 8 ) | ( clampu8( b1 - table59T58H[codeword] ) << 16 ))
-     };
- 
-     for( uint8_t j = 0; j < 4; j++ )
diff --git a/thirdparty/etcpak/patches/pthread-setname.patch b/thirdparty/etcpak/patches/pthread-setname.patch
deleted file mode 100644
index e2b009a1b3..0000000000
--- a/thirdparty/etcpak/patches/pthread-setname.patch
+++ /dev/null
@@ -1,66 +0,0 @@
-diff --git a/thirdparty/etcpak/System.cpp b/thirdparty/etcpak/System.cpp
-index 1383d0ecd0..041f2676e8 100644
---- a/thirdparty/etcpak/System.cpp
-+++ b/thirdparty/etcpak/System.cpp
-@@ -2,7 +2,6 @@
- #ifdef _WIN32
- #  include <windows.h>
- #else
--#  include <pthread.h>
- #  include <unistd.h>
- #endif
- 
-@@ -35,7 +34,7 @@ unsigned int System::CPUCores()
- 
- void System::SetThreadName( std::thread& thread, const char* name )
- {
--#ifdef _WIN32
-+#ifdef _MSC_VER
-     const DWORD MS_VC_EXCEPTION=0x406D1388;
- 
- #  pragma pack( push, 8 )
-@@ -62,7 +61,5 @@ void System::SetThreadName( std::thread& thread, const char* name )
-     __except(EXCEPTION_EXECUTE_HANDLER)
-     {
-     }
--#elif !defined(__APPLE__)
--    pthread_setname_np( thread.native_handle(), name );
- #endif
- }
-diff --git a/thirdparty/etcpak/TaskDispatch.cpp b/thirdparty/etcpak/TaskDispatch.cpp
-index 7287da4de2..b1ba17953b 100644
---- a/thirdparty/etcpak/TaskDispatch.cpp
-+++ b/thirdparty/etcpak/TaskDispatch.cpp
-@@ -1,5 +1,8 @@
- #include <assert.h>
- #include <stdio.h>
-+#ifndef _MSC_VER
-+#include <pthread.h>
-+#endif
- 
- #include "Debug.hpp"
- #include "System.hpp"
-@@ -22,15 +25,19 @@ TaskDispatch::TaskDispatch( size_t workers )
-     {
-         char tmp[16];
-         sprintf( tmp, "Worker %zu", i );
--#ifdef __APPLE__
-+#ifdef _MSC_VER
-+        auto worker = std::thread( [this]{ Worker(); } );
-+        System::SetThreadName( worker, tmp );
-+#else // Using pthread.
-         auto worker = std::thread( [this, tmp]{
-+#ifdef __APPLE__
-             pthread_setname_np( tmp );
-+#else // Linux or MinGW.
-+            pthread_setname_np( pthread_self(), tmp );
-+#endif
-             Worker();
-         } );
--#else
--        auto worker = std::thread( [this]{ Worker(); } );
- #endif
--        System::SetThreadName( worker, tmp );
-         m_workers.emplace_back( std::move( worker ) );
-     }
- 
diff --git a/thirdparty/etcpak/patches/windows-mingw-bswap.patch b/thirdparty/etcpak/patches/windows-mingw-bswap.patch
deleted file mode 100644
index c09192f573..0000000000
--- a/thirdparty/etcpak/patches/windows-mingw-bswap.patch
+++ /dev/null
@@ -1,50 +0,0 @@
-diff --git a/thirdparty/etcpak/BlockData.cpp b/thirdparty/etcpak/BlockData.cpp
-index a2cd032c5b..bd738085f3 100644
---- a/thirdparty/etcpak/BlockData.cpp
-+++ b/thirdparty/etcpak/BlockData.cpp
-@@ -15,7 +15,7 @@
- #  include <arm_neon.h>
- #endif
- 
--#ifdef __SSE4_1__
-+#if defined __SSE4_1__ || defined __AVX2__ || defined _MSC_VER
- #  ifdef _MSC_VER
- #    include <intrin.h>
- #    include <Windows.h>
-@@ -24,12 +24,6 @@
- #  else
- #    include <x86intrin.h>
- #  endif
--#else
--#  ifndef _MSC_VER
--#    include <byteswap.h>
--#    define _bswap(x) bswap_32(x)
--#    define _bswap64(x) bswap_64(x)
--#  endif
- #endif
- 
- #ifndef _bswap
-diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp
-index 220d5c55e2..9dc5a78b67 100644
---- a/thirdparty/etcpak/ProcessRGB.cpp
-+++ b/thirdparty/etcpak/ProcessRGB.cpp
-@@ -1,5 +1,6 @@
- #include <array>
- #include <string.h>
-+#include <limits>
- 
- #ifdef __ARM_NEON
- #  include <arm_neon.h>
-@@ -21,12 +22,6 @@
- #  else
- #    include <x86intrin.h>
- #  endif
--#else
--#  ifndef _MSC_VER
--#    include <byteswap.h>
--#    define _bswap(x) bswap_32(x)
--#    define _bswap64(x) bswap_64(x)
--#  endif
- #endif
- 
- #ifndef _bswap
diff --git a/thirdparty/meshoptimizer/LICENSE.md b/thirdparty/meshoptimizer/LICENSE.md
index 4fcd766d22..3c52415f62 100644
--- a/thirdparty/meshoptimizer/LICENSE.md
+++ b/thirdparty/meshoptimizer/LICENSE.md
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2016-2020 Arseny Kapoulkine
+Copyright (c) 2016-2021 Arseny Kapoulkine
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/thirdparty/meshoptimizer/clusterizer.cpp b/thirdparty/meshoptimizer/clusterizer.cpp
index f7d88c5136..f8aad7b49c 100644
--- a/thirdparty/meshoptimizer/clusterizer.cpp
+++ b/thirdparty/meshoptimizer/clusterizer.cpp
@@ -2,6 +2,7 @@
 #include "meshoptimizer.h"
 
 #include <assert.h>
+#include <float.h>
 #include <math.h>
 #include <string.h>
 
@@ -12,6 +13,68 @@
 namespace meshopt
 {
 
+// This must be <= 255 since index 0xff is used internally to indicate a vertex that doesn't belong to a meshlet
+const size_t kMeshletMaxVertices = 255;
+
+// A reasonable limit is around 2*max_vertices or less
+const size_t kMeshletMaxTriangles = 512;
+
+struct TriangleAdjacency2 // vertex -> triangle adjacency tables; allocated and filled by buildTriangleAdjacency
+{
+	unsigned int* counts; // per vertex: how many times the vertex appears in the index buffer
+	unsigned int* offsets; // per vertex: position in data where that vertex's triangle list starts
+	unsigned int* data; // triangle (face) indices, grouped by vertex; one entry per index-buffer element
+};
+
+static void buildTriangleAdjacency(TriangleAdjacency2& adjacency, const unsigned int* indices, size_t index_count, size_t vertex_count, meshopt_Allocator& allocator) // fills adjacency with, for every vertex, the list of triangles that reference it
+{
+	size_t face_count = index_count / 3;
+
+	// allocate arrays: counts and offsets hold one entry per vertex, data holds one entry per index
+	adjacency.counts = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.offsets = allocator.allocate<unsigned int>(vertex_count);
+	adjacency.data = allocator.allocate<unsigned int>(index_count);
+
+	// fill triangle counts: counts[v] ends up as the number of occurrences of v in indices
+	memset(adjacency.counts, 0, vertex_count * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; ++i)
+	{
+		assert(indices[i] < vertex_count);
+
+		adjacency.counts[indices[i]]++;
+	}
+
+	// fill offset table: running sum of counts, so offsets[v] is where the list for v begins in data
+	unsigned int offset = 0;
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		adjacency.offsets[i] = offset;
+		offset += adjacency.counts[i];
+	}
+
+	assert(offset == index_count);
+
+	// fill triangle data: append each triangle index to the lists of its three vertices, using offsets[v] as the write cursor
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+
+		adjacency.data[adjacency.offsets[a]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[b]++] = unsigned(i);
+		adjacency.data[adjacency.offsets[c]++] = unsigned(i);
+	}
+
+	// fix offsets that have been disturbed by the previous pass: subtract counts[v] to move each cursor back to its list start
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		assert(adjacency.offsets[i] >= adjacency.counts[i]);
+
+		adjacency.offsets[i] -= adjacency.counts[i];
+	}
+}
+
 static void computeBoundingSphere(float result[4], const float points[][3], size_t count)
 {
 	assert(count > 0);
@@ -82,13 +145,310 @@ static void computeBoundingSphere(float result[4], const float points[][3], size
 	result[3] = radius;
 }
 
+struct Cone
+{
+	float px, py, pz; // position: triangle centroid, or the summed/averaged centroids of a meshlet
+	float nx, ny, nz; // direction: triangle normal, or the summed/normalized normals of a meshlet
+};
+
+static float getMeshletScore(float distance2, float spread, float cone_weight, float expected_radius)
+{
+	float cone = 1.f - spread * cone_weight; // the caller in meshopt_buildMeshlets passes the dot product of triangle normal and meshlet axis as spread
+	float cone_clamped = cone < 1e-3f ? 1e-3f : cone; // clamp so the cone factor stays positive
+
+	return (1 + sqrtf(distance2) / expected_radius * (1 - cone_weight)) * cone_clamped; // the caller keeps the candidate with the smallest score
+}
+
+static Cone getMeshletCone(const Cone& acc, unsigned int triangle_count)
+{
+	Cone result = acc;
+
+	float center_scale = triangle_count == 0 ? 0.f : 1.f / float(triangle_count); // turns the summed positions into an average; 0 avoids dividing by zero
+
+	result.px *= center_scale;
+	result.py *= center_scale;
+	result.pz *= center_scale;
+
+	float axis_length = result.nx * result.nx + result.ny * result.ny + result.nz * result.nz; // squared length; sqrtf is applied on the next line
+	float axis_scale = axis_length == 0.f ? 0.f : 1.f / sqrtf(axis_length); // normalizes the summed normals; a zero sum stays zero
+
+	result.nx *= axis_scale;
+	result.ny *= axis_scale;
+	result.nz *= axis_scale;
+
+	return result;
+}
+
+static float computeTriangleCones(Cone* triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	(void)vertex_count; // only referenced by the assert below
+
+	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
+	size_t face_count = index_count / 3;
+
+	float mesh_area = 0; // sum of the cross product lengths computed below (i.e. twice the triangle areas)
+
+	for (size_t i = 0; i < face_count; ++i)
+	{
+		unsigned int a = indices[i * 3 + 0], b = indices[i * 3 + 1], c = indices[i * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		const float* p0 = vertex_positions + vertex_stride_float * a;
+		const float* p1 = vertex_positions + vertex_stride_float * b;
+		const float* p2 = vertex_positions + vertex_stride_float * c;
+
+		float p10[3] = {p1[0] - p0[0], p1[1] - p0[1], p1[2] - p0[2]};
+		float p20[3] = {p2[0] - p0[0], p2[1] - p0[1], p2[2] - p0[2]};
+
+		float normalx = p10[1] * p20[2] - p10[2] * p20[1]; // cross product of the two edge vectors
+		float normaly = p10[2] * p20[0] - p10[0] * p20[2];
+		float normalz = p10[0] * p20[1] - p10[1] * p20[0];
+
+		float area = sqrtf(normalx * normalx + normaly * normaly + normalz * normalz); // cross product length, which is twice the triangle area
+		float invarea = (area == 0.f) ? 0.f : 1.f / area; // degenerate triangles get a zero normal instead of a division by zero
+
+		triangles[i].px = (p0[0] + p1[0] + p2[0]) / 3.f; // centroid
+		triangles[i].py = (p0[1] + p1[1] + p2[1]) / 3.f;
+		triangles[i].pz = (p0[2] + p1[2] + p2[2]) / 3.f;
+
+		triangles[i].nx = normalx * invarea; // unit normal
+		triangles[i].ny = normaly * invarea;
+		triangles[i].nz = normalz * invarea;
+
+		mesh_area += area;
+	}
+
+	return mesh_area;
+}
+
+static void finishMeshlet(meshopt_Meshlet& meshlet, unsigned char* meshlet_triangles)
+{
+	size_t offset = meshlet.triangle_offset + meshlet.triangle_count * 3; // first byte past this meshlet's triangle data (3 bytes per triangle)
+
+	// fill 4b padding with 0: zero the bytes from the end of the triangle data up to the next multiple of 4
+	while (offset & 3)
+		meshlet_triangles[offset++] = 0;
+}
+
+static bool appendMeshlet(meshopt_Meshlet& meshlet, unsigned int a, unsigned int b, unsigned int c, unsigned char* used, meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, size_t meshlet_offset, size_t max_vertices, size_t max_triangles)
+{
+	unsigned char& av = used[a]; // used[v] is the index of vertex v inside the current meshlet, or 0xff if it is not part of it
+	unsigned char& bv = used[b];
+	unsigned char& cv = used[c];
+
+	bool result = false; // set to true when the current meshlet is written to meshlets[meshlet_offset]
+
+	unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff); // number of corners that are not in the current meshlet yet
+
+	if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+	{
+		meshlets[meshlet_offset] = meshlet;
+
+		for (size_t j = 0; j < meshlet.vertex_count; ++j)
+			used[meshlet_vertices[meshlet.vertex_offset + j]] = 0xff; // forget the flushed meshlet's vertices so the next meshlet starts empty
+
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlet.vertex_offset += meshlet.vertex_count;
+		meshlet.triangle_offset += (meshlet.triangle_count * 3 + 3) & ~3; // 4b padding
+		meshlet.vertex_count = 0;
+		meshlet.triangle_count = 0;
+
+		result = true;
+	}
+
+	if (av == 0xff) // av/bv/cv are references into used[], so they reflect the reset above
+	{
+		av = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = a;
+	}
+
+	if (bv == 0xff)
+	{
+		bv = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = b;
+	}
+
+	if (cv == 0xff)
+	{
+		cv = (unsigned char)meshlet.vertex_count;
+		meshlet_vertices[meshlet.vertex_offset + meshlet.vertex_count++] = c;
+	}
+
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 0] = av; // triangles are stored as meshlet-local vertex indices
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 1] = bv;
+	meshlet_triangles[meshlet.triangle_offset + meshlet.triangle_count * 3 + 2] = cv;
+	meshlet.triangle_count++;
+
+	return result;
+}
+
+struct KDNode
+{
+	union
+	{
+		float split; // branches: split position along 'axis'
+		unsigned int index; // leaves: index of the point stored in this node
+	};
+
+	// leaves: axis = 3, children = number of extra points after this one (0 if 'index' is the only point)
+	// branches: axis != 3, left subtree = skip 1, right subtree = skip 1+children
+	unsigned int axis : 2;
+	unsigned int children : 30;
+};
+
+static size_t kdtreePartition(unsigned int* indices, size_t count, const float* points, size_t stride, unsigned int axis, float pivot)
+{
+	size_t m = 0; // number of elements found so far whose coordinate is < pivot
+
+	// invariant: elements in range [0, m) are < pivot, elements in range [m, i) are >= pivot
+	for (size_t i = 0; i < count; ++i)
+	{
+		float v = points[indices[i] * stride + axis];
+
+		// swap(m, i) unconditionally
+		unsigned int t = indices[m];
+		indices[m] = indices[i];
+		indices[i] = t;
+
+		// when v >= pivot, we swap i with m without advancing it, preserving invariants
+		m += v < pivot;
+	}
+
+	return m; // indices[0..m) are < pivot, indices[m..count) are >= pivot
+}
+
+static size_t kdtreeBuildLeaf(size_t offset, KDNode* nodes, size_t node_count, unsigned int* indices, size_t count)
+{
+	assert(offset + count <= node_count);
+	(void)node_count; // only referenced by the assert above
+
+	KDNode& result = nodes[offset];
+
+	result.index = indices[0];
+	result.axis = 3;
+	result.children = unsigned(count - 1);
+
+	// all remaining points are stored in nodes immediately following the leaf
+	for (size_t i = 1; i < count; ++i)
+	{
+		KDNode& tail = nodes[offset + i];
+
+		tail.index = indices[i];
+		tail.axis = 3;
+		tail.children = ~0u >> 2; // bogus value to prevent misuse
+	}
+
+	return offset + count; // first node slot after this leaf and its tail nodes
+}
+
+static size_t kdtreeBuild(size_t offset, KDNode* nodes, size_t node_count, const float* points, size_t stride, unsigned int* indices, size_t count, size_t leaf_size)
+{
+	assert(count > 0);
+	assert(offset < node_count);
+
+	if (count <= leaf_size)
+		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
+
+	float mean[3] = {};
+	float vars[3] = {};
+	float runc = 1, runs = 1; // runc is the 1-based number of the current sample, runs is kept equal to 1 / runc
+
+	// gather statistics on the points in the subtree using Welford's algorithm
+	for (size_t i = 0; i < count; ++i, runc += 1.f, runs = 1.f / runc)
+	{
+		const float* point = points + indices[i] * stride;
+
+		for (int k = 0; k < 3; ++k)
+		{
+			float delta = point[k] - mean[k];
+			mean[k] += delta * runs;
+			vars[k] += delta * (point[k] - mean[k]);
+		}
+	}
+
+	// split axis is one where the variance is largest (vars holds unnormalized sums, which is enough for comparing axes)
+	unsigned int axis = vars[0] >= vars[1] && vars[0] >= vars[2] ? 0 : vars[1] >= vars[2] ? 1
+	                                                                                      : 2;
+
+	float split = mean[axis];
+	size_t middle = kdtreePartition(indices, count, points, stride, axis, split);
+
+	// when the partition is degenerate simply consolidate the points into a single node
+	if (middle <= leaf_size / 2 || middle >= count - leaf_size / 2)
+		return kdtreeBuildLeaf(offset, nodes, node_count, indices, count);
+
+	KDNode& result = nodes[offset];
+
+	result.split = split;
+	result.axis = axis;
+
+	// left subtree is right after our node
+	size_t next_offset = kdtreeBuild(offset + 1, nodes, node_count, points, stride, indices, middle, leaf_size);
+
+	// distance to the right subtree is represented explicitly
+	result.children = unsigned(next_offset - offset - 1);
+
+	return kdtreeBuild(next_offset, nodes, node_count, points, stride, indices + middle, count - middle, leaf_size); // returns the first node slot after the right subtree
+}
+
+static void kdtreeNearest(KDNode* nodes, unsigned int root, const float* points, size_t stride, const unsigned char* emitted_flags, const float* position, unsigned int& result, float& limit)
+{
+	const KDNode& node = nodes[root];
+
+	if (node.axis == 3)
+	{
+		// leaf: this node and the node.children nodes that follow it each store one point index
+		for (unsigned int i = 0; i <= node.children; ++i)
+		{
+			unsigned int index = nodes[root + i].index;
+
+			if (emitted_flags[index]) // flagged points are never returned
+				continue;
+
+			const float* point = points + index * stride;
+
+			float distance2 =
+			    (point[0] - position[0]) * (point[0] - position[0]) +
+			    (point[1] - position[1]) * (point[1] - position[1]) +
+			    (point[2] - position[2]) * (point[2] - position[2]);
+			float distance = sqrtf(distance2); // limit is a plain (not squared) distance; it is compared to fabsf(delta) in the branch case
+
+			if (distance < limit)
+			{
+				result = index;
+				limit = distance;
+			}
+		}
+	}
+	else
+	{
+		// branch; we order recursion to process the node that search position is in first
+		float delta = position[node.axis] - node.split;
+		unsigned int first = (delta <= 0) ? 0 : node.children;
+		unsigned int second = first ^ node.children; // the other subtree: node.children when first is 0, and 0 otherwise
+
+		kdtreeNearest(nodes, root + 1 + first, points, stride, emitted_flags, position, result, limit);
+
+		// only process the other node if it can have a match based on closest distance so far
+		if (fabsf(delta) <= limit)
+			kdtreeNearest(nodes, root + 1 + second, points, stride, emitted_flags, position, result, limit);
+	}
+}
+
 } // namespace meshopt
 
 size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles)
 {
+	using namespace meshopt;
+
 	assert(index_count % 3 == 0);
-	assert(max_vertices >= 3);
-	assert(max_triangles >= 1);
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	(void)kMeshletMaxVertices;
+	(void)kMeshletMaxTriangles;
 
 	// meshlet construction is limited by max vertices and max triangles per meshlet
 	// the worst case is that the input is an unindexed stream since this equally stresses both limits
@@ -100,77 +460,226 @@ size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_
 	return meshlet_limit_vertices > meshlet_limit_triangles ? meshlet_limit_vertices : meshlet_limit_triangles;
 }
 
-size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
 {
+	using namespace meshopt;
+
 	assert(index_count % 3 == 0);
-	assert(max_vertices >= 3);
-	assert(max_triangles >= 1);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
 
 	meshopt_Allocator allocator;
 
-	meshopt_Meshlet meshlet;
-	memset(&meshlet, 0, sizeof(meshlet));
+	TriangleAdjacency2 adjacency = {};
+	buildTriangleAdjacency(adjacency, indices, index_count, vertex_count, allocator);
+
+	unsigned int* live_triangles = allocator.allocate<unsigned int>(vertex_count); // per vertex: number of its triangles that have not been emitted yet
+	memcpy(live_triangles, adjacency.counts, vertex_count * sizeof(unsigned int));
+
+	size_t face_count = index_count / 3;
+
+	unsigned char* emitted_flags = allocator.allocate<unsigned char>(face_count); // per triangle: set to 1 once it has been added to a meshlet
+	memset(emitted_flags, 0, face_count);
+
+	// for each triangle, precompute centroid & normal to use for scoring
+	Cone* triangles = allocator.allocate<Cone>(face_count);
+	float mesh_area = computeTriangleCones(triangles, indices, index_count, vertex_positions, vertex_count, vertex_positions_stride);
+
+	// assuming each meshlet is a square patch, expected radius is sqrt(expected area)
+	float triangle_area_avg = face_count == 0 ? 0.f : mesh_area / float(face_count) * 0.5f; // mesh_area sums cross product lengths, so * 0.5f yields the average triangle area
+	float meshlet_expected_radius = sqrtf(triangle_area_avg * max_triangles) * 0.5f;
+
+	// build a kd-tree over the triangle centroids for nearest neighbor lookup; NOTE(review): kdtreeBuild asserts count > 0, so face_count == 0 looks unsupported here - confirm
+	unsigned int* kdindices = allocator.allocate<unsigned int>(face_count);
+	for (size_t i = 0; i < face_count; ++i)
+		kdindices[i] = unsigned(i);
 
-	assert(max_vertices <= sizeof(meshlet.vertices) / sizeof(meshlet.vertices[0]));
-	assert(max_triangles <= sizeof(meshlet.indices) / 3);
+	KDNode* nodes = allocator.allocate<KDNode>(face_count * 2);
+	kdtreeBuild(0, nodes, face_count * 2, &triangles[0].px, sizeof(Cone) / sizeof(float), kdindices, face_count, /* leaf_size= */ 8);
 
 	// index of the vertex in the meshlet, 0xff if the vertex isn't used
 	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
 	memset(used, -1, vertex_count);
 
-	size_t offset = 0;
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
 
-	for (size_t i = 0; i < index_count; i += 3)
-	{
-		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
-		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+	Cone meshlet_cone_acc = {};
 
-		unsigned char& av = used[a];
-		unsigned char& bv = used[b];
-		unsigned char& cv = used[c];
+	for (;;)
+	{
+		unsigned int best_triangle = ~0u;
+		unsigned int best_extra = 5; // larger than any 'extra' value computed below (those are at most 4)
+		float best_score = FLT_MAX;
 
-		unsigned int used_extra = (av == 0xff) + (bv == 0xff) + (cv == 0xff);
+		Cone meshlet_cone = getMeshletCone(meshlet_cone_acc, meshlet.triangle_count);
 
-		if (meshlet.vertex_count + used_extra > max_vertices || meshlet.triangle_count >= max_triangles)
+		for (size_t i = 0; i < meshlet.vertex_count; ++i)
 		{
-			destination[offset++] = meshlet;
+			unsigned int index = meshlet_vertices[meshlet.vertex_offset + i];
+
+			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbours_size = adjacency.counts[index];
+
+			for (size_t j = 0; j < neighbours_size; ++j)
+			{
+				unsigned int triangle = neighbours[j];
+				assert(!emitted_flags[triangle]);
+
+				unsigned int a = indices[triangle * 3 + 0], b = indices[triangle * 3 + 1], c = indices[triangle * 3 + 2];
+				assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+				unsigned int extra = (used[a] == 0xff) + (used[b] == 0xff) + (used[c] == 0xff);
+
+				// triangles that don't add new vertices to meshlets are max. priority
+				if (extra != 0)
+				{
+					// artificially increase the priority of dangling triangles as they're expensive to add to new meshlets
+					if (live_triangles[a] == 1 || live_triangles[b] == 1 || live_triangles[c] == 1)
+						extra = 0;
+
+					extra++;
+				}
+
+				// since topology-based priority is always more important than the score, we can skip scoring in some cases
+				if (extra > best_extra)
+					continue;
+
+				const Cone& tri_cone = triangles[triangle];
+
+				float distance2 =
+				    (tri_cone.px - meshlet_cone.px) * (tri_cone.px - meshlet_cone.px) +
+				    (tri_cone.py - meshlet_cone.py) * (tri_cone.py - meshlet_cone.py) +
+				    (tri_cone.pz - meshlet_cone.pz) * (tri_cone.pz - meshlet_cone.pz);
 
-			for (size_t j = 0; j < meshlet.vertex_count; ++j)
-				used[meshlet.vertices[j]] = 0xff;
+				float spread = tri_cone.nx * meshlet_cone.nx + tri_cone.ny * meshlet_cone.ny + tri_cone.nz * meshlet_cone.nz;
 
-			memset(&meshlet, 0, sizeof(meshlet));
+				float score = getMeshletScore(distance2, spread, cone_weight, meshlet_expected_radius);
+
+				// note that topology-based priority is always more important than the score
+				// this helps maintain reasonable effectiveness of meshlet data and reduces scoring cost
+				if (extra < best_extra || score < best_score)
+				{
+					best_triangle = triangle;
+					best_extra = extra;
+					best_score = score;
+				}
+			}
 		}
 
-		if (av == 0xff)
+		if (best_triangle == ~0u) // no candidate adjacent to the current meshlet: search the kd-tree for the unemitted triangle nearest to the meshlet center
 		{
-			av = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = a;
+			float position[3] = {meshlet_cone.px, meshlet_cone.py, meshlet_cone.pz};
+			unsigned int index = ~0u;
+			float limit = FLT_MAX;
+
+			kdtreeNearest(nodes, 0, &triangles[0].px, sizeof(Cone) / sizeof(float), emitted_flags, position, index, limit);
+
+			best_triangle = index;
 		}
 
-		if (bv == 0xff)
+		if (best_triangle == ~0u) // the kd-tree search found no unemitted triangle either, so the loop ends
+			break;
+
+		unsigned int a = indices[best_triangle * 3 + 0], b = indices[best_triangle * 3 + 1], c = indices[best_triangle * 3 + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// add meshlet to the output; when the current meshlet is full we reset the accumulated bounds
+		if (appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles))
 		{
-			bv = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = b;
+			meshlet_offset++;
+			memset(&meshlet_cone_acc, 0, sizeof(meshlet_cone_acc));
 		}
 
-		if (cv == 0xff)
+		live_triangles[a]--;
+		live_triangles[b]--;
+		live_triangles[c]--;
+
+		// remove emitted triangle from adjacency data
+		// this makes sure that we spend less time traversing these lists on subsequent iterations
+		for (size_t k = 0; k < 3; ++k)
 		{
-			cv = meshlet.vertex_count;
-			meshlet.vertices[meshlet.vertex_count++] = c;
+			unsigned int index = indices[best_triangle * 3 + k];
+
+			unsigned int* neighbours = &adjacency.data[0] + adjacency.offsets[index];
+			size_t neighbours_size = adjacency.counts[index];
+
+			for (size_t i = 0; i < neighbours_size; ++i)
+			{
+				unsigned int tri = neighbours[i];
+
+				if (tri == best_triangle)
+				{
+					neighbours[i] = neighbours[neighbours_size - 1]; // overwrite with the last list entry, then shrink the list by one
+					adjacency.counts[index]--;
+					break;
+				}
+			}
 		}
 
-		meshlet.indices[meshlet.triangle_count][0] = av;
-		meshlet.indices[meshlet.triangle_count][1] = bv;
-		meshlet.indices[meshlet.triangle_count][2] = cv;
-		meshlet.triangle_count++;
+		// update aggregated meshlet cone data for scoring subsequent triangles
+		meshlet_cone_acc.px += triangles[best_triangle].px;
+		meshlet_cone_acc.py += triangles[best_triangle].py;
+		meshlet_cone_acc.pz += triangles[best_triangle].pz;
+		meshlet_cone_acc.nx += triangles[best_triangle].nx;
+		meshlet_cone_acc.ny += triangles[best_triangle].ny;
+		meshlet_cone_acc.nz += triangles[best_triangle].nz;
+
+		emitted_flags[best_triangle] = 1;
+	}
+
+	if (meshlet.triangle_count) // write out the last, partially filled meshlet
+	{
+		finishMeshlet(meshlet, meshlet_triangles);
+
+		meshlets[meshlet_offset++] = meshlet;
+	}
+
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	return meshlet_offset;
+}
+
+size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+
+	assert(max_vertices >= 3 && max_vertices <= kMeshletMaxVertices);
+	assert(max_triangles >= 1 && max_triangles <= kMeshletMaxTriangles);
+	assert(max_triangles % 4 == 0); // ensures the caller will compute output space properly as index data is 4b aligned
+
+	meshopt_Allocator allocator;
+
+	// index of the vertex in the meshlet, 0xff if the vertex isn't used
+	unsigned char* used = allocator.allocate<unsigned char>(vertex_count);
+	memset(used, -1, vertex_count);
+
+	meshopt_Meshlet meshlet = {};
+	size_t meshlet_offset = 0;
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int a = indices[i + 0], b = indices[i + 1], c = indices[i + 2];
+		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+
+		// appends triangle to the meshlet and writes previous meshlet to the output if full; the bool result (0 or 1) advances meshlet_offset in that case
+		meshlet_offset += appendMeshlet(meshlet, a, b, c, used, meshlets, meshlet_vertices, meshlet_triangles, meshlet_offset, max_vertices, max_triangles);
 	}
 
 	if (meshlet.triangle_count)
-		destination[offset++] = meshlet;
+	{
+		finishMeshlet(meshlet, meshlet_triangles);
 
-	assert(offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+		meshlets[meshlet_offset++] = meshlet; // write out the last, partially filled meshlet
+	}
 
-	return offset;
+	assert(meshlet_offset <= meshopt_buildMeshletsBound(index_count, max_vertices, max_triangles));
+	return meshlet_offset;
 }
 
 meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
@@ -178,18 +687,17 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	using namespace meshopt;
 
 	assert(index_count % 3 == 0);
+	assert(index_count / 3 <= kMeshletMaxTriangles);
 	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
-	assert(index_count / 3 <= 256);
-
 	(void)vertex_count;
 
 	size_t vertex_stride_float = vertex_positions_stride / sizeof(float);
 
 	// compute triangle normals and gather triangle corners
-	float normals[256][3];
-	float corners[256][3][3];
+	float normals[kMeshletMaxTriangles][3];
+	float corners[kMeshletMaxTriangles][3][3];
 	size_t triangles = 0;
 
 	for (size_t i = 0; i < index_count; i += 3)
@@ -327,25 +835,23 @@ meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t
 	return bounds;
 }
 
-meshopt_Bounds meshopt_computeMeshletBounds(const meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
 {
+	using namespace meshopt;
+
+	assert(triangle_count <= kMeshletMaxTriangles); // the stack buffer below is sized for kMeshletMaxTriangles triangles
 	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
 	assert(vertex_positions_stride % sizeof(float) == 0);
 
-	unsigned int indices[sizeof(meshlet->indices) / sizeof(meshlet->indices[0][0])];
+	unsigned int indices[kMeshletMaxTriangles * 3]; // receives the meshlet's triangles expanded to mesh vertex indices
 
-	for (size_t i = 0; i < meshlet->triangle_count; ++i)
+	for (size_t i = 0; i < triangle_count * 3; ++i)
 	{
-		unsigned int a = meshlet->vertices[meshlet->indices[i][0]];
-		unsigned int b = meshlet->vertices[meshlet->indices[i][1]];
-		unsigned int c = meshlet->vertices[meshlet->indices[i][2]];
-
-		assert(a < vertex_count && b < vertex_count && c < vertex_count);
+		unsigned int index = meshlet_vertices[meshlet_triangles[i]]; // meshlet-local index -> mesh vertex index
+		assert(index < vertex_count);
 
-		indices[i * 3 + 0] = a;
-		indices[i * 3 + 1] = b;
-		indices[i * 3 + 2] = c;
+		indices[i] = index;
 	}
 
-	return meshopt_computeClusterBounds(indices, meshlet->triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
+	return meshopt_computeClusterBounds(indices, triangle_count * 3, vertex_positions, vertex_count, vertex_positions_stride);
 }
diff --git a/thirdparty/meshoptimizer/indexgenerator.cpp b/thirdparty/meshoptimizer/indexgenerator.cpp
index aa4a30efa4..f60db0dc4f 100644
--- a/thirdparty/meshoptimizer/indexgenerator.cpp
+++ b/thirdparty/meshoptimizer/indexgenerator.cpp
@@ -4,6 +4,8 @@
 #include <assert.h>
 #include <string.h>
 
+// This work is based on:
+// John McDonald, Mark Kilgard. Crack-Free Point-Normal Triangles using Adjacent Edge Normals. 2010
 namespace meshopt
 {
 
@@ -83,10 +85,49 @@ struct VertexStreamHasher
 	}
 };
 
+struct EdgeHasher
+{
+	const unsigned int* remap; // indexed by vertex index; edges are hashed and compared through the remapped values
+
+	size_t hash(unsigned long long edge) const
+	{
+		unsigned int e0 = unsigned(edge >> 32); // an edge packs two vertex indices: high 32 bits and low 32 bits
+		unsigned int e1 = unsigned(edge);
+
+		unsigned int h1 = remap[e0];
+		unsigned int h2 = remap[e1];
+
+		const unsigned int m = 0x5bd1e995;
+
+		// MurmurHash64B finalizer
+		h1 ^= h2 >> 18;
+		h1 *= m;
+		h2 ^= h1 >> 22;
+		h2 *= m;
+		h1 ^= h2 >> 17;
+		h1 *= m;
+		h2 ^= h1 >> 19;
+		h2 *= m;
+
+		return h2;
+	}
+
+	bool equal(unsigned long long lhs, unsigned long long rhs) const
+	{
+		unsigned int l0 = unsigned(lhs >> 32);
+		unsigned int l1 = unsigned(lhs);
+
+		unsigned int r0 = unsigned(rhs >> 32);
+		unsigned int r1 = unsigned(rhs);
+
+		return remap[l0] == remap[r0] && remap[l1] == remap[r1]; // edges match when both endpoints remap to the same values, in the same order
+	}
+};
+
 static size_t hashBuckets(size_t count)
 {
 	size_t buckets = 1;
-	while (buckets < count)
+	while (buckets < count + count / 4)
 		buckets *= 2;
 
 	return buckets;
@@ -119,6 +160,26 @@ static T* hashLookup(T* table, size_t buckets, const Hash& hash, const T& key, c
 	return 0;
 }
 
+static void buildPositionRemap(unsigned int* remap, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, meshopt_Allocator& allocator)
+{
+	VertexHasher vertex_hasher = {reinterpret_cast<const unsigned char*>(vertex_positions), 3 * sizeof(float), vertex_positions_stride};
+
+	size_t vertex_table_size = hashBuckets(vertex_count);
+	unsigned int* vertex_table = allocator.allocate<unsigned int>(vertex_table_size);
+	memset(vertex_table, -1, vertex_table_size * sizeof(unsigned int)); // every slot starts as ~0u, the 'empty' value passed to hashLookup below
+
+	for (size_t i = 0; i < vertex_count; ++i)
+	{
+		unsigned int index = unsigned(i);
+		unsigned int* entry = hashLookup(vertex_table, vertex_table_size, vertex_hasher, index, ~0u); // presumably yields the slot of an equal vertex or an empty slot (hashLookup body is outside this view) - confirm
+
+		if (*entry == ~0u) // empty slot: this vertex is stored in the table
+			*entry = index;
+
+		remap[index] = *entry; // remap[i] is the vertex index stored in the slot that i resolved to
+	}
+}
+
 } // namespace meshopt
 
 size_t meshopt_generateVertexRemap(unsigned int* destination, const unsigned int* indices, size_t index_count, const void* vertices, size_t vertex_count, size_t vertex_size)
@@ -345,3 +406,146 @@ void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const uns
 		destination[i] = remap[index];
 	}
 }
+
+void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	static const int next[4] = {1, 2, 0, 1}; // next[e] is the corner after e; the 4th entry lets next[e + 1] give the corner after that for e = 2
+
+	// build position remap: for each vertex, which other (canonical) vertex does it map to?
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
+
+	// build edge set; this stores all triangle edges but we can look these up by any other wedge
+	EdgeHasher edge_hasher = {remap};
+
+	size_t edge_table_size = hashBuckets(index_count);
+	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
+	unsigned int* edge_vertex_table = allocator.allocate<unsigned int>(edge_table_size); // parallel to edge_table: same slot index
+
+	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
+	memset(edge_vertex_table, -1, edge_table_size * sizeof(unsigned int));
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			unsigned int i2 = indices[i + next[e + 1]];
+			assert(i0 < vertex_count && i1 < vertex_count && i2 < vertex_count);
+
+			unsigned long long edge = ((unsigned long long)i0 << 32) | i1; // directed edge i0 -> i1
+			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			if (*entry == ~0ull) // slots that are already filled are left untouched
+			{
+				*entry = edge;
+
+				// store vertex opposite to the edge
+				edge_vertex_table[entry - edge_table] = i2;
+			}
+		}
+	}
+
+	// build resulting index buffer: 6 indices for each input triangle
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int patch[6];
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			// note: this refers to the opposite edge!
+			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
+			unsigned long long* oppe = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			patch[e * 2 + 0] = i0;
+			patch[e * 2 + 1] = (*oppe == ~0ull) ? i0 : edge_vertex_table[oppe - edge_table]; // when no opposite edge was recorded, i0 is repeated
+		}
+
+		memcpy(destination + i * 2, patch, sizeof(patch));
+	}
+}
+
+void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	using namespace meshopt;
+
+	assert(index_count % 3 == 0);
+	assert(vertex_positions_stride > 0 && vertex_positions_stride <= 256);
+	assert(vertex_positions_stride % sizeof(float) == 0);
+
+	meshopt_Allocator allocator;
+
+	static const int next[3] = {1, 2, 0}; // next[e] is the triangle corner that follows corner e
+
+	// build position remap: for each vertex, which other (canonical) vertex does it map to?
+	unsigned int* remap = allocator.allocate<unsigned int>(vertex_count);
+	buildPositionRemap(remap, vertex_positions, vertex_count, vertex_positions_stride, allocator);
+
+	// build edge set; this stores all triangle edges but we can look these up by any other wedge
+	EdgeHasher edge_hasher = {remap};
+
+	size_t edge_table_size = hashBuckets(index_count);
+	unsigned long long* edge_table = allocator.allocate<unsigned long long>(edge_table_size);
+	memset(edge_table, -1, edge_table_size * sizeof(unsigned long long));
+
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			unsigned long long edge = ((unsigned long long)i0 << 32) | i1; // directed edge i0 -> i1
+			unsigned long long* entry = hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			if (*entry == ~0ull) // slots that are already filled are left untouched
+				*entry = edge;
+		}
+	}
+
+	// build resulting index buffer: 12 indices for each input triangle
+	for (size_t i = 0; i < index_count; i += 3)
+	{
+		unsigned int patch[12];
+
+		for (int e = 0; e < 3; ++e)
+		{
+			unsigned int i0 = indices[i + e];
+			unsigned int i1 = indices[i + next[e]];
+			assert(i0 < vertex_count && i1 < vertex_count);
+
+			// note: this refers to the opposite edge!
+			unsigned long long edge = ((unsigned long long)i1 << 32) | i0;
+			unsigned long long oppe = *hashLookup(edge_table, edge_table_size, edge_hasher, edge, ~0ull);
+
+			// use the same edge if opposite edge doesn't exist (border)
+			oppe = (oppe == ~0ull) ? edge : oppe;
+
+			// triangle index (0, 1, 2)
+			patch[e] = i0;
+
+			// opposite edge (3, 4; 5, 6; 7, 8): low 32 bits of the packed edge first, then the high 32 bits
+			patch[3 + e * 2 + 0] = unsigned(oppe);
+			patch[3 + e * 2 + 1] = unsigned(oppe >> 32);
+
+			// dominant vertex (9, 10, 11)
+			patch[9 + e] = remap[i0];
+		}
+
+		memcpy(destination + i * 4, patch, sizeof(patch));
+	}
+}
diff --git a/thirdparty/meshoptimizer/meshoptimizer.h b/thirdparty/meshoptimizer/meshoptimizer.h
index 1714000384..fe8d349731 100644
--- a/thirdparty/meshoptimizer/meshoptimizer.h
+++ b/thirdparty/meshoptimizer/meshoptimizer.h
@@ -1,7 +1,7 @@
 /**
- * meshoptimizer - version 0.15
+ * meshoptimizer - version 0.16
  *
- * Copyright (C) 2016-2020, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
+ * Copyright (C) 2016-2021, by Arseny Kapoulkine (arseny.kapoulkine@gmail.com)
  * Report bugs and download new versions at https://github.com/zeux/meshoptimizer
  *
  * This library is distributed under the MIT License. See notice at the end of this file.
@@ -12,7 +12,7 @@
 #include <stddef.h>
 
 /* Version macro; major * 1000 + minor * 10 + patch */
-#define MESHOPTIMIZER_VERSION 150 /* 0.15 */
+#define MESHOPTIMIZER_VERSION 160 /* 0.16 */
 
 /* If no API is defined, assume default */
 #ifndef MESHOPTIMIZER_API
@@ -98,6 +98,35 @@ MESHOPTIMIZER_API void meshopt_generateShadowIndexBuffer(unsigned int* destinati
 MESHOPTIMIZER_API void meshopt_generateShadowIndexBufferMulti(unsigned int* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, const struct meshopt_Stream* streams, size_t stream_count);
 
 /**
+ * Generate index buffer that can be used as a geometry shader input with triangle adjacency topology
+ * Each triangle is converted into a 6-vertex patch with the following layout:
+ * - 0, 2, 4: original triangle vertices
+ * - 1, 3, 5: vertices adjacent to edges 02, 24 and 40
+ * The resulting patch can be rendered with geometry shaders using e.g. VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY.
+ * This can be used to implement algorithms like silhouette detection/expansion and other forms of GS-driven rendering.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count*2 elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_generateAdjacencyIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
+ * Generate index buffer that can be used for PN-AEN tessellation with crack-free displacement
+ * Each triangle is converted into a 12-vertex patch with the following layout:
+ * - 0, 1, 2: original triangle vertices
+ * - 3, 4: opposing edge for edge 0, 1
+ * - 5, 6: opposing edge for edge 1, 2
+ * - 7, 8: opposing edge for edge 2, 0
+ * - 9, 10, 11: dominant vertices for corners 0, 1, 2
+ * The resulting patch can be rendered with hardware tessellation using PN-AEN and displacement mapping.
+ * See "Tessellation on Any Budget" (John McDonald, GDC 2011) for implementation details.
+ *
+ * destination must contain enough space for the resulting index buffer (index_count*4 elements)
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ */
+MESHOPTIMIZER_EXPERIMENTAL void meshopt_generateTessellationIndexBuffer(unsigned int* destination, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+
+/**
  * Vertex transform cache optimizer
  * Reorders indices to reduce the number of GPU vertex shader invocations
  * If index buffer contains multiple ranges for multiple draw calls, this functions needs to be called on each range individually.
@@ -373,22 +402,31 @@ MESHOPTIMIZER_API struct meshopt_VertexFetchStatistics meshopt_analyzeVertexFetc
 
 struct meshopt_Meshlet
 {
-	unsigned int vertices[64];
-	unsigned char indices[126][3];
-	unsigned char triangle_count;
-	unsigned char vertex_count;
+	/* offsets within meshlet_vertices and meshlet_triangles arrays with meshlet data */
+	unsigned int vertex_offset;
+	unsigned int triangle_offset; /* NOTE(review): unit (triangles vs. array elements) is not stated here - confirm against the builder */
+
+	/* number of vertices and triangles used in the meshlet; data is stored in consecutive range defined by offset and count */
+	unsigned int vertex_count;
+	unsigned int triangle_count;
 };
 
 /**
  * Experimental: Meshlet builder
  * Splits the mesh into a set of meshlets where each meshlet has a micro index buffer indexing into meshlet vertices that refer to the original vertex buffer
  * The resulting data can be used to render meshes using NVidia programmable mesh shading pipeline, or in other cluster-based renderers.
- * For maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
+ * When using buildMeshlets, vertex positions need to be provided to minimize the size of the resulting clusters.
+ * When using buildMeshletsScan, for maximum efficiency the index buffer being converted has to be optimized for vertex cache first.
  *
- * destination must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
- * max_vertices and max_triangles can't exceed limits statically declared in meshopt_Meshlet (max_vertices <= 64, max_triangles <= 126)
+ * meshlets must contain enough space for all meshlets, worst case size can be computed with meshopt_buildMeshletsBound
+ * meshlet_vertices must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_vertices
+ * meshlet_triangles must contain enough space for all meshlets, worst case size is equal to max_meshlets * max_triangles * 3
+ * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
+ * max_vertices and max_triangles must not exceed implementation limits (max_vertices <= 255 - not 256!, max_triangles <= 512)
+ * cone_weight should be set to 0 when cone culling is not used, and a value between 0 and 1 otherwise to balance between cluster size and cone culling efficiency
  */
-MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* destination, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshlets(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsScan(struct meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const unsigned int* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 MESHOPTIMIZER_EXPERIMENTAL size_t meshopt_buildMeshletsBound(size_t index_count, size_t max_vertices, size_t max_triangles);
 
 struct meshopt_Bounds
@@ -426,10 +464,10 @@ struct meshopt_Bounds
  * to do frustum/occlusion culling, the formula that doesn't use the apex may be preferable.
  *
  * vertex_positions should have float3 position in the first 12 bytes of each vertex - similar to glVertexPointer
- * index_count should be less than or equal to 256*3 (the function assumes clusters of limited size)
+ * index_count/3 should be less than or equal to 512 (the function assumes clusters of limited size)
  */
 MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeClusterBounds(const unsigned int* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
-MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const struct meshopt_Meshlet* meshlet, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+MESHOPTIMIZER_EXPERIMENTAL struct meshopt_Bounds meshopt_computeMeshletBounds(const unsigned int* meshlet_vertices, const unsigned char* meshlet_triangles, size_t triangle_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 
 /**
  * Experimental: Spatial sorter
@@ -513,6 +551,10 @@ inline void meshopt_generateShadowIndexBuffer(T* destination, const T* indices,
 template <typename T>
 inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indices, size_t index_count, size_t vertex_count, const meshopt_Stream* streams, size_t stream_count);
 template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
+template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count);
 template <typename T>
 inline void meshopt_optimizeVertexCacheStrip(T* destination, const T* indices, size_t index_count, size_t vertex_count);
@@ -547,7 +589,9 @@ inline meshopt_OverdrawStatistics meshopt_analyzeOverdraw(const T* indices, size
 template <typename T>
 inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices, size_t index_count, size_t vertex_count, size_t vertex_size);
 template <typename T>
-inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight);
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles);
 template <typename T>
 inline meshopt_Bounds meshopt_computeClusterBounds(const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride);
 template <typename T>
@@ -762,6 +806,24 @@ inline void meshopt_generateShadowIndexBufferMulti(T* destination, const T* indi
 }
 
 template <typename T>
+inline void meshopt_generateAdjacencyIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count); /* adapter over the caller's T-typed input indices */
+	meshopt_IndexAdapter<T> out(destination, 0, index_count * 2); /* adapter over destination; index_count * 2 matches the documented output size (6 indices per input triangle) */
+
+	meshopt_generateAdjacencyIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); /* delegate using the adapters' data pointers; position arguments are forwarded unchanged */
+}
+
+template <typename T>
+inline void meshopt_generateTessellationIndexBuffer(T* destination, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count); /* adapter over the caller's T-typed input indices */
+	meshopt_IndexAdapter<T> out(destination, 0, index_count * 4); /* adapter over destination; index_count * 4 matches the documented output size (12 indices per input triangle) */
+
+	meshopt_generateTessellationIndexBuffer(out.data, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride); /* delegate using the adapters' data pointers; position arguments are forwarded unchanged */
+}
+
+template <typename T>
 inline void meshopt_optimizeVertexCache(T* destination, const T* indices, size_t index_count, size_t vertex_count)
 {
 	meshopt_IndexAdapter<T> in(0, indices, index_count);
@@ -908,11 +970,19 @@ inline meshopt_VertexFetchStatistics meshopt_analyzeVertexFetch(const T* indices
 }
 
 template <typename T>
-inline size_t meshopt_buildMeshlets(meshopt_Meshlet* destination, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
+inline size_t meshopt_buildMeshlets(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, const float* vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t max_vertices, size_t max_triangles, float cone_weight)
+{
+	meshopt_IndexAdapter<T> in(0, indices, index_count); /* only the input indices go through an adapter; the three output arrays are passed as-is */
+
+	return meshopt_buildMeshlets(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_positions, vertex_count, vertex_positions_stride, max_vertices, max_triangles, cone_weight); /* result of the delegated call is returned unchanged */
+}
+
+template <typename T>
+inline size_t meshopt_buildMeshletsScan(meshopt_Meshlet* meshlets, unsigned int* meshlet_vertices, unsigned char* meshlet_triangles, const T* indices, size_t index_count, size_t vertex_count, size_t max_vertices, size_t max_triangles)
 {
 	meshopt_IndexAdapter<T> in(0, indices, index_count);
 
-	return meshopt_buildMeshlets(destination, in.data, index_count, vertex_count, max_vertices, max_triangles);
+	return meshopt_buildMeshletsScan(meshlets, meshlet_vertices, meshlet_triangles, in.data, index_count, vertex_count, max_vertices, max_triangles); /* Scan variant takes no vertex_positions or cone_weight; result of the delegated call is returned unchanged */
 }
 
 template <typename T>
@@ -934,7 +1004,7 @@ inline void meshopt_spatialSortTriangles(T* destination, const T* indices, size_
 #endif
 
 /**
- * Copyright (c) 2016-2020 Arseny Kapoulkine
+ * Copyright (c) 2016-2021 Arseny Kapoulkine
  *
  * Permission is hereby granted, free of charge, to any person
  * obtaining a copy of this software and associated documentation
diff --git a/thirdparty/meshoptimizer/simplifier.cpp b/thirdparty/meshoptimizer/simplifier.cpp
index 942db14461..b2cb589462 100644
--- a/thirdparty/meshoptimizer/simplifier.cpp
+++ b/thirdparty/meshoptimizer/simplifier.cpp
@@ -131,7 +131,7 @@ struct PositionHasher
 static size_t hashBuckets2(size_t count)
 {
 	size_t buckets = 1;
-	while (buckets < count)
+	while (buckets < count + count / 4)
 		buckets *= 2;
 
 	return buckets;
diff --git a/thirdparty/meshoptimizer/vertexcodec.cpp b/thirdparty/meshoptimizer/vertexcodec.cpp
index 2cbfaac367..5f3ec204ab 100644
--- a/thirdparty/meshoptimizer/vertexcodec.cpp
+++ b/thirdparty/meshoptimizer/vertexcodec.cpp
@@ -710,18 +710,12 @@ static v128_t decodeShuffleMask(unsigned char mask0, unsigned char mask1)
 SIMD_TARGET
 static void wasmMoveMask(v128_t mask, unsigned char& mask0, unsigned char& mask1)
 {
-	v128_t mask_0 = wasm_v32x4_shuffle(mask, mask, 0, 2, 1, 3);
-
-	uint64_t mask_1a = wasm_i64x2_extract_lane(mask_0, 0) & 0x0804020108040201ull;
-	uint64_t mask_1b = wasm_i64x2_extract_lane(mask_0, 1) & 0x8040201080402010ull;
+	// magic constant found using z3 SMT assuming mask has 8 groups of 0xff or 0x00
+	const uint64_t magic = 0x000103070f1f3f80ull; // 0xff * magic == 0x0102040810204080, so a 0xff in byte i of a lane sets bit i of the product's top byte
 
 	// TODO: This can use v8x16_bitmask in the future
-	uint64_t mask_2 = mask_1a | mask_1b;
-	uint64_t mask_4 = mask_2 | (mask_2 >> 16);
-	uint64_t mask_8 = mask_4 | (mask_4 >> 8);
-
-	mask0 = uint8_t(mask_8);
-	mask1 = uint8_t(mask_8 >> 32);
+	mask0 = uint8_t((wasm_i64x2_extract_lane(mask, 0) * magic) >> 56); // top byte of the product for 64-bit lane 0
+	mask1 = uint8_t((wasm_i64x2_extract_lane(mask, 1) * magic) >> 56); // top byte of the product for 64-bit lane 1
 }
 
 SIMD_TARGET